1 /*
   2  * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/flags/flagSetting.hpp"
  44 #include "runtime/interfaceSupport.inline.hpp"
  45 #include "runtime/jniHandles.hpp"
  46 #include "runtime/objectMonitor.hpp"
  47 #include "runtime/os.hpp"
  48 #include "runtime/safepoint.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/thread.hpp"
  53 #include "utilities/macros.hpp"
  54 #include "crc32c.h"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
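     // Table of negated condition codes, indexed by Assembler::Condition:
     // reverse[cc] is the condition that holds exactly when cc does not
     // (e.g. reverse[Assembler::zero] == Assembler::notZero).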
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that are distinct for 32-bit vs. 64-bit,
  94 // unless the difference is trivial (1 line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 
 119 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 120   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 121 }
 122 
 123 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 124   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 125 }
 126 
 127 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 128   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 129 }
 130 
 131 void MacroAssembler::extend_sign(Register hi, Register lo) {
 132   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 133   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 134     cdql();
 135   } else {
 136     movl(hi, lo);
 137     sarl(hi, 31);
 138   }
 139 }
 140 
 141 void MacroAssembler::jC2(Register tmp, Label& L) {
 142   // set parity bit if FPU flag C2 is set (via rax)
 143   save_rax(tmp);
 144   fwait(); fnstsw_ax();
 145   sahf();
 146   restore_rax(tmp);
 147   // branch
 148   jcc(Assembler::parity, L);
 149 }
 150 
 151 void MacroAssembler::jnC2(Register tmp, Label& L) {
 152   // set parity bit if FPU flag C2 is set (via rax)
 153   save_rax(tmp);
 154   fwait(); fnstsw_ax();
 155   sahf();
 156   restore_rax(tmp);
 157   // branch
 158   jcc(Assembler::noParity, L);
 159 }
 160 
 161 // 32bit can do a case table jump in one instruction but we no longer allow the base
 162 // to be installed in the Address class
 163 void MacroAssembler::jump(ArrayAddress entry) {
 164   jmp(as_Address(entry));
 165 }
 166 
 167 // Note: y_lo will be destroyed
 168 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 169   // Long compare for Java (semantics as described in JVM spec.)
 170   Label high, low, done;
 171 
 172   cmpl(x_hi, y_hi);
 173   jcc(Assembler::less, low);
 174   jcc(Assembler::greater, high);
 175   // x_hi is the return register
 176   xorl(x_hi, x_hi);
 177   cmpl(x_lo, y_lo);
 178   jcc(Assembler::below, low);
 179   jcc(Assembler::equal, done);
 180 
 181   bind(high);
 182   xorl(x_hi, x_hi);
 183   increment(x_hi);
 184   jmp(done);
 185 
 186   bind(low);
 187   xorl(x_hi, x_hi);
 188   decrementl(x_hi);
 189 
 190   bind(done);
 191 }
 192 
 193 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 194     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 195 }
 196 
 197 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 198   // leal(dst, as_Address(adr));
 199   // see note in movl as to why we must use a move
 200   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 201 }
 202 
 203 void MacroAssembler::leave() {
 204   mov(rsp, rbp);
 205   pop(rbp);
 206 }
 207 
 208 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 209   // Multiplication of two Java long values stored on the stack
 210   // as illustrated below. Result is in rdx:rax.
 211   //
 212   // rsp ---> [  ??  ] \               \
 213   //            ....    | y_rsp_offset  |
 214   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 215   //          [ y_hi ]                  | (in bytes)
 216   //            ....                    |
 217   //          [ x_lo ]                 /
 218   //          [ x_hi ]
 219   //            ....
 220   //
 221   // Basic idea: lo(result) = lo(x_lo * y_lo)
 222   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 223   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 224   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 225   Label quick;
 226   // load x_hi, y_hi and check if quick
 227   // multiplication is possible
 228   movl(rbx, x_hi);
 229   movl(rcx, y_hi);
 230   movl(rax, rbx);
 231   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 232   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 233   // do full multiplication
 234   // 1st step
 235   mull(y_lo);                                    // x_hi * y_lo
 236   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 237   // 2nd step
 238   movl(rax, x_lo);
 239   mull(rcx);                                     // x_lo * y_hi
 240   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 241   // 3rd step
 242   bind(quick);                                   // note: rbx = 0 if quick multiply!
 243   movl(rax, x_lo);
 244   mull(y_lo);                                    // x_lo * y_lo
 245   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 246 }
 247 
 248 void MacroAssembler::lneg(Register hi, Register lo) {
 249   negl(lo);
 250   adcl(hi, 0);
 251   negl(hi);
 252 }
 253 
 254 void MacroAssembler::lshl(Register hi, Register lo) {
 255   // Java shift left long support (semantics as described in JVM spec., p.305)
 256   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 257   // shift value is in rcx !
 258   assert(hi != rcx, "must not use rcx");
 259   assert(lo != rcx, "must not use rcx");
 260   const Register s = rcx;                        // shift count
 261   const int      n = BitsPerWord;
 262   Label L;
 263   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 264   cmpl(s, n);                                    // if (s < n)
 265   jcc(Assembler::less, L);                       // else (s >= n)
 266   movl(hi, lo);                                  // x := x << n
 267   xorl(lo, lo);
 268   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 269   bind(L);                                       // s (mod n) < n
 270   shldl(hi, lo);                                 // x := x << s
 271   shll(lo);
 272 }
 273 
 274 
 275 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 276   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 277   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 278   assert(hi != rcx, "must not use rcx");
 279   assert(lo != rcx, "must not use rcx");
 280   const Register s = rcx;                        // shift count
 281   const int      n = BitsPerWord;
 282   Label L;
 283   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 284   cmpl(s, n);                                    // if (s < n)
 285   jcc(Assembler::less, L);                       // else (s >= n)
 286   movl(lo, hi);                                  // x := x >> n
 287   if (sign_extension) sarl(hi, 31);
 288   else                xorl(hi, hi);
 289   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 290   bind(L);                                       // s (mod n) < n
 291   shrdl(lo, hi);                                 // x := x >> s
 292   if (sign_extension) sarl(hi);
 293   else                shrl(hi);
 294 }
 295 
 296 void MacroAssembler::movoop(Register dst, jobject obj) {
 297   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 298 }
 299 
 300 void MacroAssembler::movoop(Address dst, jobject obj) {
 301   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 302 }
 303 
 304 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 305   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 309   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 313   // scratch register is not used,
 314   // it is defined to match parameters of 64-bit version of this method.
 315   if (src.is_lval()) {
 316     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 317   } else {
 318     movl(dst, as_Address(src));
 319   }
 320 }
 321 
 322 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 323   movl(as_Address(dst), src);
 324 }
 325 
 326 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 327   movl(dst, as_Address(src));
 328 }
 329 
 330 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 331 void MacroAssembler::movptr(Address dst, intptr_t src) {
 332   movl(dst, src);
 333 }
 334 
 335 
 336 void MacroAssembler::pop_callee_saved_registers() {
 337   pop(rcx);
 338   pop(rdx);
 339   pop(rdi);
 340   pop(rsi);
 341 }
 342 
 343 void MacroAssembler::push_callee_saved_registers() {
 344   push(rsi);
 345   push(rdi);
 346   push(rdx);
 347   push(rcx);
 348 }
 349 
 350 void MacroAssembler::pushoop(jobject obj) {
 351   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 352 }
 353 
 354 void MacroAssembler::pushklass(Metadata* obj) {
 355   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 356 }
 357 
 358 void MacroAssembler::pushptr(AddressLiteral src) {
 359   if (src.is_lval()) {
 360     push_literal32((int32_t)src.target(), src.rspec());
 361   } else {
 362     pushl(as_Address(src));
 363   }
 364 }
 365 
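     // On 32-bit, call_VM arguments are passed on the stack, so these helpers
     // simply push their argument; the 64-bit versions later in this file move
     // it into the corresponding c_rarg register instead.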
 366 static void pass_arg0(MacroAssembler* masm, Register arg) {
 367   masm->push(arg);
 368 }
 369 
 370 static void pass_arg1(MacroAssembler* masm, Register arg) {
 371   masm->push(arg);
 372 }
 373 
 374 static void pass_arg2(MacroAssembler* masm, Register arg) {
 375   masm->push(arg);
 376 }
 377 
 378 static void pass_arg3(MacroAssembler* masm, Register arg) {
 379   masm->push(arg);
 380 }
 381 
 382 #ifndef PRODUCT
 383 extern "C" void findpc(intptr_t x);
 384 #endif
 385 
 386 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 387   // In order to get locks to work, we need to fake an in_VM state
 388   JavaThread* thread = JavaThread::current();
 389   JavaThreadState saved_state = thread->thread_state();
 390   thread->set_thread_state(_thread_in_vm);
 391   if (ShowMessageBoxOnError) {
 392     JavaThread* thread = JavaThread::current();
 393     JavaThreadState saved_state = thread->thread_state();
 394     thread->set_thread_state(_thread_in_vm);
 395     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 396       ttyLocker ttyl;
 397       BytecodeCounter::print();
 398     }
 399     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 400     // This is the value of eip which points to where verify_oop will return.
 401     if (os::message_box(msg, "Execution stopped, print registers?")) {
 402       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 403       BREAKPOINT;
 404     }
 405   }
 406   fatal("DEBUG MESSAGE: %s", msg);
 407 }
 408 
 409 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 410   ttyLocker ttyl;
 411   FlagSetting fs(Debugging, true);
 412   tty->print_cr("eip = 0x%08x", eip);
 413 #ifndef PRODUCT
 414   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 415     tty->cr();
 416     findpc(eip);
 417     tty->cr();
 418   }
 419 #endif
 420 #define PRINT_REG(rax) \
 421   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 422   PRINT_REG(rax);
 423   PRINT_REG(rbx);
 424   PRINT_REG(rcx);
 425   PRINT_REG(rdx);
 426   PRINT_REG(rdi);
 427   PRINT_REG(rsi);
 428   PRINT_REG(rbp);
 429   PRINT_REG(rsp);
 430 #undef PRINT_REG
 431   // Print some words near the top of the stack.
 432   int* dump_sp = (int*) rsp;
 433   for (int col1 = 0; col1 < 8; col1++) {
 434     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 435     os::print_location(tty, *dump_sp++);
 436   }
 437   for (int row = 0; row < 16; row++) {
 438     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 439     for (int col = 0; col < 8; col++) {
 440       tty->print(" 0x%08x", *dump_sp++);
 441     }
 442     tty->cr();
 443   }
 444   // Print some instructions around pc:
 445   Disassembler::decode((address)eip-64, (address)eip);
 446   tty->print_cr("--------");
 447   Disassembler::decode((address)eip, (address)eip+32);
 448 }
 449 
 450 void MacroAssembler::stop(const char* msg) {
 451   ExternalAddress message((address)msg);
 452   // push address of message
 453   pushptr(message.addr());
 454   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 455   pusha();                                            // push registers
 456   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 457   hlt();
 458 }
 459 
 460 void MacroAssembler::warn(const char* msg) {
 461   push_CPU_state();
 462 
 463   ExternalAddress message((address) msg);
 464   // push address of message
 465   pushptr(message.addr());
 466 
 467   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 468   addl(rsp, wordSize);       // discard argument
 469   pop_CPU_state();
 470 }
 471 
 472 void MacroAssembler::print_state() {
 473   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 474   pusha();                                            // push registers
 475 
 476   push_CPU_state();
 477   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 478   pop_CPU_state();
 479 
 480   popa();
 481   addl(rsp, wordSize);
 482 }
 483 
 484 #else // _LP64
 485 
 486 // 64 bit versions
 487 
 488 Address MacroAssembler::as_Address(AddressLiteral adr) {
 489   // amd64 always does this as a pc-rel;
 490   // we can be absolute or disp based on the instruction type.
 491   // jmp/call are displacements, others are absolute.
 492   assert(!adr.is_lval(), "must be rval");
 493   assert(reachable(adr), "must be");
 494   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 495 
 496 }
 497 
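     // Note: materializes the array base in rscratch1 (clobbering it) and
     // returns an Address that indexes off that register.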
 498 Address MacroAssembler::as_Address(ArrayAddress adr) {
 499   AddressLiteral base = adr.base();
 500   lea(rscratch1, base);
 501   Address index = adr.index();
 502   assert(index._disp == 0, "must not have disp"); // maybe it can?
 503   Address array(rscratch1, index._index, index._scale, index._disp);
 504   return array;
 505 }
 506 
 507 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 508   Label L, E;
 509 
 510 #ifdef _WIN64
 511   // Windows always allocates space for its register args
 512   assert(num_args <= 4, "only register arguments supported");
 513   subq(rsp,  frame::arg_reg_save_area_bytes);
 514 #endif
 515 
 516   // Align stack if necessary
 517   testl(rsp, 15);
 518   jcc(Assembler::zero, L);
 519 
 520   subq(rsp, 8);
 521   {
 522     call(RuntimeAddress(entry_point));
 523     oopmap_metadata(-1);
 524   }
 525   addq(rsp, 8);
 526   jmp(E);
 527 
 528   bind(L);
 529   {
 530     call(RuntimeAddress(entry_point));
 531     oopmap_metadata(-1);
 532   }
 533 
 534   bind(E);
 535 
 536 #ifdef _WIN64
 537   // restore stack pointer
 538   addq(rsp, frame::arg_reg_save_area_bytes);
 539 #endif
 540 
 541 }
 542 
 543 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 544   assert(!src2.is_lval(), "should use cmpptr");
 545 
 546   if (reachable(src2)) {
 547     cmpq(src1, as_Address(src2));
 548   } else {
 549     lea(rscratch1, src2);
 550     Assembler::cmpq(src1, Address(rscratch1, 0));
 551   }
 552 }
 553 
 554 int MacroAssembler::corrected_idivq(Register reg) {
 555   // Full implementation of Java ldiv and lrem; checks for special
 556   // case as described in JVM spec., p.243 & p.271.  The function
 557   // returns the (pc) offset of the idivl instruction - may be needed
 558   // for implicit exceptions.
 559   //
 560   //         normal case                           special case
 561   //
 562   // input : rax: dividend                         min_long
 563   //         reg: divisor   (may not be eax/edx)   -1
 564   //
 565   // output: rax: quotient  (= rax idiv reg)       min_long
 566   //         rdx: remainder (= rax irem reg)       0
 567   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 568   static const int64_t min_long = 0x8000000000000000;
 569   Label normal_case, special_case;
 570 
 571   // check for special case
 572   cmp64(rax, ExternalAddress((address) &min_long));
 573   jcc(Assembler::notEqual, normal_case);
 574   xorl(rdx, rdx); // prepare rdx for possible special case (where
 575                   // remainder = 0)
 576   cmpq(reg, -1);
 577   jcc(Assembler::equal, special_case);
 578 
 579   // handle normal case
 580   bind(normal_case);
 581   cdqq();
 582   int idivq_offset = offset();
 583   idivq(reg);
 584 
 585   // normal and special case exit
 586   bind(special_case);
 587 
 588   return idivq_offset;
 589 }
 590 
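     // The min_jint special case in the increment/decrement helpers below avoids
     // negating the most negative int (-min_jint overflows); that value is
     // handled with a plain addq/subq instead.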
 591 void MacroAssembler::decrementq(Register reg, int value) {
 592   if (value == min_jint) { subq(reg, value); return; }
 593   if (value <  0) { incrementq(reg, -value); return; }
 594   if (value == 0) {                        ; return; }
 595   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 596   /* else */      { subq(reg, value)       ; return; }
 597 }
 598 
 599 void MacroAssembler::decrementq(Address dst, int value) {
 600   if (value == min_jint) { subq(dst, value); return; }
 601   if (value <  0) { incrementq(dst, -value); return; }
 602   if (value == 0) {                        ; return; }
 603   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 604   /* else */      { subq(dst, value)       ; return; }
 605 }
 606 
 607 void MacroAssembler::incrementq(AddressLiteral dst) {
 608   if (reachable(dst)) {
 609     incrementq(as_Address(dst));
 610   } else {
 611     lea(rscratch1, dst);
 612     incrementq(Address(rscratch1, 0));
 613   }
 614 }
 615 
 616 void MacroAssembler::incrementq(Register reg, int value) {
 617   if (value == min_jint) { addq(reg, value); return; }
 618   if (value <  0) { decrementq(reg, -value); return; }
 619   if (value == 0) {                        ; return; }
 620   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 621   /* else */      { addq(reg, value)       ; return; }
 622 }
 623 
 624 void MacroAssembler::incrementq(Address dst, int value) {
 625   if (value == min_jint) { addq(dst, value); return; }
 626   if (value <  0) { decrementq(dst, -value); return; }
 627   if (value == 0) {                        ; return; }
 628   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 629   /* else */      { addq(dst, value)       ; return; }
 630 }
 631 
 632 // 32bit can do a case table jump in one instruction but we no longer allow the base
 633 // to be installed in the Address class
 634 void MacroAssembler::jump(ArrayAddress entry) {
 635   lea(rscratch1, entry.base());
 636   Address dispatch = entry.index();
 637   assert(dispatch._base == noreg, "must be");
 638   dispatch._base = rscratch1;
 639   jmp(dispatch);
 640 }
 641 
 642 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 643   ShouldNotReachHere(); // 64bit doesn't use two regs
 644   cmpq(x_lo, y_lo);
 645 }
 646 
 647 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 648     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 649 }
 650 
 651 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 652   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 653   movptr(dst, rscratch1);
 654 }
 655 
 656 void MacroAssembler::leave() {
 657   // %%% is this really better? Why not on 32bit too?
 658   emit_int8((unsigned char)0xC9); // LEAVE
 659 }
 660 
 661 void MacroAssembler::lneg(Register hi, Register lo) {
 662   ShouldNotReachHere(); // 64bit doesn't use two regs
 663   negq(lo);
 664 }
 665 
 666 void MacroAssembler::movoop(Register dst, jobject obj) {
 667   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 668 }
 669 
 670 void MacroAssembler::movoop(Address dst, jobject obj) {
 671   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 672   movq(dst, rscratch1);
 673 }
 674 
 675 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 676   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 677 }
 678 
 679 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 680   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 681   movq(dst, rscratch1);
 682 }
 683 
 684 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 685   if (src.is_lval()) {
 686     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 687   } else {
 688     if (reachable(src)) {
 689       movq(dst, as_Address(src));
 690     } else {
 691       lea(scratch, src);
 692       movq(dst, Address(scratch, 0));
 693     }
 694   }
 695 }
 696 
 697 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 698   movq(as_Address(dst), src);
 699 }
 700 
 701 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 702   movq(dst, as_Address(src));
 703 }
 704 
 705 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 706 void MacroAssembler::movptr(Address dst, intptr_t src) {
 707   if (is_simm32(src)) {
 708     movptr(dst, checked_cast<int32_t>(src));
 709   } else {
 710     mov64(rscratch1, src);
 711     movq(dst, rscratch1);
 712   }
 713 }
 714 
 715 // These are mostly for initializing NULL
 716 void MacroAssembler::movptr(Address dst, int32_t src) {
 717   movslq(dst, src);
 718 }
 719 
 720 void MacroAssembler::movptr(Register dst, int32_t src) {
 721   mov64(dst, (intptr_t)src);
 722 }
 723 
 724 void MacroAssembler::pushoop(jobject obj) {
 725   movoop(rscratch1, obj);
 726   push(rscratch1);
 727 }
 728 
 729 void MacroAssembler::pushklass(Metadata* obj) {
 730   mov_metadata(rscratch1, obj);
 731   push(rscratch1);
 732 }
 733 
 734 void MacroAssembler::pushptr(AddressLiteral src) {
 735   lea(rscratch1, src);
 736   if (src.is_lval()) {
 737     push(rscratch1);
 738   } else {
 739     pushq(Address(rscratch1, 0));
 740   }
 741 }
 742 
 743 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 744   reset_last_Java_frame(r15_thread, clear_fp);
 745 }
 746 
 747 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 748                                          Register last_java_fp,
 749                                          address  last_java_pc) {
 750   vzeroupper();
 751   // determine last_java_sp register
 752   if (!last_java_sp->is_valid()) {
 753     last_java_sp = rsp;
 754   }
 755 
 756   // last_java_fp is optional
 757   if (last_java_fp->is_valid()) {
 758     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 759            last_java_fp);
 760   }
 761 
 762   // last_java_pc is optional
 763   if (last_java_pc != NULL) {
 764     Address java_pc(r15_thread,
 765                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 766     lea(rscratch1, InternalAddress(last_java_pc));
 767     movptr(java_pc, rscratch1);
 768   }
 769 
 770   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 771 }
 772 
 773 static void pass_arg0(MacroAssembler* masm, Register arg) {
 774   if (c_rarg0 != arg ) {
 775     masm->mov(c_rarg0, arg);
 776   }
 777 }
 778 
 779 static void pass_arg1(MacroAssembler* masm, Register arg) {
 780   if (c_rarg1 != arg ) {
 781     masm->mov(c_rarg1, arg);
 782   }
 783 }
 784 
 785 static void pass_arg2(MacroAssembler* masm, Register arg) {
 786   if (c_rarg2 != arg ) {
 787     masm->mov(c_rarg2, arg);
 788   }
 789 }
 790 
 791 static void pass_arg3(MacroAssembler* masm, Register arg) {
 792   if (c_rarg3 != arg ) {
 793     masm->mov(c_rarg3, arg);
 794   }
 795 }
 796 
 797 void MacroAssembler::stop(const char* msg) {
 798   if (ShowMessageBoxOnError) {
 799     address rip = pc();
 800     pusha(); // get regs on stack
 801     lea(c_rarg1, InternalAddress(rip));
 802     movq(c_rarg2, rsp); // pass pointer to regs array
 803   }
 804   lea(c_rarg0, ExternalAddress((address) msg));
 805   andq(rsp, -16); // align stack as required by ABI
 806   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 807   hlt();
 808 }
 809 
 810 void MacroAssembler::warn(const char* msg) {
 811   push(rbp);
 812   movq(rbp, rsp);
 813   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 814   push_CPU_state();   // keeps alignment at 16 bytes
 815   lea(c_rarg0, ExternalAddress((address) msg));
 816   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 817   call(rax);
 818   pop_CPU_state();
 819   mov(rsp, rbp);
 820   pop(rbp);
 821 }
 822 
 823 void MacroAssembler::_assert_asm(Assembler::Condition cc, const char* msg) {
 824 #ifdef ASSERT
 825   Label OK;
 826   jcc(cc, OK);
 827   stop(msg);
 828   bind(OK);
 829 #endif
 830 }
 831 
 832 void MacroAssembler::print_state() {
 833   address rip = pc();
 834   pusha();            // get regs on stack
 835   push(rbp);
 836   movq(rbp, rsp);
 837   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 838   push_CPU_state();   // keeps alignment at 16 bytes
 839 
 840   lea(c_rarg0, InternalAddress(rip));
 841   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 842   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 843 
 844   pop_CPU_state();
 845   mov(rsp, rbp);
 846   pop(rbp);
 847   popa();
 848 }
 849 
 850 #ifndef PRODUCT
 851 extern "C" void findpc(intptr_t x);
 852 #endif
 853 
 854 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 855   // In order to get locks to work, we need to fake an in_VM state
 856   if (ShowMessageBoxOnError) {
 857     JavaThread* thread = JavaThread::current();
 858     JavaThreadState saved_state = thread->thread_state();
 859     thread->set_thread_state(_thread_in_vm);
 860 #ifndef PRODUCT
 861     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 862       ttyLocker ttyl;
 863       BytecodeCounter::print();
 864     }
 865 #endif
 866     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 867     // XXX correct this offset for amd64
 868     // This is the value of eip which points to where verify_oop will return.
 869     if (os::message_box(msg, "Execution stopped, print registers?")) {
 870       print_state64(pc, regs);
 871       BREAKPOINT;
 872     }
 873   }
 874   fatal("DEBUG MESSAGE: %s", msg);
 875 }
 876 
 877 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 878   ttyLocker ttyl;
 879   FlagSetting fs(Debugging, true);
 880   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 881 #ifndef PRODUCT
 882   tty->cr();
 883   findpc(pc);
 884   tty->cr();
 885 #endif
 886 #define PRINT_REG(rax, value) \
 887   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 888   PRINT_REG(rax, regs[15]);
 889   PRINT_REG(rbx, regs[12]);
 890   PRINT_REG(rcx, regs[14]);
 891   PRINT_REG(rdx, regs[13]);
 892   PRINT_REG(rdi, regs[8]);
 893   PRINT_REG(rsi, regs[9]);
 894   PRINT_REG(rbp, regs[10]);
 895   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 896   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 897   PRINT_REG(r8 , regs[7]);
 898   PRINT_REG(r9 , regs[6]);
 899   PRINT_REG(r10, regs[5]);
 900   PRINT_REG(r11, regs[4]);
 901   PRINT_REG(r12, regs[3]);
 902   PRINT_REG(r13, regs[2]);
 903   PRINT_REG(r14, regs[1]);
 904   PRINT_REG(r15, regs[0]);
 905 #undef PRINT_REG
 906   // Print some words near the top of the stack.
 907   int64_t* rsp = &regs[16];
 908   int64_t* dump_sp = rsp;
 909   for (int col1 = 0; col1 < 8; col1++) {
 910     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 911     os::print_location(tty, *dump_sp++);
 912   }
 913   for (int row = 0; row < 25; row++) {
 914     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 915     for (int col = 0; col < 4; col++) {
 916       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 917     }
 918     tty->cr();
 919   }
 920   // Print some instructions around pc:
 921   Disassembler::decode((address)pc-64, (address)pc);
 922   tty->print_cr("--------");
 923   Disassembler::decode((address)pc, (address)pc+32);
 924 }
 925 
 926 // The java_calling_convention describes stack locations as ideal slots on
 927 // a frame with no abi restrictions. Since we must observe abi restrictions
 928 // (like the placement of the register window) the slots must be biased by
 929 // the following value.
 930 static int reg2offset_in(VMReg r) {
 931   // Account for saved rbp and return address
 932   // This should really be in_preserve_stack_slots
 933   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 934 }
 935 
 936 static int reg2offset_out(VMReg r) {
 937   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 938 }
 939 
 940 // A long move
 941 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {
 942 
 943   // The calling convention assures us that each VMRegPair is either
 944   // entirely one physical register or a pair of adjacent stack slots.
 945 
 946   if (src.is_single_phys_reg() ) {
 947     if (dst.is_single_phys_reg()) {
 948       if (dst.first() != src.first()) {
 949         mov(dst.first()->as_Register(), src.first()->as_Register());
 950       }
 951     } else {
 952       assert(dst.is_single_reg(), "not a stack pair");
 953       movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
 954     }
 955   } else if (dst.is_single_phys_reg()) {
 956     assert(src.is_single_reg(),  "not a stack pair");
 957     movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
 958   } else {
 959     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 960     movq(rax, Address(rbp, reg2offset_in(src.first())));
 961     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 962   }
 963 }
 964 
 965 // A double move
 966 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {
 967 
 968   // The calling convention assures us that each VMRegPair is either
 969   // entirely one physical register or a pair of adjacent stack slots.
 970 
 971   if (src.is_single_phys_reg() ) {
 972     if (dst.is_single_phys_reg()) {
 973       // In theory these overlap but the ordering is such that this is likely a nop
 974       if ( src.first() != dst.first()) {
 975         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 976       }
 977     } else {
 978       assert(dst.is_single_reg(), "not a stack pair");
 979       movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
 980     }
 981   } else if (dst.is_single_phys_reg()) {
 982     assert(src.is_single_reg(),  "not a stack pair");
 983     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
 984   } else {
 985     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 986     movq(rax, Address(rbp, reg2offset_in(src.first())));
 987     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 988   }
 989 }
 990 
 991 
 992 // A float arg may have to do float reg int reg conversion
 993 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
 994   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 995 
 996   // The calling convention assures us that each VMRegPair is either
 997   // entirely one physical register or a pair of adjacent stack slots.
 998 
 999   if (src.first()->is_stack()) {
1000     if (dst.first()->is_stack()) {
1001       movl(rax, Address(rbp, reg2offset_in(src.first())));
1002       movptr(Address(rsp, reg2offset_out(dst.first())), rax);
1003     } else {
1004       // stack to reg
1005       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1006       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
1007     }
1008   } else if (dst.first()->is_stack()) {
1009     // reg to stack
1010     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1011     movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1012   } else {
1013     // reg to reg
1014     // In theory these overlap but the ordering is such that this is likely a nop
1015     if ( src.first() != dst.first()) {
1016       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
1017     }
1018   }
1019 }
1020 
1021 // On 64-bit we store integer-like items to the stack as 64-bit items
1022 // (x86_32/64 ABI), even though Java would only store 32 bits for a parameter.
1023 // On 32-bit it is simply 32 bits.
1024 // So this routine does 32->32 on 32-bit and 32->64 on 64-bit.
1025 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1026   if (src.first()->is_stack()) {
1027     if (dst.first()->is_stack()) {
1028       // stack to stack
1029       movslq(rax, Address(rbp, reg2offset_in(src.first())));
1030       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1031     } else {
1032       // stack to reg
1033       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1034     }
1035   } else if (dst.first()->is_stack()) {
1036     // reg to stack
1037     // Do we really have to sign extend???
1038     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1039     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1040   } else {
1041     // Do we really have to sign extend???
1042     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1043     if (dst.first() != src.first()) {
1044       movq(dst.first()->as_Register(), src.first()->as_Register());
1045     }
1046   }
1047 }
1048 
1049 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1050   if (src.first()->is_stack()) {
1051     if (dst.first()->is_stack()) {
1052       // stack to stack
1053       movq(rax, Address(rbp, reg2offset_in(src.first())));
1054       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1055     } else {
1056       // stack to reg
1057       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1058     }
1059   } else if (dst.first()->is_stack()) {
1060     // reg to stack
1061     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1062   } else {
1063     if (dst.first() != src.first()) {
1064       movq(dst.first()->as_Register(), src.first()->as_Register());
1065     }
1066   }
1067 }
1068 
1069 // An oop arg. Must pass a handle not the oop itself
1070 void MacroAssembler::object_move(OopMap* map,
1071                         int oop_handle_offset,
1072                         int framesize_in_slots,
1073                         VMRegPair src,
1074                         VMRegPair dst,
1075                         bool is_receiver,
1076                         int* receiver_offset) {
1077 
1078   // must pass a handle. First figure out the location we use as a handle
1079 
1080   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1081 
1082   // See if the oop is NULL; if it is, we need no handle
1083 
1084   if (src.first()->is_stack()) {
1085 
1086     // Oop is already on the stack as an argument
1087     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1088     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1089     if (is_receiver) {
1090       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1091     }
1092 
1093     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1094     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1095     // conditionally move a NULL
1096     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1097   } else {
1098 
1099     // Oop is in a register; we must store it to the space we reserve
1100     // on the stack for oop_handles and pass a handle if the oop is non-NULL
1101 
1102     const Register rOop = src.first()->as_Register();
1103     int oop_slot;
1104     if (rOop == j_rarg0)
1105       oop_slot = 0;
1106     else if (rOop == j_rarg1)
1107       oop_slot = 1;
1108     else if (rOop == j_rarg2)
1109       oop_slot = 2;
1110     else if (rOop == j_rarg3)
1111       oop_slot = 3;
1112     else if (rOop == j_rarg4)
1113       oop_slot = 4;
1114     else {
1115       assert(rOop == j_rarg5, "wrong register");
1116       oop_slot = 5;
1117     }
1118 
1119     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1120     int offset = oop_slot*VMRegImpl::stack_slot_size;
1121 
1122     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1123     // Store oop in handle area, may be NULL
1124     movptr(Address(rsp, offset), rOop);
1125     if (is_receiver) {
1126       *receiver_offset = offset;
1127     }
1128 
1129     cmpptr(rOop, (int32_t)NULL_WORD);
1130     lea(rHandle, Address(rsp, offset));
1131     // conditionally move a NULL from the handle area where it was just stored
1132     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1133   }
1134 
1135   // If the arg is on the stack then place it; otherwise it is already in the correct reg.
1136   if (dst.first()->is_stack()) {
1137     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1138   }
1139 }
1140 
1141 #endif // _LP64
1142 
1143 // Now versions that are common to 32/64 bit
1144 
1145 void MacroAssembler::oopmap_metadata(int index) {
1146   // if (index != -1) tty->print_cr("oopmap_metadata %d", index);
1147   // mov64(r10, 1234); // TODO: Add a new relocInfo with external semantics. see relocInfo::metadata_type
1148 }
1149 
1150 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1151   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1152 }
1153 
1154 void MacroAssembler::addptr(Register dst, Register src) {
1155   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1156 }
1157 
1158 void MacroAssembler::addptr(Address dst, Register src) {
1159   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1160 }
1161 
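     // The AddressLiteral overloads below share a common pattern: if the target
     // is reachable with a 32-bit displacement, address it directly; otherwise
     // materialize the address in a scratch register first.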
1162 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1163   if (reachable(src)) {
1164     Assembler::addsd(dst, as_Address(src));
1165   } else {
1166     lea(rscratch1, src);
1167     Assembler::addsd(dst, Address(rscratch1, 0));
1168   }
1169 }
1170 
1171 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1172   if (reachable(src)) {
1173     addss(dst, as_Address(src));
1174   } else {
1175     lea(rscratch1, src);
1176     addss(dst, Address(rscratch1, 0));
1177   }
1178 }
1179 
1180 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1181   if (reachable(src)) {
1182     Assembler::addpd(dst, as_Address(src));
1183   } else {
1184     lea(rscratch1, src);
1185     Assembler::addpd(dst, Address(rscratch1, 0));
1186   }
1187 }
1188 
1189 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1190 // Stub code is generated once and never copied.
1191 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1192 void MacroAssembler::align64() {
1193   align(64, (unsigned long long) pc());
1194 }
1195 
1196 void MacroAssembler::align32() {
1197   align(32, (unsigned long long) pc());
1198 }
1199 
1200 void MacroAssembler::align(int modulus) {
1201   // 8273459: Ensure alignment is possible with current segment alignment
1202   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1203   align(modulus, offset());
1204 }
1205 
1206 void MacroAssembler::align(int modulus, int target) {
1207   if (target % modulus != 0) {
1208     nop(modulus - (target % modulus));
1209   }
1210 }
1211 
1212 void MacroAssembler::push_f(XMMRegister r) {
1213   subptr(rsp, wordSize);
1214   movflt(Address(rsp, 0), r);
1215 }
1216 
1217 void MacroAssembler::pop_f(XMMRegister r) {
1218   movflt(r, Address(rsp, 0));
1219   addptr(rsp, wordSize);
1220 }
1221 
1222 void MacroAssembler::push_d(XMMRegister r) {
1223   subptr(rsp, 2 * wordSize);
1224   movdbl(Address(rsp, 0), r);
1225 }
1226 
1227 void MacroAssembler::pop_d(XMMRegister r) {
1228   movdbl(r, Address(rsp, 0));
1229   addptr(rsp, 2 * Interpreter::stackElementSize);
1230 }
1231 
1232 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1233   // Used in sign-masking with aligned address.
1234   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1235   if (reachable(src)) {
1236     Assembler::andpd(dst, as_Address(src));
1237   } else {
1238     lea(scratch_reg, src);
1239     Assembler::andpd(dst, Address(scratch_reg, 0));
1240   }
1241 }
1242 
1243 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1244   // Used in sign-masking with aligned address.
1245   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1246   if (reachable(src)) {
1247     Assembler::andps(dst, as_Address(src));
1248   } else {
1249     lea(scratch_reg, src);
1250     Assembler::andps(dst, Address(scratch_reg, 0));
1251   }
1252 }
1253 
1254 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1255   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1256 }
1257 
1258 void MacroAssembler::atomic_incl(Address counter_addr) {
1259   lock();
1260   incrementl(counter_addr);
1261 }
1262 
1263 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1264   if (reachable(counter_addr)) {
1265     atomic_incl(as_Address(counter_addr));
1266   } else {
1267     lea(scr, counter_addr);
1268     atomic_incl(Address(scr, 0));
1269   }
1270 }
1271 
1272 #ifdef _LP64
1273 void MacroAssembler::atomic_incq(Address counter_addr) {
1274   lock();
1275   incrementq(counter_addr);
1276 }
1277 
1278 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1279   if (reachable(counter_addr)) {
1280     atomic_incq(as_Address(counter_addr));
1281   } else {
1282     lea(scr, counter_addr);
1283     atomic_incq(Address(scr, 0));
1284   }
1285 }
1286 #endif
1287 
1288 // Writes to successive stack pages until the given offset is reached, to check
1289 // for stack overflow + shadow pages.  This clobbers tmp.
1290 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1291   movptr(tmp, rsp);
1292   // Bang stack for total size given plus shadow page size.
1293   // Bang one page at a time because large size can bang beyond yellow and
1294   // red zones.
1295   Label loop;
1296   bind(loop);
1297   movl(Address(tmp, (-os::vm_page_size())), size );
1298   subptr(tmp, os::vm_page_size());
1299   subl(size, os::vm_page_size());
1300   jcc(Assembler::greater, loop);
1301 
1302   // Bang down shadow pages too.
1303   // At this point, (tmp-0) is the last address touched, so don't
1304   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1305   // was post-decremented.)  Skip this address by starting at i=1, and
1306   // touch a few more pages below.  N.B.  It is important to touch all
1307   // the way down including all pages in the shadow zone.
1308   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1309     // this could be any sized move, but this can be a debugging crumb
1310     // so the bigger the better.
1311     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1312   }
1313 }
1314 
1315 void MacroAssembler::reserved_stack_check() {
1316     // testing if reserved zone needs to be enabled
1317     Label no_reserved_zone_enabling;
1318     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1319     NOT_LP64(get_thread(rsi);)
1320 
1321     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1322     jcc(Assembler::below, no_reserved_zone_enabling);
1323 
1324     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1325     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1326     should_not_reach_here();
1327 
1328     bind(no_reserved_zone_enabling);
1329 }
1330 
1331 void MacroAssembler::c2bool(Register x) {
1332   // implements x == 0 ? 0 : 1
1333   // note: must only look at least-significant byte of x
1334   //       since C-style booleans are stored in one byte
1335   //       only! (was bug)
1336   andl(x, 0xFF);
1337   setb(Assembler::notZero, x);
1338 }
1339 
1340 // Wouldn't need if AddressLiteral version had new name
1341 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1342   Assembler::call(L, rtype);
1343 }
1344 
1345 void MacroAssembler::call(Register entry) {
1346   Assembler::call(entry);
1347 }
1348 
1349 void MacroAssembler::call(AddressLiteral entry) {
1350   if (reachable(entry)) {
1351     Assembler::call_literal(entry.target(), entry.rspec());
1352   } else {
1353     lea(rscratch1, entry);
1354     Assembler::call(rscratch1);
1355   }
1356 }
1357 
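     // Inline cache call: rax is preloaded with Universe::non_oop_word() as the
     // cached IC value, and the call site is tagged with a virtual_call relocation
     // carrying the method index.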
1358 void MacroAssembler::ic_call(address entry, jint method_index) {
1359   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1360   movptr(rax, (intptr_t)Universe::non_oop_word());
1361   call(AddressLiteral(entry, rh));
1362 }
1363 
1364 // Implementation of call_VM versions
1365 
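     // Each call_VM variant below uses a local trampoline: "call C" pushes the
     // address of the following "jmp E", call_VM_helper then runs with that
     // return address on the stack, and its "ret(0)" lands on the jmp, which
     // resumes execution at E.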
1366 void MacroAssembler::call_VM(Register oop_result,
1367                              address entry_point,
1368                              bool check_exceptions) {
1369   Label C, E;
1370   call(C, relocInfo::none);
1371   jmp(E);
1372 
1373   bind(C);
1374   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1375   ret(0);
1376 
1377   bind(E);
1378 }
1379 
1380 void MacroAssembler::call_VM(Register oop_result,
1381                              address entry_point,
1382                              Register arg_1,
1383                              bool check_exceptions) {
1384   Label C, E;
1385   call(C, relocInfo::none);
1386   jmp(E);
1387 
1388   bind(C);
1389   pass_arg1(this, arg_1);
1390   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1391   ret(0);
1392 
1393   bind(E);
1394 }
1395 
1396 void MacroAssembler::call_VM(Register oop_result,
1397                              address entry_point,
1398                              Register arg_1,
1399                              Register arg_2,
1400                              bool check_exceptions) {
1401   Label C, E;
1402   call(C, relocInfo::none);
1403   jmp(E);
1404 
1405   bind(C);
1406 
1407   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1408 
1409   pass_arg2(this, arg_2);
1410   pass_arg1(this, arg_1);
1411   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1412   ret(0);
1413 
1414   bind(E);
1415 }
1416 
1417 void MacroAssembler::call_VM(Register oop_result,
1418                              address entry_point,
1419                              Register arg_1,
1420                              Register arg_2,
1421                              Register arg_3,
1422                              bool check_exceptions) {
1423   Label C, E;
1424   call(C, relocInfo::none);
1425   jmp(E);
1426 
1427   bind(C);
1428 
1429   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1430   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1431   pass_arg3(this, arg_3);
1432 
1433   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1434   pass_arg2(this, arg_2);
1435 
1436   pass_arg1(this, arg_1);
1437   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1438   ret(0);
1439 
1440   bind(E);
1441 }
1442 
1443 void MacroAssembler::call_VM(Register oop_result,
1444                              Register last_java_sp,
1445                              address entry_point,
1446                              int number_of_arguments,
1447                              bool check_exceptions) {
1448   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1449   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1450 }
1451 
1452 void MacroAssembler::call_VM(Register oop_result,
1453                              Register last_java_sp,
1454                              address entry_point,
1455                              Register arg_1,
1456                              bool check_exceptions) {
1457   pass_arg1(this, arg_1);
1458   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1459 }
1460 
1461 void MacroAssembler::call_VM(Register oop_result,
1462                              Register last_java_sp,
1463                              address entry_point,
1464                              Register arg_1,
1465                              Register arg_2,
1466                              bool check_exceptions) {
1467 
1468   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1469   pass_arg2(this, arg_2);
1470   pass_arg1(this, arg_1);
1471   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1472 }
1473 
1474 void MacroAssembler::call_VM(Register oop_result,
1475                              Register last_java_sp,
1476                              address entry_point,
1477                              Register arg_1,
1478                              Register arg_2,
1479                              Register arg_3,
1480                              bool check_exceptions) {
1481   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1482   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1483   pass_arg3(this, arg_3);
1484   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1485   pass_arg2(this, arg_2);
1486   pass_arg1(this, arg_1);
1487   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1488 }
1489 
1490 void MacroAssembler::super_call_VM(Register oop_result,
1491                                    Register last_java_sp,
1492                                    address entry_point,
1493                                    int number_of_arguments,
1494                                    bool check_exceptions) {
1495   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1496   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1497 }
1498 
1499 void MacroAssembler::super_call_VM(Register oop_result,
1500                                    Register last_java_sp,
1501                                    address entry_point,
1502                                    Register arg_1,
1503                                    bool check_exceptions) {
1504   pass_arg1(this, arg_1);
1505   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1506 }
1507 
1508 void MacroAssembler::super_call_VM(Register oop_result,
1509                                    Register last_java_sp,
1510                                    address entry_point,
1511                                    Register arg_1,
1512                                    Register arg_2,
1513                                    bool check_exceptions) {
1514 
1515   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1516   pass_arg2(this, arg_2);
1517   pass_arg1(this, arg_1);
1518   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1519 }
1520 
1521 void MacroAssembler::super_call_VM(Register oop_result,
1522                                    Register last_java_sp,
1523                                    address entry_point,
1524                                    Register arg_1,
1525                                    Register arg_2,
1526                                    Register arg_3,
1527                                    bool check_exceptions) {
1528   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1529   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1530   pass_arg3(this, arg_3);
1531   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1532   pass_arg2(this, arg_2);
1533   pass_arg1(this, arg_1);
1534   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1535 }
1536 
1537 void MacroAssembler::call_VM_base(Register oop_result,
1538                                   Register java_thread,
1539                                   Register last_java_sp,
1540                                   address  entry_point,
1541                                   int      number_of_arguments,
1542                                   bool     check_exceptions) {
1543   // determine java_thread register
1544   if (!java_thread->is_valid()) {
1545 #ifdef _LP64
1546     java_thread = r15_thread;
1547 #else
1548     java_thread = rdi;
1549     get_thread(java_thread);
1550 #endif // LP64
1551   }
1552   // determine last_java_sp register
1553   if (!last_java_sp->is_valid()) {
1554     last_java_sp = rsp;
1555   }
1556   // debugging support
1557   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1558   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1559 #ifdef ASSERT
1560   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1561   // r12 is the heapbase.
1562   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1563 #endif // ASSERT
1564 
1565   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1566   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1567 
1568   // push java thread (becomes first argument of C function)
1569 
1570   NOT_LP64(push(java_thread); number_of_arguments++);
1571   LP64_ONLY(mov(c_rarg0, r15_thread));
1572 
1573   // set last Java frame before call
1574   assert(last_java_sp != rbp, "can't use ebp/rbp");
1575 
1576   // Only interpreter should have to set fp
1577   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1578 
1579   // do the call, remove parameters
1580   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1581 
1582   // restore the thread (cannot use the pushed argument since arguments
1583   // may be overwritten by C code generated by an optimizing compiler);
1584   // however, we can use the register value directly if it is callee saved.
1585   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1586     // rdi & rsi (also r15) are callee saved -> nothing to do
1587 #ifdef ASSERT
1588     guarantee(java_thread != rax, "change this code");
1589     push(rax);
1590     { Label L;
1591       get_thread(rax);
1592       cmpptr(java_thread, rax);
1593       jcc(Assembler::equal, L);
1594       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1595       bind(L);
1596     }
1597     pop(rax);
1598 #endif
1599   } else {
1600     get_thread(java_thread);
1601   }
1602   // reset last Java frame
1603   // Only interpreter should have to clear fp
1604   reset_last_Java_frame(java_thread, true);
1605 
1606   // C++ interp handles this in the interpreter
1607   check_and_handle_popframe(java_thread);
1608   check_and_handle_earlyret(java_thread);
1609 
1610   if (check_exceptions) {
1611     // check for pending exceptions (java_thread is set upon return)
1612     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1613 #ifndef _LP64
1614     jump_cc(Assembler::notEqual,
1615             RuntimeAddress(StubRoutines::forward_exception_entry()));
1616 #else
1617     // This used to be a conditional jump to forward_exception; however, if the
1618     // code is relocated the conditional branch might not reach, so instead we
1619     // jump around an unconditional jump that can always reach the target.
1620 
1621     Label ok;
1622     jcc(Assembler::equal, ok);
1623     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1624     bind(ok);
1625 #endif // LP64
1626   }
1627 
1628   // get oop result if there is one and reset the value in the thread
1629   if (oop_result->is_valid()) {
1630     get_vm_result(oop_result, java_thread);
1631   }
1632 }
1633 
1634 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1635 
1636   // Calculate the value for last_Java_sp
1637   // somewhat subtle. call_VM does an intermediate call
1638   // which places a return address on the stack just under the
1639   // stack pointer as the user finished with it. This allows
1640   // us to retrieve last_Java_pc from last_Java_sp[-1].
1641   // On 32-bit we then have to push additional args on the stack to accomplish
1642   // the actual requested call. On 64-bit call_VM can only use register args,
1643   // so the only extra space is the return address that call_VM created.
1644   // This hopefully explains the calculations here.
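  //
  // Illustrative values (a sketch, not emitted code), with n = number_of_arguments:
  //   64-bit: last_Java_sp = rsp + wordSize            (skips only the return address
  //                                                     pushed by the intermediate call)
  //   32-bit: last_Java_sp = rsp + (1 + n) * wordSize  (additionally skips the n argument
  //                                                     words pushed before that call)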
1645 
1646 #ifdef _LP64
1647   // We've pushed one address, correct last_Java_sp
1648   lea(rax, Address(rsp, wordSize));
1649 #else
1650   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1651 #endif // LP64
1652 
1653   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1654 
1655 }
1656 
1657 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1658 void MacroAssembler::call_VM_leaf0(address entry_point) {
1659   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1660 }
1661 
1662 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1663   call_VM_leaf_base(entry_point, number_of_arguments);
1664 }
1665 
1666 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1667   pass_arg0(this, arg_0);
1668   call_VM_leaf(entry_point, 1);
1669 }
1670 
1671 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1672 
1673   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1674   pass_arg1(this, arg_1);
1675   pass_arg0(this, arg_0);
1676   call_VM_leaf(entry_point, 2);
1677 }
1678 
1679 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1680   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1681   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1682   pass_arg2(this, arg_2);
1683   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1684   pass_arg1(this, arg_1);
1685   pass_arg0(this, arg_0);
1686   call_VM_leaf(entry_point, 3);
1687 }
1688 
1689 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1690   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1691   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1692   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1693   pass_arg3(this, arg_3);
1694   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1695   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1696   pass_arg2(this, arg_2);
1697   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1698   pass_arg1(this, arg_1);
1699   pass_arg0(this, arg_0);
1700   call_VM_leaf(entry_point, 4);
1701 }
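// Note on the argument marshalling above (and in the other call_VM/call_VM_leaf
// variants): arguments are passed from last to first, so each pass_argN fills a
// C argument register (or pushes on the stack, on 32-bit) before any earlier
// argument is moved; the LP64 "smashed arg" asserts check that no still-pending
// source register is about to be overwritten by a later argument's move.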
1702 
1703 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1704   pass_arg0(this, arg_0);
1705   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1706 }
1707 
1708 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1709 
1710   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1711   pass_arg1(this, arg_1);
1712   pass_arg0(this, arg_0);
1713   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1714 }
1715 
1716 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1717   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1718   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1719   pass_arg2(this, arg_2);
1720   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1721   pass_arg1(this, arg_1);
1722   pass_arg0(this, arg_0);
1723   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1724 }
1725 
1726 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1727   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1728   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1729   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1730   pass_arg3(this, arg_3);
1731   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1732   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1733   pass_arg2(this, arg_2);
1734   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1735   pass_arg1(this, arg_1);
1736   pass_arg0(this, arg_0);
1737   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1738 }
1739 
1740 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1741   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1742   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1743   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1744 }
1745 
1746 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1747   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1748   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1749 }
1750 
1751 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1752 }
1753 
1754 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1755 }
1756 
1757 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1758   if (reachable(src1)) {
1759     cmpl(as_Address(src1), imm);
1760   } else {
1761     lea(rscratch1, src1);
1762     cmpl(Address(rscratch1, 0), imm);
1763   }
1764 }
1765 
1766 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1767   assert(!src2.is_lval(), "use cmpptr");
1768   if (reachable(src2)) {
1769     cmpl(src1, as_Address(src2));
1770   } else {
1771     lea(rscratch1, src2);
1772     cmpl(src1, Address(rscratch1, 0));
1773   }
1774 }
1775 
1776 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1777   Assembler::cmpl(src1, imm);
1778 }
1779 
1780 void MacroAssembler::cmp32(Register src1, Address src2) {
1781   Assembler::cmpl(src1, src2);
1782 }
1783 
1784 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1785   ucomisd(opr1, opr2);
1786 
1787   Label L;
1788   if (unordered_is_less) {
1789     movl(dst, -1);
1790     jcc(Assembler::parity, L);
1791     jcc(Assembler::below , L);
1792     movl(dst, 0);
1793     jcc(Assembler::equal , L);
1794     increment(dst);
1795   } else { // unordered is greater
1796     movl(dst, 1);
1797     jcc(Assembler::parity, L);
1798     jcc(Assembler::above , L);
1799     movl(dst, 0);
1800     jcc(Assembler::equal , L);
1801     decrementl(dst);
1802   }
1803   bind(L);
1804 }
1805 
1806 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1807   ucomiss(opr1, opr2);
1808 
1809   Label L;
1810   if (unordered_is_less) {
1811     movl(dst, -1);
1812     jcc(Assembler::parity, L);
1813     jcc(Assembler::below , L);
1814     movl(dst, 0);
1815     jcc(Assembler::equal , L);
1816     increment(dst);
1817   } else { // unordered is greater
1818     movl(dst, 1);
1819     jcc(Assembler::parity, L);
1820     jcc(Assembler::above , L);
1821     movl(dst, 0);
1822     jcc(Assembler::equal , L);
1823     decrementl(dst);
1824   }
1825   bind(L);
1826 }
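// The two helpers above produce the three-way result (-1, 0, +1) used by Java's
// fcmpl/fcmpg and dcmpl/dcmpg style comparisons (assuming that is the intended
// caller): ucomis* raises PF for an unordered (NaN) operand, so the jcc(parity)
// paths make NaN compare as "less" when unordered_is_less and as "greater" otherwise.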
1827 
1828 
1829 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1830   if (reachable(src1)) {
1831     cmpb(as_Address(src1), imm);
1832   } else {
1833     lea(rscratch1, src1);
1834     cmpb(Address(rscratch1, 0), imm);
1835   }
1836 }
1837 
1838 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1839 #ifdef _LP64
1840   if (src2.is_lval()) {
1841     movptr(rscratch1, src2);
1842     Assembler::cmpq(src1, rscratch1);
1843   } else if (reachable(src2)) {
1844     cmpq(src1, as_Address(src2));
1845   } else {
1846     lea(rscratch1, src2);
1847     Assembler::cmpq(src1, Address(rscratch1, 0));
1848   }
1849 #else
1850   if (src2.is_lval()) {
1851     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1852   } else {
1853     cmpl(src1, as_Address(src2));
1854   }
1855 #endif // _LP64
1856 }
1857 
1858 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1859   assert(src2.is_lval(), "not a mem-mem compare");
1860 #ifdef _LP64
1861   // moves src2's literal address
1862   movptr(rscratch1, src2);
1863   Assembler::cmpq(src1, rscratch1);
1864 #else
1865   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1866 #endif // _LP64
1867 }
1868 
1869 void MacroAssembler::cmpoop(Register src1, Register src2) {
1870   cmpptr(src1, src2);
1871 }
1872 
1873 void MacroAssembler::cmpoop(Register src1, Address src2) {
1874   cmpptr(src1, src2);
1875 }
1876 
1877 #ifdef _LP64
1878 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1879   movoop(rscratch1, src2);
1880   cmpptr(src1, rscratch1);
1881 }
1882 #endif
1883 
1884 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1885   if (reachable(adr)) {
1886     lock();
1887     cmpxchgptr(reg, as_Address(adr));
1888   } else {
1889     lea(rscratch1, adr);
1890     lock();
1891     cmpxchgptr(reg, Address(rscratch1, 0));
1892   }
1893 }
1894 
1895 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1896   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1897 }
1898 
1899 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1900   if (reachable(src)) {
1901     Assembler::comisd(dst, as_Address(src));
1902   } else {
1903     lea(rscratch1, src);
1904     Assembler::comisd(dst, Address(rscratch1, 0));
1905   }
1906 }
1907 
1908 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1909   if (reachable(src)) {
1910     Assembler::comiss(dst, as_Address(src));
1911   } else {
1912     lea(rscratch1, src);
1913     Assembler::comiss(dst, Address(rscratch1, 0));
1914   }
1915 }
1916 
1917 
1918 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1919   Condition negated_cond = negate_condition(cond);
1920   Label L;
1921   jcc(negated_cond, L);
1922   pushf(); // Preserve flags
1923   atomic_incl(counter_addr);
1924   popf();
1925   bind(L);
1926 }
1927 
1928 int MacroAssembler::corrected_idivl(Register reg) {
1929   // Full implementation of Java idiv and irem; checks for
1930   // special case as described in JVM spec., p.243 & p.271.
1931   // The function returns the (pc) offset of the idivl
1932   // instruction - may be needed for implicit exceptions.
1933   //
1934   //         normal case                           special case
1935   //
1936   // input : rax: dividend                          min_int
1937   //         reg: divisor   (may not be rax/rdx)    -1
1938   //
1939   // output: rax: quotient  (= rax idiv reg)        min_int
1940   //         rdx: remainder (= rax irem reg)        0
1941   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1942   const int min_int = 0x80000000;
1943   Label normal_case, special_case;
1944 
1945   // check for special case
1946   cmpl(rax, min_int);
1947   jcc(Assembler::notEqual, normal_case);
1948   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1949   cmpl(reg, -1);
1950   jcc(Assembler::equal, special_case);
1951 
1952   // handle normal case
1953   bind(normal_case);
1954   cdql();
1955   int idivl_offset = offset();
1956   idivl(reg);
1957 
1958   // normal and special case exit
1959   bind(special_case);
1960 
1961   return idivl_offset;
1962 }
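// Worked example of the special case handled above (a sketch, not generated code):
//   rax = 0x80000000 (min_int), reg = -1
// A plain idivl would fault (#DE) because the true quotient 2^31 does not fit in
// 32 bits; the code above branches past the cdql/idivl and reaches special_case
// with rax = min_int (quotient) and rdx = 0 (remainder), as the JVM spec requires
// for idiv/irem.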
1963 
1964 
1965 
1966 void MacroAssembler::decrementl(Register reg, int value) {
1967   if (value == min_jint) {subl(reg, value) ; return; }
1968   if (value <  0) { incrementl(reg, -value); return; }
1969   if (value == 0) {                        ; return; }
1970   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1971   /* else */      { subl(reg, value)       ; return; }
1972 }
1973 
1974 void MacroAssembler::decrementl(Address dst, int value) {
1975   if (value == min_jint) {subl(dst, value) ; return; }
1976   if (value <  0) { incrementl(dst, -value); return; }
1977   if (value == 0) {                        ; return; }
1978   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1979   /* else */      { subl(dst, value)       ; return; }
1980 }
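// In the two routines above, value == min_jint is tested first because the
// "value < 0" path would negate it, and -min_jint overflows a 32-bit int;
// subl with min_jint itself already has the intended (wrapping) effect.
// The incrementl(Register/Address, int) overloads further down mirror this with addl.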
1981 
1982 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1983   assert (shift_value > 0, "illegal shift value");
1984   Label _is_positive;
1985   testl (reg, reg);
1986   jcc (Assembler::positive, _is_positive);
1987   int offset = (1 << shift_value) - 1 ;
1988 
1989   if (offset == 1) {
1990     incrementl(reg);
1991   } else {
1992     addl(reg, offset);
1993   }
1994 
1995   bind (_is_positive);
1996   sarl(reg, shift_value);
1997 }
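// Worked example (a sketch): reg = -7, shift_value = 2. The bias offset is 3,
// so reg becomes -4 and the arithmetic shift yields -1, matching Java's
// truncating division -7 / 4 == -1; without the bias, sarl alone would give -2
// (a floor). Non-negative inputs skip the bias and are shifted directly.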
1998 
1999 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2000   if (reachable(src)) {
2001     Assembler::divsd(dst, as_Address(src));
2002   } else {
2003     lea(rscratch1, src);
2004     Assembler::divsd(dst, Address(rscratch1, 0));
2005   }
2006 }
2007 
2008 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2009   if (reachable(src)) {
2010     Assembler::divss(dst, as_Address(src));
2011   } else {
2012     lea(rscratch1, src);
2013     Assembler::divss(dst, Address(rscratch1, 0));
2014   }
2015 }
2016 
2017 void MacroAssembler::enter() {
2018   push(rbp);
2019   mov(rbp, rsp);
2020 }
2021 
2022 void MacroAssembler::post_call_nop() {
2023   emit_int8((int8_t)0x0f);
2024   emit_int8((int8_t)0x1f);
2025   emit_int8((int8_t)0x84);
2026   emit_int8((int8_t)0x00);
2027   emit_int32(0x00);
2028 }
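// The bytes emitted above form a single 8-byte NOP, 0F 1F 84 00 <imm32>
// ("nopl 0x0(%rax,%rax,1)"); the trailing emit_int32 is its 32-bit displacement,
// currently zero, which leaves a patchable 4-byte field right after the call site
// (presumably the reason the long NOP form is used here).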
2029 
2030 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2031 void MacroAssembler::fat_nop() {
2032   if (UseAddressNop) {
2033     addr_nop_5();
2034   } else {
2035     emit_int8((int8_t)0x26); // es:
2036     emit_int8((int8_t)0x2e); // cs:
2037     emit_int8((int8_t)0x64); // fs:
2038     emit_int8((int8_t)0x65); // gs:
2039     emit_int8((int8_t)0x90);
2040   }
2041 }
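// A note on the fallback above: the four segment-override prefixes plus 0x90
// decode as one 5-byte instruction, the idea being that patching it (see
// patch_verified_entry) never splits a smaller instruction boundary
// (a sketch of the rationale, not a description of the patching code itself).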
2042 
2043 #ifndef _LP64
2044 void MacroAssembler::fcmp(Register tmp) {
2045   fcmp(tmp, 1, true, true);
2046 }
2047 
2048 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2049   assert(!pop_right || pop_left, "usage error");
2050   if (VM_Version::supports_cmov()) {
2051     assert(tmp == noreg, "unneeded temp");
2052     if (pop_left) {
2053       fucomip(index);
2054     } else {
2055       fucomi(index);
2056     }
2057     if (pop_right) {
2058       fpop();
2059     }
2060   } else {
2061     assert(tmp != noreg, "need temp");
2062     if (pop_left) {
2063       if (pop_right) {
2064         fcompp();
2065       } else {
2066         fcomp(index);
2067       }
2068     } else {
2069       fcom(index);
2070     }
2071     // convert FPU condition into eflags condition via rax,
2072     save_rax(tmp);
2073     fwait(); fnstsw_ax();
2074     sahf();
2075     restore_rax(tmp);
2076   }
2077   // condition codes set as follows:
2078   //
2079   // CF (corresponds to C0) if x < y
2080   // PF (corresponds to C2) if unordered
2081   // ZF (corresponds to C3) if x = y
2082 }
2083 
2084 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2085   fcmp2int(dst, unordered_is_less, 1, true, true);
2086 }
2087 
2088 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2089   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2090   Label L;
2091   if (unordered_is_less) {
2092     movl(dst, -1);
2093     jcc(Assembler::parity, L);
2094     jcc(Assembler::below , L);
2095     movl(dst, 0);
2096     jcc(Assembler::equal , L);
2097     increment(dst);
2098   } else { // unordered is greater
2099     movl(dst, 1);
2100     jcc(Assembler::parity, L);
2101     jcc(Assembler::above , L);
2102     movl(dst, 0);
2103     jcc(Assembler::equal , L);
2104     decrementl(dst);
2105   }
2106   bind(L);
2107 }
2108 
2109 void MacroAssembler::fld_d(AddressLiteral src) {
2110   fld_d(as_Address(src));
2111 }
2112 
2113 void MacroAssembler::fld_s(AddressLiteral src) {
2114   fld_s(as_Address(src));
2115 }
2116 
2117 void MacroAssembler::fldcw(AddressLiteral src) {
2118   Assembler::fldcw(as_Address(src));
2119 }
2120 
2121 void MacroAssembler::fpop() {
2122   ffree();
2123   fincstp();
2124 }
2125 
2126 void MacroAssembler::fremr(Register tmp) {
2127   save_rax(tmp);
2128   { Label L;
2129     bind(L);
2130     fprem();
2131     fwait(); fnstsw_ax();
2132     sahf();
2133     jcc(Assembler::parity, L);
2134   }
2135   restore_rax(tmp);
2136   // Result is in ST0.
2137   // Note: fxch & fpop to get rid of ST1
2138   // (otherwise FPU stack could overflow eventually)
2139   fxch(1);
2140   fpop();
2141 }
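// A note on the loop above (a sketch of the rationale): fprem produces only a
// partial remainder and sets C2 while more reduction is needed; fnstsw_ax/sahf
// map C2 to PF (as in fcmp above), so jcc(parity, L) iterates until the remainder
// is complete. The fxch/fpop then discard the divisor left in ST1.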
2142 
2143 void MacroAssembler::empty_FPU_stack() {
2144   if (VM_Version::supports_mmx()) {
2145     emms();
2146   } else {
2147     for (int i = 8; i-- > 0; ) ffree(i);
2148   }
2149 }
2150 #endif // !LP64
2151 
2152 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2153   if (reachable(src)) {
2154     Assembler::mulpd(dst, as_Address(src));
2155   } else {
2156     lea(rscratch1, src);
2157     Assembler::mulpd(dst, Address(rscratch1, 0));
2158   }
2159 }
2160 
2161 void MacroAssembler::load_float(Address src) {
2162 #ifdef _LP64
2163   movflt(xmm0, src);
2164 #else
2165   if (UseSSE >= 1) {
2166     movflt(xmm0, src);
2167   } else {
2168     fld_s(src);
2169   }
2170 #endif // LP64
2171 }
2172 
2173 void MacroAssembler::store_float(Address dst) {
2174 #ifdef _LP64
2175   movflt(dst, xmm0);
2176 #else
2177   if (UseSSE >= 1) {
2178     movflt(dst, xmm0);
2179   } else {
2180     fstp_s(dst);
2181   }
2182 #endif // LP64
2183 }
2184 
2185 void MacroAssembler::load_double(Address src) {
2186 #ifdef _LP64
2187   movdbl(xmm0, src);
2188 #else
2189   if (UseSSE >= 2) {
2190     movdbl(xmm0, src);
2191   } else {
2192     fld_d(src);
2193   }
2194 #endif // LP64
2195 }
2196 
2197 void MacroAssembler::store_double(Address dst) {
2198 #ifdef _LP64
2199   movdbl(dst, xmm0);
2200 #else
2201   if (UseSSE >= 2) {
2202     movdbl(dst, xmm0);
2203   } else {
2204     fstp_d(dst);
2205   }
2206 #endif // LP64
2207 }
2208 
2209 // dst = c = a * b + c
2210 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2211   Assembler::vfmadd231sd(c, a, b);
2212   if (dst != c) {
2213     movdbl(dst, c);
2214   }
2215 }
2216 
2217 // dst = c = a * b + c
2218 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2219   Assembler::vfmadd231ss(c, a, b);
2220   if (dst != c) {
2221     movflt(dst, c);
2222   }
2223 }
2224 
2225 // dst = c = a * b + c
2226 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2227   Assembler::vfmadd231pd(c, a, b, vector_len);
2228   if (dst != c) {
2229     vmovdqu(dst, c);
2230   }
2231 }
2232 
2233 // dst = c = a * b + c
2234 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2235   Assembler::vfmadd231ps(c, a, b, vector_len);
2236   if (dst != c) {
2237     vmovdqu(dst, c);
2238   }
2239 }
2240 
2241 // dst = c = a * b + c
2242 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2243   Assembler::vfmadd231pd(c, a, b, vector_len);
2244   if (dst != c) {
2245     vmovdqu(dst, c);
2246   }
2247 }
2248 
2249 // dst = c = a * b + c
2250 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2251   Assembler::vfmadd231ps(c, a, b, vector_len);
2252   if (dst != c) {
2253     vmovdqu(dst, c);
2254   }
2255 }
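// A note on the FMA wrappers above: vfmadd231{sd,ss,pd,ps} compute c = a * b + c
// in place, overwriting their first operand, so when the caller asked for a
// different destination the result is copied out afterwards via movdbl, movflt or
// vmovdqu — hence the "if (dst != c)" moves.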
2256 
2257 void MacroAssembler::incrementl(AddressLiteral dst) {
2258   if (reachable(dst)) {
2259     incrementl(as_Address(dst));
2260   } else {
2261     lea(rscratch1, dst);
2262     incrementl(Address(rscratch1, 0));
2263   }
2264 }
2265 
2266 void MacroAssembler::incrementl(ArrayAddress dst) {
2267   incrementl(as_Address(dst));
2268 }
2269 
2270 void MacroAssembler::incrementl(Register reg, int value) {
2271   if (value == min_jint) {addl(reg, value) ; return; }
2272   if (value <  0) { decrementl(reg, -value); return; }
2273   if (value == 0) {                        ; return; }
2274   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2275   /* else */      { addl(reg, value)       ; return; }
2276 }
2277 
2278 void MacroAssembler::incrementl(Address dst, int value) {
2279   if (value == min_jint) {addl(dst, value) ; return; }
2280   if (value <  0) { decrementl(dst, -value); return; }
2281   if (value == 0) {                        ; return; }
2282   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2283   /* else */      { addl(dst, value)       ; return; }
2284 }
2285 
2286 void MacroAssembler::jump(AddressLiteral dst) {
2287   if (reachable(dst)) {
2288     jmp_literal(dst.target(), dst.rspec());
2289   } else {
2290     lea(rscratch1, dst);
2291     jmp(rscratch1);
2292   }
2293 }
2294 
2295 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2296   if (reachable(dst)) {
2297     InstructionMark im(this);
2298     relocate(dst.reloc());
2299     const int short_size = 2;
2300     const int long_size = 6;
2301     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2302     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2303       // 0111 tttn #8-bit disp
2304       emit_int8(0x70 | cc);
2305       emit_int8((offs - short_size) & 0xFF);
2306     } else {
2307       // 0000 1111 1000 tttn #32-bit disp
2308       emit_int8(0x0F);
2309       emit_int8((unsigned char)(0x80 | cc));
2310       emit_int32(offs - long_size);
2311     }
2312   } else {
2313 #ifdef ASSERT
2314     warning("reversing conditional branch");
2315 #endif /* ASSERT */
2316     Label skip;
2317     jccb(reverse[cc], skip);
2318     lea(rscratch1, dst);
2319     Assembler::jmp(rscratch1);
2320     bind(skip);
2321   }
2322 }
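// Encoding note for jump_cc above: when the target is reachable a pc-relative jcc
// is emitted directly — the 2-byte short form (0x7<cc> disp8) if the offset fits in
// 8 bits and there is no relocation, else the 6-byte near form (0F 8<cc> disp32).
// Otherwise the condition is reversed (via the reverse[] table) to hop over a
// register-indirect jmp through rscratch1, which can always reach.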
2323 
2324 void MacroAssembler::fld_x(AddressLiteral src) {
2325   Assembler::fld_x(as_Address(src));
2326 }
2327 
2328 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2329   if (reachable(src)) {
2330     Assembler::ldmxcsr(as_Address(src));
2331   } else {
2332     lea(rscratch1, src);
2333     Assembler::ldmxcsr(Address(rscratch1, 0));
2334   }
2335 }
2336 
2337 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2338   int off;
2339   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2340     off = offset();
2341     movsbl(dst, src); // movsxb
2342   } else {
2343     off = load_unsigned_byte(dst, src);
2344     shll(dst, 24);
2345     sarl(dst, 24);
2346   }
2347   return off;
2348 }
2349 
2350 // Note: load_signed_short used to be called load_signed_word.
2351 // Although the 'w' in x86 opcode names refers to the 16-bit "word" of the
2352 // assembler manual, that usage is found nowhere in HotSpot code.
2353 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2354 int MacroAssembler::load_signed_short(Register dst, Address src) {
2355   int off;
2356   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2357     // Note: a sign extension from 16 to 64 bits would seem equally safe here,
2358     // but this is what the 64-bit code has always done, which implies that
2359     // callers rely only on the low 32 bits of the result.
2360     off = offset();
2361     movswl(dst, src); // movsxw
2362   } else {
2363     off = load_unsigned_short(dst, src);
2364     shll(dst, 16);
2365     sarl(dst, 16);
2366   }
2367   return off;
2368 }
2369 
2370 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2371   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
2372   // and "3.9 Partial Register Penalties" (p. 22).
2373   int off;
2374   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2375     off = offset();
2376     movzbl(dst, src); // movzxb
2377   } else {
2378     xorl(dst, dst);
2379     off = offset();
2380     movb(dst, src);
2381   }
2382   return off;
2383 }
2384 
2385 // Note: load_unsigned_short used to be called load_unsigned_word.
2386 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2387   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
2388   // and "3.9 Partial Register Penalties" (p. 22).
2389   int off;
2390   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2391     off = offset();
2392     movzwl(dst, src); // movzxw
2393   } else {
2394     xorl(dst, dst);
2395     off = offset();
2396     movw(dst, src);
2397   }
2398   return off;
2399 }
2400 
2401 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2402   switch (size_in_bytes) {
2403 #ifndef _LP64
2404   case  8:
2405     assert(dst2 != noreg, "second dest register required");
2406     movl(dst,  src);
2407     movl(dst2, src.plus_disp(BytesPerInt));
2408     break;
2409 #else
2410   case  8:  movq(dst, src); break;
2411 #endif
2412   case  4:  movl(dst, src); break;
2413   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2414   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2415   default:  ShouldNotReachHere();
2416   }
2417 }
2418 
2419 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2420   switch (size_in_bytes) {
2421 #ifndef _LP64
2422   case  8:
2423     assert(src2 != noreg, "second source register required");
2424     movl(dst,                        src);
2425     movl(dst.plus_disp(BytesPerInt), src2);
2426     break;
2427 #else
2428   case  8:  movq(dst, src); break;
2429 #endif
2430   case  4:  movl(dst, src); break;
2431   case  2:  movw(dst, src); break;
2432   case  1:  movb(dst, src); break;
2433   default:  ShouldNotReachHere();
2434   }
2435 }
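// Usage sketch for the two sized accessors above (hypothetical caller, object
// register and offsets — not code from this file):
//
//   __ load_sized_value(rax, Address(robj, field_off), 2, /*is_signed*/ true);  // emits movswl
//   __ store_sized_value(Address(robj, flag_off), rbx, 1);                      // emits movb
//
// On 32-bit, an 8-byte access additionally requires dst2/src2 for the high word.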
2436 
2437 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2438   if (reachable(dst)) {
2439     movl(as_Address(dst), src);
2440   } else {
2441     lea(rscratch1, dst);
2442     movl(Address(rscratch1, 0), src);
2443   }
2444 }
2445 
2446 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2447   if (reachable(src)) {
2448     movl(dst, as_Address(src));
2449   } else {
2450     lea(rscratch1, src);
2451     movl(dst, Address(rscratch1, 0));
2452   }
2453 }
2454 
2455 // C++ bool manipulation
2456 
2457 void MacroAssembler::movbool(Register dst, Address src) {
2458   if(sizeof(bool) == 1)
2459     movb(dst, src);
2460   else if(sizeof(bool) == 2)
2461     movw(dst, src);
2462   else if(sizeof(bool) == 4)
2463     movl(dst, src);
2464   else
2465     // unsupported
2466     ShouldNotReachHere();
2467 }
2468 
2469 void MacroAssembler::movbool(Address dst, bool boolconst) {
2470   if(sizeof(bool) == 1)
2471     movb(dst, (int) boolconst);
2472   else if(sizeof(bool) == 2)
2473     movw(dst, (int) boolconst);
2474   else if(sizeof(bool) == 4)
2475     movl(dst, (int) boolconst);
2476   else
2477     // unsupported
2478     ShouldNotReachHere();
2479 }
2480 
2481 void MacroAssembler::movbool(Address dst, Register src) {
2482   if(sizeof(bool) == 1)
2483     movb(dst, src);
2484   else if(sizeof(bool) == 2)
2485     movw(dst, src);
2486   else if(sizeof(bool) == 4)
2487     movl(dst, src);
2488   else
2489     // unsupported
2490     ShouldNotReachHere();
2491 }
2492 
2493 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2494   movb(as_Address(dst), src);
2495 }
2496 
2497 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2498   if (reachable(src)) {
2499     movdl(dst, as_Address(src));
2500   } else {
2501     lea(rscratch1, src);
2502     movdl(dst, Address(rscratch1, 0));
2503   }
2504 }
2505 
2506 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2507   if (reachable(src)) {
2508     movq(dst, as_Address(src));
2509   } else {
2510     lea(rscratch1, src);
2511     movq(dst, Address(rscratch1, 0));
2512   }
2513 }
2514 
2515 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2516   if (reachable(src)) {
2517     if (UseXmmLoadAndClearUpper) {
2518       movsd (dst, as_Address(src));
2519     } else {
2520       movlpd(dst, as_Address(src));
2521     }
2522   } else {
2523     lea(rscratch1, src);
2524     if (UseXmmLoadAndClearUpper) {
2525       movsd (dst, Address(rscratch1, 0));
2526     } else {
2527       movlpd(dst, Address(rscratch1, 0));
2528     }
2529   }
2530 }
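// A note on the UseXmmLoadAndClearUpper choice above: movsd zeroes bits 64..127 of
// dst, breaking any dependence on the register's previous contents, while movlpd
// merges into the existing upper half; the flag selects whichever form the current
// CPU executes faster.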
2531 
2532 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2533   if (reachable(src)) {
2534     movss(dst, as_Address(src));
2535   } else {
2536     lea(rscratch1, src);
2537     movss(dst, Address(rscratch1, 0));
2538   }
2539 }
2540 
2541 void MacroAssembler::movptr(Register dst, Register src) {
2542   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2543 }
2544 
2545 void MacroAssembler::movptr(Register dst, Address src) {
2546   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2547 }
2548 
2549 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2550 void MacroAssembler::movptr(Register dst, intptr_t src) {
2551   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2552 }
2553 
2554 void MacroAssembler::movptr(Address dst, Register src) {
2555   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2556 }
2557 
2558 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2559   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2560   Assembler::movdqu(dst, src);
2561 }
2562 
2563 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2564   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2565   Assembler::movdqu(dst, src);
2566 }
2567 
2568 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2569   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2570   Assembler::movdqu(dst, src);
2571 }
2572 
2573 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2574   if (reachable(src)) {
2575     movdqu(dst, as_Address(src));
2576   } else {
2577     lea(scratchReg, src);
2578     movdqu(dst, Address(scratchReg, 0));
2579   }
2580 }
2581 
2582 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2583   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2584   Assembler::vmovdqu(dst, src);
2585 }
2586 
2587 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2588   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2589   Assembler::vmovdqu(dst, src);
2590 }
2591 
2592 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2593   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2594   Assembler::vmovdqu(dst, src);
2595 }
2596 
2597 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2598   if (reachable(src)) {
2599     vmovdqu(dst, as_Address(src));
2600   }
2601   else {
2602     lea(scratch_reg, src);
2603     vmovdqu(dst, Address(scratch_reg, 0));
2604   }
2605 }
2606 
2607 void MacroAssembler::kmov(KRegister dst, Address src) {
2608   if (VM_Version::supports_avx512bw()) {
2609     kmovql(dst, src);
2610   } else {
2611     assert(VM_Version::supports_evex(), "");
2612     kmovwl(dst, src);
2613   }
2614 }
2615 
2616 void MacroAssembler::kmov(Address dst, KRegister src) {
2617   if (VM_Version::supports_avx512bw()) {
2618     kmovql(dst, src);
2619   } else {
2620     assert(VM_Version::supports_evex(), "");
2621     kmovwl(dst, src);
2622   }
2623 }
2624 
2625 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2626   if (VM_Version::supports_avx512bw()) {
2627     kmovql(dst, src);
2628   } else {
2629     assert(VM_Version::supports_evex(), "");
2630     kmovwl(dst, src);
2631   }
2632 }
2633 
2634 void MacroAssembler::kmov(Register dst, KRegister src) {
2635   if (VM_Version::supports_avx512bw()) {
2636     kmovql(dst, src);
2637   } else {
2638     assert(VM_Version::supports_evex(), "");
2639     kmovwl(dst, src);
2640   }
2641 }
2642 
2643 void MacroAssembler::kmov(KRegister dst, Register src) {
2644   if (VM_Version::supports_avx512bw()) {
2645     kmovql(dst, src);
2646   } else {
2647     assert(VM_Version::supports_evex(), "");
2648     kmovwl(dst, src);
2649   }
2650 }
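// Rationale for the kmov dispatch above (a sketch): kmovql, which moves all 64
// opmask bits, is only encodable with AVX512BW; without BW the vectors in use have
// at most 16 lanes, so kmovwl's 16 bits (available with plain EVEX/AVX512F) suffice.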
2651 
2652 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2653   if (reachable(src)) {
2654     kmovql(dst, as_Address(src));
2655   } else {
2656     lea(scratch_reg, src);
2657     kmovql(dst, Address(scratch_reg, 0));
2658   }
2659 }
2660 
2661 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2662   if (reachable(src)) {
2663     kmovwl(dst, as_Address(src));
2664   } else {
2665     lea(scratch_reg, src);
2666     kmovwl(dst, Address(scratch_reg, 0));
2667   }
2668 }
2669 
2670 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2671                                int vector_len, Register scratch_reg) {
2672   if (reachable(src)) {
2673     if (mask == k0) {
2674       Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2675     } else {
2676       Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2677     }
2678   } else {
2679     lea(scratch_reg, src);
2680     if (mask == k0) {
2681       Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2682     } else {
2683       Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2684     }
2685   }
2686 }
2687 
2688 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2689                                int vector_len, Register scratch_reg) {
2690   if (reachable(src)) {
2691     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2692   } else {
2693     lea(scratch_reg, src);
2694     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2695   }
2696 }
2697 
2698 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2699                                int vector_len, Register scratch_reg) {
2700   if (reachable(src)) {
2701     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2702   } else {
2703     lea(scratch_reg, src);
2704     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2705   }
2706 }
2707 
2708 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2709                                int vector_len, Register scratch_reg) {
2710   if (reachable(src)) {
2711     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2712   } else {
2713     lea(scratch_reg, src);
2714     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2715   }
2716 }
2717 
2718 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2719   if (reachable(src)) {
2720     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2721   } else {
2722     lea(rscratch, src);
2723     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2724   }
2725 }
2726 
2727 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2728   if (reachable(src)) {
2729     Assembler::movdqa(dst, as_Address(src));
2730   } else {
2731     lea(rscratch1, src);
2732     Assembler::movdqa(dst, Address(rscratch1, 0));
2733   }
2734 }
2735 
2736 #ifdef _LP64
2737   // Move Aligned, possibly non-temporal
2738   void MacroAssembler::movqa(Address dst, Register src, bool nt) {
2739     if (nt) {
2740       Assembler::movntq(dst, src);
2741     } else {
2742       Assembler::movq(dst, src);
2743     }
2744   }
2745 
2746   void MacroAssembler::movdqa(Address dst, XMMRegister src, bool nt) {
2747     if (nt) {
2748       Assembler::movntdq(dst, src);
2749     } else {
2750       Assembler::movdqu(dst, src);
2751     }
2752   }
2753   void MacroAssembler::vmovdqa(Address dst, XMMRegister src, bool nt) {
2754     if (nt) {
2755       Assembler::vmovntdq(dst, src);
2756     } else {
2757       Assembler::vmovdqu(dst, src);
2758     }
2759   }
2760   void MacroAssembler::evmovdqa(Address dst, XMMRegister src, int vector_len, bool nt) {
2761     if (nt) {
2762       Assembler::evmovntdq(dst, src, vector_len);
2763     } else {
2764       Assembler::evmovdqal(dst, src, vector_len);
2765     }
2766   }
2767 
2768   void MacroAssembler::movdqa(XMMRegister dst, Address src, bool nt) {
2769     if (nt) {
2770       Assembler::movntdqa(dst, src);
2771     } else {
2772       Assembler::movdqu(dst, src); // use unaligned load
2773     }
2774   }
2775   void MacroAssembler::vmovdqa(XMMRegister dst, Address src, bool nt) {
2776     if (nt) {
2777       Assembler::vmovntdqa(dst, src);
2778     } else {
2779       Assembler::vmovdqu(dst, src); // use unaligned load
2780     }
2781   }
2782   void MacroAssembler::evmovdqa(XMMRegister dst, Address src, int vector_len, bool nt) {
2783     if (nt) {
2784       Assembler::evmovntdqa(dst, src, vector_len);
2785     } else {
2786       Assembler::evmovdqul(dst, src, vector_len); // use unaligned load
2787     }
2788   }
2789 #endif
2790 
2791 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2792   if (reachable(src)) {
2793     Assembler::movsd(dst, as_Address(src));
2794   } else {
2795     lea(rscratch1, src);
2796     Assembler::movsd(dst, Address(rscratch1, 0));
2797   }
2798 }
2799 
2800 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2801   if (reachable(src)) {
2802     Assembler::movss(dst, as_Address(src));
2803   } else {
2804     lea(rscratch1, src);
2805     Assembler::movss(dst, Address(rscratch1, 0));
2806   }
2807 }
2808 
2809 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2810   if (reachable(src)) {
2811     Assembler::mulsd(dst, as_Address(src));
2812   } else {
2813     lea(rscratch1, src);
2814     Assembler::mulsd(dst, Address(rscratch1, 0));
2815   }
2816 }
2817 
2818 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2819   if (reachable(src)) {
2820     Assembler::mulss(dst, as_Address(src));
2821   } else {
2822     lea(rscratch1, src);
2823     Assembler::mulss(dst, Address(rscratch1, 0));
2824   }
2825 }
2826 
2827 void MacroAssembler::null_check(Register reg, int offset) {
2828   if (needs_explicit_null_check(offset)) {
2829     // provoke OS NULL exception if reg = NULL by
2830     // accessing M[reg] w/o changing any (non-CC) registers
2831     // NOTE: cmpl is plenty here to provoke a segv
2832     cmpptr(rax, Address(reg, 0));
2833     // Note: should probably use testl(rax, Address(reg, 0));
2834     //       may be shorter code (however, this version of
2835     //       testl needs to be implemented first)
2836   } else {
2837     // nothing to do, (later) access of M[reg + offset]
2838     // will provoke OS NULL exception if reg = NULL
2839   }
2840 }
2841 
2842 void MacroAssembler::os_breakpoint() {
2843   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2844   // (e.g., MSVC can't call ps() otherwise)
2845   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2846 }
2847 
2848 void MacroAssembler::unimplemented(const char* what) {
2849   const char* buf = NULL;
2850   {
2851     ResourceMark rm;
2852     stringStream ss;
2853     ss.print("unimplemented: %s", what);
2854     buf = code_string(ss.as_string());
2855   }
2856   stop(buf);
2857 }
2858 
2859 #ifdef _LP64
2860 #define XSTATE_BV 0x200
2861 #endif
2862 
2863 void MacroAssembler::pop_CPU_state() {
2864   pop_FPU_state();
2865   pop_IU_state();
2866 }
2867 
2868 void MacroAssembler::pop_FPU_state() {
2869 #ifndef _LP64
2870   frstor(Address(rsp, 0));
2871 #else
2872   fxrstor(Address(rsp, 0));
2873 #endif
2874   addptr(rsp, FPUStateSizeInWords * wordSize);
2875 }
2876 
2877 void MacroAssembler::pop_IU_state() {
2878   popa();
2879   LP64_ONLY(addq(rsp, 8));
2880   popf();
2881 }
2882 
2883 // Save Integer and Float state
2884 // Warning: Stack must be 16 byte aligned (64bit)
2885 void MacroAssembler::push_CPU_state() {
2886   push_IU_state();
2887   push_FPU_state();
2888 }
2889 
2890 void MacroAssembler::push_FPU_state() {
2891   subptr(rsp, FPUStateSizeInWords * wordSize);
2892 #ifndef _LP64
2893   fnsave(Address(rsp, 0));
2894   fwait();
2895 #else
2896   fxsave(Address(rsp, 0));
2897 #endif // LP64
2898 }
2899 
2900 void MacroAssembler::push_IU_state() {
2901   // Push flags first because pusha kills them
2902   pushf();
2903   // Make sure rsp stays 16-byte aligned
2904   LP64_ONLY(subq(rsp, 8));
2905   pusha();
2906 }
2907 
2908 void MacroAssembler::push_cont_fastpath(Register java_thread) {
2909   Label done;
2910   cmpptr(rsp, Address(java_thread, JavaThread::cont_fastpath_offset()));
2911   jccb(Assembler::belowEqual, done);
2912   movptr(Address(java_thread, JavaThread::cont_fastpath_offset()), rsp);
2913   bind(done);
2914 }
2915 
2916 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
2917   Label done;
2918   cmpptr(rsp, Address(java_thread, JavaThread::cont_fastpath_offset()));
2919   jccb(Assembler::below, done);
2920   movptr(Address(java_thread, JavaThread::cont_fastpath_offset()), 0);
2921   bind(done);
2922 }
2923 
2924 void MacroAssembler::inc_held_monitor_count(Register java_thread) {
2925   incrementl(Address(java_thread, JavaThread::held_monitor_count_offset()));
2926 }
2927 
2928 void MacroAssembler::dec_held_monitor_count(Register java_thread) {
2929   decrementl(Address(java_thread, JavaThread::held_monitor_count_offset()));
2930 }
2931 
2932 void MacroAssembler::reset_held_monitor_count(Register java_thread) {
2933   movl(Address(java_thread, JavaThread::held_monitor_count_offset()), (int32_t)0);
2934 }
2935 
2936 #ifdef ASSERT
2937 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2938 #ifdef _LP64
2939   Label no_cont;
2940   movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2941   testl(cont, cont);
2942   jcc(Assembler::zero, no_cont);
2943   stop(name);
2944   bind(no_cont);
2945 #else
2946   Unimplemented();
2947 #endif
2948 }
2949 #endif
2950 
2951 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
2952   if (!java_thread->is_valid()) {
2953     java_thread = rdi;
2954     get_thread(java_thread);
2955   }
2956   // we must set sp to zero to clear frame
2957   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2958   // must clear fp, so that compiled frames are not confused; it is
2959   // possible that we need it only for debugging
2960   if (clear_fp) {
2961     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2962   }
2963   // Always clear the pc because it could have been set by make_walkable()
2964   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2965   vzeroupper();
2966 }
2967 
2968 void MacroAssembler::restore_rax(Register tmp) {
2969   if (tmp == noreg) pop(rax);
2970   else if (tmp != rax) mov(rax, tmp);
2971 }
2972 
2973 void MacroAssembler::round_to(Register reg, int modulus) {
2974   addptr(reg, modulus - 1);
2975   andptr(reg, -modulus);
2976 }
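// Worked example for round_to (assuming modulus is a power of two, as the andptr
// mask requires): reg = 13, modulus = 8 gives 13 + 7 = 20, then 20 & -8 = 16,
// i.e. reg rounded up to the next multiple of 8.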
2977 
2978 void MacroAssembler::save_rax(Register tmp) {
2979   if (tmp == noreg) push(rax);
2980   else if (tmp != rax) mov(tmp, rax);
2981 }
2982 
2983 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
2984   if (at_return) {
2985     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2986     // we may safely use rsp instead to perform the stack watermark check.
2987     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
2988     jcc(Assembler::above, slow_path);
2989     return;
2990   }
2991   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2992   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2993 }
2994 
2995 // Calls to C land
2996 //
2997 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
2998 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2999 // has to be reset to 0. This is required to allow proper stack traversal.
3000 void MacroAssembler::set_last_Java_frame(Register java_thread,
3001                                          Register last_java_sp,
3002                                          Register last_java_fp,
3003                                          address  last_java_pc) {
3004   vzeroupper();
3005   // determine java_thread register
3006   if (!java_thread->is_valid()) {
3007     java_thread = rdi;
3008     get_thread(java_thread);
3009   }
3010   // determine last_java_sp register
3011   if (!last_java_sp->is_valid()) {
3012     last_java_sp = rsp;
3013   }
3014 
3015   // last_java_fp is optional
3016 
3017   if (last_java_fp->is_valid()) {
3018     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3019   }
3020 
3021   // last_java_pc is optional
3022 
3023   if (last_java_pc != NULL) {
3024     lea(Address(java_thread,
3025                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3026         InternalAddress(last_java_pc));
3027 
3028   }
3029   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3030 }
3031 
3032 void MacroAssembler::shlptr(Register dst, int imm8) {
3033   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3034 }
3035 
3036 void MacroAssembler::shrptr(Register dst, int imm8) {
3037   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3038 }
3039 
3040 void MacroAssembler::sign_extend_byte(Register reg) {
3041   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3042     movsbl(reg, reg); // movsxb
3043   } else {
3044     shll(reg, 24);
3045     sarl(reg, 24);
3046   }
3047 }
3048 
3049 void MacroAssembler::sign_extend_short(Register reg) {
3050   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3051     movswl(reg, reg); // movsxw
3052   } else {
3053     shll(reg, 16);
3054     sarl(reg, 16);
3055   }
3056 }
3057 
3058 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3059   assert(reachable(src), "Address should be reachable");
3060   testl(dst, as_Address(src));
3061 }
3062 
3063 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3064   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3065   Assembler::pcmpeqb(dst, src);
3066 }
3067 
3068 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3069   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3070   Assembler::pcmpeqw(dst, src);
3071 }
3072 
3073 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3074   assert((dst->encoding() < 16),"XMM register should be 0-15");
3075   Assembler::pcmpestri(dst, src, imm8);
3076 }
3077 
3078 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3079   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3080   Assembler::pcmpestri(dst, src, imm8);
3081 }
3082 
3083 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3084   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3085   Assembler::pmovzxbw(dst, src);
3086 }
3087 
3088 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3089   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3090   Assembler::pmovzxbw(dst, src);
3091 }
3092 
3093 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3094   assert((src->encoding() < 16),"XMM register should be 0-15");
3095   Assembler::pmovmskb(dst, src);
3096 }
3097 
3098 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3099   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3100   Assembler::ptest(dst, src);
3101 }
3102 
3103 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3104   if (reachable(src)) {
3105     Assembler::sqrtsd(dst, as_Address(src));
3106   } else {
3107     lea(rscratch1, src);
3108     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3109   }
3110 }
3111 
3112 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3113   if (reachable(src)) {
3114     Assembler::sqrtss(dst, as_Address(src));
3115   } else {
3116     lea(rscratch1, src);
3117     Assembler::sqrtss(dst, Address(rscratch1, 0));
3118   }
3119 }
3120 
3121 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3122   if (reachable(src)) {
3123     Assembler::subsd(dst, as_Address(src));
3124   } else {
3125     lea(rscratch1, src);
3126     Assembler::subsd(dst, Address(rscratch1, 0));
3127   }
3128 }
3129 
3130 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3131   if (reachable(src)) {
3132     Assembler::roundsd(dst, as_Address(src), rmode);
3133   } else {
3134     lea(scratch_reg, src);
3135     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3136   }
3137 }
3138 
3139 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3140   if (reachable(src)) {
3141     Assembler::subss(dst, as_Address(src));
3142   } else {
3143     lea(rscratch1, src);
3144     Assembler::subss(dst, Address(rscratch1, 0));
3145   }
3146 }
3147 
3148 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3149   if (reachable(src)) {
3150     Assembler::ucomisd(dst, as_Address(src));
3151   } else {
3152     lea(rscratch1, src);
3153     Assembler::ucomisd(dst, Address(rscratch1, 0));
3154   }
3155 }
3156 
3157 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3158   if (reachable(src)) {
3159     Assembler::ucomiss(dst, as_Address(src));
3160   } else {
3161     lea(rscratch1, src);
3162     Assembler::ucomiss(dst, Address(rscratch1, 0));
3163   }
3164 }
3165 
3166 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3167   // Used in sign-bit flipping with aligned address.
3168   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3169   if (reachable(src)) {
3170     Assembler::xorpd(dst, as_Address(src));
3171   } else {
3172     lea(scratch_reg, src);
3173     Assembler::xorpd(dst, Address(scratch_reg, 0));
3174   }
3175 }
3176 
3177 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3178   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3179     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3180   }
3181   else {
3182     Assembler::xorpd(dst, src);
3183   }
3184 }
3185 
3186 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3187   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3188     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3189   } else {
3190     Assembler::xorps(dst, src);
3191   }
3192 }
3193 
3194 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3195   // Used in sign-bit flipping with aligned address.
3196   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3197   if (reachable(src)) {
3198     Assembler::xorps(dst, as_Address(src));
3199   } else {
3200     lea(scratch_reg, src);
3201     Assembler::xorps(dst, Address(scratch_reg, 0));
3202   }
3203 }
3204 
3205 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3206   // Used in sign-bit flipping with aligned address.
3207   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3208   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3209   if (reachable(src)) {
3210     Assembler::pshufb(dst, as_Address(src));
3211   } else {
3212     lea(rscratch1, src);
3213     Assembler::pshufb(dst, Address(rscratch1, 0));
3214   }
3215 }
3216 
3217 // AVX 3-operands instructions
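     // As elsewhere in this file, the AddressLiteral variants use the literal directly
     // as a memory operand when it is reachable; otherwise the address is first
     // materialized into a scratch register with lea and an indirect [scratch] operand
     // is used instead.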
3218 
3219 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3220   if (reachable(src)) {
3221     vaddsd(dst, nds, as_Address(src));
3222   } else {
3223     lea(rscratch1, src);
3224     vaddsd(dst, nds, Address(rscratch1, 0));
3225   }
3226 }
3227 
3228 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3229   if (reachable(src)) {
3230     vaddss(dst, nds, as_Address(src));
3231   } else {
3232     lea(rscratch1, src);
3233     vaddss(dst, nds, Address(rscratch1, 0));
3234   }
3235 }
3236 
3237 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3238   assert(UseAVX > 0, "requires some form of AVX");
3239   if (reachable(src)) {
3240     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3241   } else {
3242     lea(rscratch, src);
3243     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3244   }
3245 }
3246 
3247 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3248   assert(UseAVX > 0, "requires some form of AVX");
3249   if (reachable(src)) {
3250     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3251   } else {
3252     lea(rscratch, src);
3253     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3254   }
3255 }
3256 
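     // vabsss/vabssd: the absolute value is formed by AND-ing with the mask passed in
     // 'negate_field', which is expected to clear the sign bit.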
3257 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3258   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3259   vandps(dst, nds, negate_field, vector_len);
3260 }
3261 
3262 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3263   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3264   vandpd(dst, nds, negate_field, vector_len);
3265 }
3266 
3267 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3268   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3269   Assembler::vpaddb(dst, nds, src, vector_len);
3270 }
3271 
3272 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3273   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3274   Assembler::vpaddb(dst, nds, src, vector_len);
3275 }
3276 
3277 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3278   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3279   Assembler::vpaddw(dst, nds, src, vector_len);
3280 }
3281 
3282 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3283   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3284   Assembler::vpaddw(dst, nds, src, vector_len);
3285 }
3286 
3287 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3288   if (reachable(src)) {
3289     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3290   } else {
3291     lea(scratch_reg, src);
3292     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3293   }
3294 }
3295 
3296 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3297   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3298   Assembler::vpbroadcastw(dst, src, vector_len);
3299 }
3300 
3301 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3302   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3303   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3304 }
3305 
3306 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3307   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3308   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3309 }
3310 
3311 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3312                                AddressLiteral src, int vector_len, Register scratch_reg) {
3313   if (reachable(src)) {
3314     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3315   } else {
3316     lea(scratch_reg, src);
3317     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3318   }
3319 }
3320 
3321 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3322                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3323   if (reachable(src)) {
3324     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3325   } else {
3326     lea(scratch_reg, src);
3327     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3328   }
3329 }
3330 
3331 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3332                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3333   if (reachable(src)) {
3334     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3335   } else {
3336     lea(scratch_reg, src);
3337     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3338   }
3339 }
3340 
3341 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3342                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3343   if (reachable(src)) {
3344     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3345   } else {
3346     lea(scratch_reg, src);
3347     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3348   }
3349 }
3350 
3351 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3352                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3353   if (reachable(src)) {
3354     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3355   } else {
3356     lea(scratch_reg, src);
3357     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3358   }
3359 }
3360 
3361 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3362   if (width == Assembler::Q) {
3363     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3364   } else {
3365     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3366   }
3367 }
3368 
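     // vpcmpCCW synthesizes the requested comparison from the two primitive compares
     // the ISA provides. The condition encodings are the opcode bytes of pcmpeqb/w/d
     // (0x74/0x75/0x76) and pcmpgtb/w/d (0x64/0x65/0x66), or the 0F 38 forms pcmpeqq
     // (0x29) and pcmpgtq (0x37) for quadwords; neq/le/nlt are derived by swapping
     // operands and/or inverting the result with an all-ones XOR.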
3369 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
3370   int eq_cond_enc = 0x29;
3371   int gt_cond_enc = 0x37;
3372   if (width != Assembler::Q) {
3373     eq_cond_enc = 0x74 + width;
3374     gt_cond_enc = 0x64 + width;
3375   }
3376   switch (cond) {
3377   case eq:
3378     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3379     break;
3380   case neq:
3381     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3382     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3383     break;
3384   case le:
3385     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3386     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3387     break;
3388   case nlt:
3389     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3390     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3391     break;
3392   case lt:
3393     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3394     break;
3395   case nle:
3396     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3397     break;
3398   default:
3399     assert(false, "Should not reach here");
3400   }
3401 }
3402 
3403 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3404   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3405   Assembler::vpmovzxbw(dst, src, vector_len);
3406 }
3407 
3408 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3409   assert((src->encoding() < 16),"XMM register should be 0-15");
3410   Assembler::vpmovmskb(dst, src, vector_len);
3411 }
3412 
3413 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3414   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3415   Assembler::vpmullw(dst, nds, src, vector_len);
3416 }
3417 
3418 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3419   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3420   Assembler::vpmullw(dst, nds, src, vector_len);
3421 }
3422 
3423 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3424   assert((UseAVX > 0), "AVX support is needed");
3425   if (reachable(src)) {
3426     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3427   } else {
3428     lea(scratch_reg, src);
3429     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3430   }
3431 }
3432 
3433 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3434   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3435   Assembler::vpsubb(dst, nds, src, vector_len);
3436 }
3437 
3438 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3439   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3440   Assembler::vpsubb(dst, nds, src, vector_len);
3441 }
3442 
3443 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3444   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3445   Assembler::vpsubw(dst, nds, src, vector_len);
3446 }
3447 
3448 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3449   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3450   Assembler::vpsubw(dst, nds, src, vector_len);
3451 }
3452 
3453 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3454   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3455   Assembler::vpsraw(dst, nds, shift, vector_len);
3456 }
3457 
3458 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3459   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3460   Assembler::vpsraw(dst, nds, shift, vector_len);
3461 }
3462 
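     // vpsraq (64-bit arithmetic right shift) exists only as an EVEX instruction.
     // Without AVX512VL it cannot be encoded at 128/256-bit vector length, so the
     // operation is widened to 512 bits; callers presumably only consume the lower
     // lanes.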
3463 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3464   assert(UseAVX > 2, "requires AVX-512");
3465   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3466      vector_len = 2;
3467   }
3468   Assembler::evpsraq(dst, nds, shift, vector_len);
3469 }
3470 
3471 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3472   assert(UseAVX > 2, "requires AVX-512");
3473   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3474      vector_len = 2;
3475   }
3476   Assembler::evpsraq(dst, nds, shift, vector_len);
3477 }
3478 
3479 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3480   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3481   Assembler::vpsrlw(dst, nds, shift, vector_len);
3482 }
3483 
3484 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3485   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3486   Assembler::vpsrlw(dst, nds, shift, vector_len);
3487 }
3488 
3489 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3490   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3491   Assembler::vpsllw(dst, nds, shift, vector_len);
3492 }
3493 
3494 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3495   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3496   Assembler::vpsllw(dst, nds, shift, vector_len);
3497 }
3498 
3499 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3500   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3501   Assembler::vptest(dst, src);
3502 }
3503 
3504 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3505   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3506   Assembler::punpcklbw(dst, src);
3507 }
3508 
3509 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3510   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3511   Assembler::pshufd(dst, src, mode);
3512 }
3513 
3514 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3515   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3516   Assembler::pshuflw(dst, src, mode);
3517 }
3518 
3519 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3520   if (reachable(src)) {
3521     vandpd(dst, nds, as_Address(src), vector_len);
3522   } else {
3523     lea(scratch_reg, src);
3524     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3525   }
3526 }
3527 
3528 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3529   if (reachable(src)) {
3530     vandps(dst, nds, as_Address(src), vector_len);
3531   } else {
3532     lea(scratch_reg, src);
3533     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3534   }
3535 }
3536 
3537 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3538                             bool merge, int vector_len, Register scratch_reg) {
3539   if (reachable(src)) {
3540     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3541   } else {
3542     lea(scratch_reg, src);
3543     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3544   }
3545 }
3546 
3547 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3548   if (reachable(src)) {
3549     vdivsd(dst, nds, as_Address(src));
3550   } else {
3551     lea(rscratch1, src);
3552     vdivsd(dst, nds, Address(rscratch1, 0));
3553   }
3554 }
3555 
3556 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3557   if (reachable(src)) {
3558     vdivss(dst, nds, as_Address(src));
3559   } else {
3560     lea(rscratch1, src);
3561     vdivss(dst, nds, Address(rscratch1, 0));
3562   }
3563 }
3564 
3565 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3566   if (reachable(src)) {
3567     vmulsd(dst, nds, as_Address(src));
3568   } else {
3569     lea(rscratch1, src);
3570     vmulsd(dst, nds, Address(rscratch1, 0));
3571   }
3572 }
3573 
3574 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3575   if (reachable(src)) {
3576     vmulss(dst, nds, as_Address(src));
3577   } else {
3578     lea(rscratch1, src);
3579     vmulss(dst, nds, Address(rscratch1, 0));
3580   }
3581 }
3582 
3583 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3584   if (reachable(src)) {
3585     vsubsd(dst, nds, as_Address(src));
3586   } else {
3587     lea(rscratch1, src);
3588     vsubsd(dst, nds, Address(rscratch1, 0));
3589   }
3590 }
3591 
3592 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3593   if (reachable(src)) {
3594     vsubss(dst, nds, as_Address(src));
3595   } else {
3596     lea(rscratch1, src);
3597     vsubss(dst, nds, Address(rscratch1, 0));
3598   }
3599 }
3600 
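     // vnegatess/vnegatesd: negation is an XOR with the sign-bit mask expected at 'src'.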
3601 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3602   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3603   vxorps(dst, nds, src, Assembler::AVX_128bit);
3604 }
3605 
3606 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3607   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3608   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3609 }
3610 
3611 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3612   if (reachable(src)) {
3613     vxorpd(dst, nds, as_Address(src), vector_len);
3614   } else {
3615     lea(scratch_reg, src);
3616     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3617   }
3618 }
3619 
3620 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3621   if (reachable(src)) {
3622     vxorps(dst, nds, as_Address(src), vector_len);
3623   } else {
3624     lea(scratch_reg, src);
3625     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3626   }
3627 }
3628 
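     // The 256-bit integer vpxor requires AVX2; on AVX-only hardware a 256-bit request
     // falls back to vxorpd, which is bitwise equivalent.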
3629 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3630   if (UseAVX > 1 || (vector_len < 1)) {
3631     if (reachable(src)) {
3632       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3633     } else {
3634       lea(scratch_reg, src);
3635       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3636     }
3637   }
3638   else {
3639     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3640   }
3641 }
3642 
3643 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3644   if (reachable(src)) {
3645     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3646   } else {
3647     lea(scratch_reg, src);
3648     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3649   }
3650 }
3651 
3652 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3653   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3654   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3655   // The inverted mask is sign-extended
3656   andptr(possibly_jweak, inverted_jweak_mask);
3657 }
3658 
3659 void MacroAssembler::resolve_jobject(Register value,
3660                                      Register thread,
3661                                      Register tmp) {
3662   assert_different_registers(value, thread, tmp);
3663   Label done, not_weak;
3664   testptr(value, value);
3665   jcc(Assembler::zero, done);                // Use NULL as-is.
3666   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3667   jcc(Assembler::zero, not_weak);
3668   // Resolve jweak.
3669   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3670                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3671   verify_oop(value);
3672   jmp(done);
3673   bind(not_weak);
3674   // Resolve (untagged) jobject.
3675   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3676   verify_oop(value);
3677   bind(done);
3678 }
3679 
3680 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3681   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3682 }
3683 
3684 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3685 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3686   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3687 }
3688 
3689 void MacroAssembler::subptr(Register dst, Register src) {
3690   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3691 }
3692 
3693 // C++ bool manipulation
3694 void MacroAssembler::testbool(Register dst) {
3695   if (sizeof(bool) == 1)
3696     testb(dst, 0xff);
3697   else if (sizeof(bool) == 2) {
3698     // testw implementation needed for two-byte bools
3699     ShouldNotReachHere();
3700   } else if (sizeof(bool) == 4)
3701     testl(dst, dst);
3702   else
3703     // unsupported
3704     ShouldNotReachHere();
3705 }
3706 
3707 void MacroAssembler::testptr(Register dst, Register src) {
3708   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3709 }
3710 
3711 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3712 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3713                                    Register var_size_in_bytes,
3714                                    int con_size_in_bytes,
3715                                    Register t1,
3716                                    Register t2,
3717                                    Label& slow_case) {
3718   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3719   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3720 }
3721 
3722 // Defines obj, preserves var_size_in_bytes
3723 void MacroAssembler::eden_allocate(Register thread, Register obj,
3724                                    Register var_size_in_bytes,
3725                                    int con_size_in_bytes,
3726                                    Register t1,
3727                                    Label& slow_case) {
3728   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3729   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3730 }
3731 
3732 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
3733 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3734   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3735   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3736   Label done;
3737 
3738   testptr(length_in_bytes, length_in_bytes);
3739   jcc(Assembler::zero, done);
3740 
3741   // initialize topmost word, divide index by 2, check if odd and test if zero
3742   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3743 #ifdef ASSERT
3744   {
3745     Label L;
3746     testptr(length_in_bytes, BytesPerWord - 1);
3747     jcc(Assembler::zero, L);
3748     stop("length must be a multiple of BytesPerWord");
3749     bind(L);
3750   }
3751 #endif
3752   Register index = length_in_bytes;
3753   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3754   if (UseIncDec) {
3755     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
3756   } else {
3757     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3758     shrptr(index, 1);
3759   }
3760 #ifndef _LP64
3761   // index might not have been a multiple of 8 (i.e., bit 2 was set)
3762   {
3763     Label even;
3764     // note: if index was a multiple of 8, then it cannot
3765     //       be 0 now otherwise it must have been 0 before
3766     //       => if it is even, we don't need to check for 0 again
3767     jcc(Assembler::carryClear, even);
3768     // clear topmost word (no jump would be needed if conditional assignment worked here)
3769     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3770     // index could be 0 now, must check again
3771     jcc(Assembler::zero, done);
3772     bind(even);
3773   }
3774 #endif // !_LP64
3775   // initialize remaining object fields: index is a multiple of 2 now
3776   {
3777     Label loop;
3778     bind(loop);
3779     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3780     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3781     decrement(index);
3782     jcc(Assembler::notZero, loop);
3783   }
3784 
3785   bind(done);
3786 }
3787 
3788 // Look up the method for a megamorphic invokeinterface call.
3789 // The target method is determined by <intf_klass, itable_index>.
3790 // The receiver klass is in recv_klass.
3791 // On success, the result will be in method_result, and execution falls through.
3792 // On failure, execution transfers to the given label.
3793 void MacroAssembler::lookup_interface_method(Register recv_klass,
3794                                              Register intf_klass,
3795                                              RegisterOrConstant itable_index,
3796                                              Register method_result,
3797                                              Register scan_temp,
3798                                              Label& L_no_such_interface,
3799                                              bool return_method) {
3800   assert_different_registers(recv_klass, intf_klass, scan_temp);
3801   assert_different_registers(method_result, intf_klass, scan_temp);
3802   assert(recv_klass != method_result || !return_method,
3803          "recv_klass can be destroyed when method isn't needed");
3804 
3805   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3806          "caller must use same register for non-constant itable index as for method");
3807 
3808   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3809   int vtable_base = in_bytes(Klass::vtable_start_offset());
3810   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3811   int scan_step   = itableOffsetEntry::size() * wordSize;
3812   int vte_size    = vtableEntry::size_in_bytes();
3813   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3814   assert(vte_size == wordSize, "else adjust times_vte_scale");
3815 
3816   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3817 
3818   // %%% Could store the aligned, prescaled offset in the klassoop.
3819   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3820 
3821   if (return_method) {
3822     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3823     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3824     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3825   }
3826 
3827   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3828   //   if (scan->interface() == intf) {
3829   //     result = (klass + scan->offset() + itable_index);
3830   //   }
3831   // }
3832   Label search, found_method;
3833 
3834   for (int peel = 1; peel >= 0; peel--) {
3835     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3836     cmpptr(intf_klass, method_result);
3837 
3838     if (peel) {
3839       jccb(Assembler::equal, found_method);
3840     } else {
3841       jccb(Assembler::notEqual, search);
3842       // (invert the test to fall through to found_method...)
3843     }
3844 
3845     if (!peel)  break;
3846 
3847     bind(search);
3848 
3849     // Check that the previous entry is non-null.  A null entry means that
3850     // the receiver class doesn't implement the interface, and wasn't the
3851     // same as when the caller was compiled.
3852     testptr(method_result, method_result);
3853     jcc(Assembler::zero, L_no_such_interface);
3854     addptr(scan_temp, scan_step);
3855   }
3856 
3857   bind(found_method);
3858 
3859   if (return_method) {
3860     // Got a hit.
3861     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3862     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3863   }
3864 }
3865 
3866 
3867 // virtual method calling
3868 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3869                                            RegisterOrConstant vtable_index,
3870                                            Register method_result) {
3871   const int base = in_bytes(Klass::vtable_start_offset());
3872   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3873   Address vtable_entry_addr(recv_klass,
3874                             vtable_index, Address::times_ptr,
3875                             base + vtableEntry::method_offset_in_bytes());
3876   movptr(method_result, vtable_entry_addr);
3877 }
3878 
3879 
3880 void MacroAssembler::check_klass_subtype(Register sub_klass,
3881                            Register super_klass,
3882                            Register temp_reg,
3883                            Label& L_success) {
3884   Label L_failure;
3885   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3886   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3887   bind(L_failure);
3888 }
3889 
3890 
3891 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3892                                                    Register super_klass,
3893                                                    Register temp_reg,
3894                                                    Label* L_success,
3895                                                    Label* L_failure,
3896                                                    Label* L_slow_path,
3897                                         RegisterOrConstant super_check_offset) {
3898   assert_different_registers(sub_klass, super_klass, temp_reg);
3899   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3900   if (super_check_offset.is_register()) {
3901     assert_different_registers(sub_klass, super_klass,
3902                                super_check_offset.as_register());
3903   } else if (must_load_sco) {
3904     assert(temp_reg != noreg, "supply either a temp or a register offset");
3905   }
3906 
3907   Label L_fallthrough;
3908   int label_nulls = 0;
3909   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3910   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3911   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3912   assert(label_nulls <= 1, "at most one NULL in the batch");
3913 
3914   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3915   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3916   Address super_check_offset_addr(super_klass, sco_offset);
3917 
3918   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3919   // range of a jccb.  If this routine grows larger, reconsider at
3920   // least some of these.
3921 #define local_jcc(assembler_cond, label)                                \
3922   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3923   else                             jcc( assembler_cond, label) /*omit semi*/
3924 
3925   // Hacked jmp, which may only be used just before L_fallthrough.
3926 #define final_jmp(label)                                                \
3927   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3928   else                            jmp(label)                /*omit semi*/
3929 
3930   // If the pointers are equal, we are done (e.g., String[] elements).
3931   // This self-check enables sharing of secondary supertype arrays among
3932   // non-primary types such as array-of-interface.  Otherwise, each such
3933   // type would need its own customized SSA.
3934   // We move this check to the front of the fast path because many
3935   // type checks are in fact trivially successful in this manner,
3936   // so we get a nicely predicted branch right at the start of the check.
3937   cmpptr(sub_klass, super_klass);
3938   local_jcc(Assembler::equal, *L_success);
3939 
3940   // Check the supertype display:
3941   if (must_load_sco) {
3942     // Positive movl does right thing on LP64.
3943     movl(temp_reg, super_check_offset_addr);
3944     super_check_offset = RegisterOrConstant(temp_reg);
3945   }
3946   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3947   cmpptr(super_klass, super_check_addr); // load displayed supertype
3948 
3949   // This check has worked decisively for primary supers.
3950   // Secondary supers are sought in the super_cache ('super_cache_addr').
3951   // (Secondary supers are interfaces and very deeply nested subtypes.)
3952   // This works in the same check above because of a tricky aliasing
3953   // between the super_cache and the primary super display elements.
3954   // (The 'super_check_addr' can address either, as the case requires.)
3955   // Note that the cache is updated below if it does not help us find
3956   // what we need immediately.
3957   // So if it was a primary super, we can just fail immediately.
3958   // Otherwise, it's the slow path for us (no success at this point).
3959 
3960   if (super_check_offset.is_register()) {
3961     local_jcc(Assembler::equal, *L_success);
3962     cmpl(super_check_offset.as_register(), sc_offset);
3963     if (L_failure == &L_fallthrough) {
3964       local_jcc(Assembler::equal, *L_slow_path);
3965     } else {
3966       local_jcc(Assembler::notEqual, *L_failure);
3967       final_jmp(*L_slow_path);
3968     }
3969   } else if (super_check_offset.as_constant() == sc_offset) {
3970     // Need a slow path; fast failure is impossible.
3971     if (L_slow_path == &L_fallthrough) {
3972       local_jcc(Assembler::equal, *L_success);
3973     } else {
3974       local_jcc(Assembler::notEqual, *L_slow_path);
3975       final_jmp(*L_success);
3976     }
3977   } else {
3978     // No slow path; it's a fast decision.
3979     if (L_failure == &L_fallthrough) {
3980       local_jcc(Assembler::equal, *L_success);
3981     } else {
3982       local_jcc(Assembler::notEqual, *L_failure);
3983       final_jmp(*L_success);
3984     }
3985   }
3986 
3987   bind(L_fallthrough);
3988 
3989 #undef local_jcc
3990 #undef final_jmp
3991 }
3992 
3993 
3994 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3995                                                    Register super_klass,
3996                                                    Register temp_reg,
3997                                                    Register temp2_reg,
3998                                                    Label* L_success,
3999                                                    Label* L_failure,
4000                                                    bool set_cond_codes) {
4001   assert_different_registers(sub_klass, super_klass, temp_reg);
4002   if (temp2_reg != noreg)
4003     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4004 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4005 
4006   Label L_fallthrough;
4007   int label_nulls = 0;
4008   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4009   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4010   assert(label_nulls <= 1, "at most one NULL in the batch");
4011 
4012   // a couple of useful fields in sub_klass:
4013   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4014   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4015   Address secondary_supers_addr(sub_klass, ss_offset);
4016   Address super_cache_addr(     sub_klass, sc_offset);
4017 
4018   // Do a linear scan of the secondary super-klass chain.
4019   // This code is rarely used, so simplicity is a virtue here.
4020   // The repne_scan instruction uses fixed registers, which we must spill.
4021   // Don't worry too much about pre-existing connections with the input regs.
4022 
4023   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4024   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4025 
4026   // Get super_klass value into rax (even if it was in rdi or rcx).
4027   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4028   if (super_klass != rax || UseCompressedOops) {
4029     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4030     mov(rax, super_klass);
4031   }
4032   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4033   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4034 
4035 #ifndef PRODUCT
4036   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4037   ExternalAddress pst_counter_addr((address) pst_counter);
4038   NOT_LP64(  incrementl(pst_counter_addr) );
4039   LP64_ONLY( lea(rcx, pst_counter_addr) );
4040   LP64_ONLY( incrementl(Address(rcx, 0)) );
4041 #endif //PRODUCT
4042 
4043   // We will consult the secondary-super array.
4044   movptr(rdi, secondary_supers_addr);
4045   // Load the array length.  (Positive movl does right thing on LP64.)
4046   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4047   // Skip to start of data.
4048   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4049 
4050   // Scan RCX words at [RDI] for an occurrence of RAX.
4051   // Set NZ/Z based on last compare.
4052   // Z flag value will not be set by 'repne' if RCX == 0, since 'repne' does
4053   // not change flags (only the repeated scas instruction sets flags).
4054   // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4055 
4056   testptr(rax, rax); // Set Z = 0
4057   repne_scan();
4058 
4059   // Unspill the temp. registers:
4060   if (pushed_rdi)  pop(rdi);
4061   if (pushed_rcx)  pop(rcx);
4062   if (pushed_rax)  pop(rax);
4063 
4064   if (set_cond_codes) {
4065     // Special hack for the AD files:  rdi is guaranteed non-zero.
4066     assert(!pushed_rdi, "rdi must be left non-NULL");
4067     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4068   }
4069 
4070   if (L_failure == &L_fallthrough)
4071         jccb(Assembler::notEqual, *L_failure);
4072   else  jcc(Assembler::notEqual, *L_failure);
4073 
4074   // Success.  Cache the super we found and proceed in triumph.
4075   movptr(super_cache_addr, super_klass);
4076 
4077   if (L_success != &L_fallthrough) {
4078     jmp(*L_success);
4079   }
4080 
4081 #undef IS_A_TEMP
4082 
4083   bind(L_fallthrough);
4084 }
4085 
4086 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4087   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4088 
4089   Label L_fallthrough;
4090   if (L_fast_path == NULL) {
4091     L_fast_path = &L_fallthrough;
4092   } else if (L_slow_path == NULL) {
4093     L_slow_path = &L_fallthrough;
4094   }
4095 
4096   // Fast path check: class is fully initialized
4097   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4098   jcc(Assembler::equal, *L_fast_path);
4099 
4100   // Fast path check: current thread is initializer thread
4101   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4102   if (L_slow_path == &L_fallthrough) {
4103     jcc(Assembler::equal, *L_fast_path);
4104     bind(*L_slow_path);
4105   } else if (L_fast_path == &L_fallthrough) {
4106     jcc(Assembler::notEqual, *L_slow_path);
4107     bind(*L_fast_path);
4108   } else {
4109     Unimplemented();
4110   }
4111 }
4112 
4113 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4114   if (VM_Version::supports_cmov()) {
4115     cmovl(cc, dst, src);
4116   } else {
4117     Label L;
4118     jccb(negate_condition(cc), L);
4119     movl(dst, src);
4120     bind(L);
4121   }
4122 }
4123 
4124 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4125   if (VM_Version::supports_cmov()) {
4126     cmovl(cc, dst, src);
4127   } else {
4128     Label L;
4129     jccb(negate_condition(cc), L);
4130     movl(dst, src);
4131     bind(L);
4132   }
4133 }
4134 
4135 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4136   if (!VerifyOops) return;
4137 
4138   // Pass register number to verify_oop_subroutine
4139   const char* b = NULL;
4140   {
4141     ResourceMark rm;
4142     stringStream ss;
4143     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4144     b = code_string(ss.as_string());
4145   }
4146   BLOCK_COMMENT("verify_oop {");
4147 #ifdef _LP64
4148   push(rscratch1);                    // save r10, trashed by movptr()
4149 #endif
4150   push(rax);                          // save rax,
4151   push(reg);                          // pass register argument
4152   ExternalAddress buffer((address) b);
4153   // avoid using pushptr, as it modifies scratch registers
4154   // and our contract is not to modify anything
4155   movptr(rax, buffer.addr());
4156   push(rax);
4157   // call indirectly to solve generation ordering problem
4158   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4159   call(rax);
4160   // Caller pops the arguments (oop, message) and restores rax, r10
4161   BLOCK_COMMENT("} verify_oop");
4162 }
4163 
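     // vallones: vpternlogd with immediate 0xFF writes all-ones regardless of its
     // inputs (EVEX-encoded, so below 512 bits it needs AVX512VL). The fallback
     // compares a register with itself for equality, which also produces all-ones.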
4164 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4165   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4166     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4167   } else {
4168     assert(UseAVX > 0, "");
4169     vpcmpeqb(dst, dst, dst, vector_len);
4170   }
4171 }
4172 
4173 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4174                                          int extra_slot_offset) {
4175   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4176   int stackElementSize = Interpreter::stackElementSize;
4177   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4178 #ifdef ASSERT
4179   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4180   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4181 #endif
4182   Register             scale_reg    = noreg;
4183   Address::ScaleFactor scale_factor = Address::no_scale;
4184   if (arg_slot.is_constant()) {
4185     offset += arg_slot.as_constant() * stackElementSize;
4186   } else {
4187     scale_reg    = arg_slot.as_register();
4188     scale_factor = Address::times(stackElementSize);
4189   }
4190   offset += wordSize;           // return PC is on stack
4191   return Address(rsp, scale_reg, scale_factor, offset);
4192 }
4193 
4194 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4195   if (!VerifyOops) return;
4196 
4197   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4198   // Pass register number to verify_oop_subroutine
4199   const char* b = NULL;
4200   {
4201     ResourceMark rm;
4202     stringStream ss;
4203     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4204     b = code_string(ss.as_string());
4205   }
4206 #ifdef _LP64
4207   push(rscratch1);                    // save r10, trashed by movptr()
4208 #endif
4209   push(rax);                          // save rax,
4210   // addr may contain rsp, so we will have to adjust it based on the push
4211   // we just did (and on 64-bit we do two pushes).
4212   // NOTE: the 64-bit code seems to have had a bug: it did movq(addr, rax), which
4213   // stores rax into addr, the reverse of what was intended.
4214   if (addr.uses(rsp)) {
4215     lea(rax, addr);
4216     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4217   } else {
4218     pushptr(addr);
4219   }
4220 
4221   ExternalAddress buffer((address) b);
4222   // pass msg argument
4223   // avoid using pushptr, as it modifies scratch registers
4224   // and our contract is not to modify anything
4225   movptr(rax, buffer.addr());
4226   push(rax);
4227 
4228   // call indirectly to solve generation ordering problem
4229   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4230   call(rax);
4231   // Caller pops the arguments (addr, message) and restores rax, r10.
4232 }
4233 
4234 void MacroAssembler::verify_tlab() {
4235 #ifdef ASSERT
4236   if (UseTLAB && VerifyOops) {
4237     Label next, ok;
4238     Register t1 = rsi;
4239     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4240 
4241     push(t1);
4242     NOT_LP64(push(thread_reg));
4243     NOT_LP64(get_thread(thread_reg));
4244 
4245     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4246     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4247     jcc(Assembler::aboveEqual, next);
4248     STOP("assert(top >= start)");
4249     should_not_reach_here();
4250 
4251     bind(next);
4252     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4253     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4254     jcc(Assembler::aboveEqual, ok);
4255     STOP("assert(top <= end)");
4256     should_not_reach_here();
4257 
4258     bind(ok);
4259     NOT_LP64(pop(thread_reg));
4260     pop(t1);
4261   }
4262 #endif
4263 }
4264 
4265 class ControlWord {
4266  public:
4267   int32_t _value;
4268 
4269   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4270   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4271   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4272   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4273   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4274   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4275   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4276   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4277 
4278   void print() const {
4279     // rounding control
4280     const char* rc;
4281     switch (rounding_control()) {
4282       case 0: rc = "round near"; break;
4283       case 1: rc = "round down"; break;
4284       case 2: rc = "round up  "; break;
4285       case 3: rc = "chop      "; break;
4286       default:
4287         rc = NULL; // silence compiler warnings
4288         fatal("Unknown rounding control: %d", rounding_control());
4289     };
4290     // precision control
4291     const char* pc;
4292     switch (precision_control()) {
4293       case 0: pc = "24 bits "; break;
4294       case 1: pc = "reserved"; break;
4295       case 2: pc = "53 bits "; break;
4296       case 3: pc = "64 bits "; break;
4297       default:
4298         pc = NULL; // silence compiler warnings
4299         fatal("Unknown precision control: %d", precision_control());
4300     };
4301     // flags
4302     char f[9];
4303     f[0] = ' ';
4304     f[1] = ' ';
4305     f[2] = (precision   ()) ? 'P' : 'p';
4306     f[3] = (underflow   ()) ? 'U' : 'u';
4307     f[4] = (overflow    ()) ? 'O' : 'o';
4308     f[5] = (zero_divide ()) ? 'Z' : 'z';
4309     f[6] = (denormalized()) ? 'D' : 'd';
4310     f[7] = (invalid     ()) ? 'I' : 'i';
4311     f[8] = '\x0';
4312     // output
4313     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4314   }
4315 
4316 };
4317 
4318 class StatusWord {
4319  public:
4320   int32_t _value;
4321 
4322   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4323   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4324   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4325   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4326   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4327   int  top() const                     { return  (_value >> 11) & 7      ; }
4328   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4329   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4330   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4331   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4332   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4333   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4334   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4335   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4336 
4337   void print() const {
4338     // condition codes
4339     char c[5];
4340     c[0] = (C3()) ? '3' : '-';
4341     c[1] = (C2()) ? '2' : '-';
4342     c[2] = (C1()) ? '1' : '-';
4343     c[3] = (C0()) ? '0' : '-';
4344     c[4] = '\x0';
4345     // flags
4346     char f[9];
4347     f[0] = (error_status()) ? 'E' : '-';
4348     f[1] = (stack_fault ()) ? 'S' : '-';
4349     f[2] = (precision   ()) ? 'P' : '-';
4350     f[3] = (underflow   ()) ? 'U' : '-';
4351     f[4] = (overflow    ()) ? 'O' : '-';
4352     f[5] = (zero_divide ()) ? 'Z' : '-';
4353     f[6] = (denormalized()) ? 'D' : '-';
4354     f[7] = (invalid     ()) ? 'I' : '-';
4355     f[8] = '\x0';
4356     // output
4357     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4358   }
4359 
4360 };
4361 
4362 class TagWord {
4363  public:
4364   int32_t _value;
4365 
4366   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4367 
4368   void print() const {
4369     printf("%04x", _value & 0xFFFF);
4370   }
4371 
4372 };
4373 
4374 class FPU_Register {
4375  public:
4376   int32_t _m0;
4377   int32_t _m1;
4378   int16_t _ex;
4379 
4380   bool is_indefinite() const           {
4381     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4382   }
4383 
4384   void print() const {
4385     char  sign = (_ex < 0) ? '-' : '+';
4386     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4387     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4388   };
4389 
4390 };
4391 
4392 class FPU_State {
4393  public:
4394   enum {
4395     register_size       = 10,
4396     number_of_registers =  8,
4397     register_mask       =  7
4398   };
4399 
4400   ControlWord  _control_word;
4401   StatusWord   _status_word;
4402   TagWord      _tag_word;
4403   int32_t      _error_offset;
4404   int32_t      _error_selector;
4405   int32_t      _data_offset;
4406   int32_t      _data_selector;
4407   int8_t       _register[register_size * number_of_registers];
4408 
4409   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4410   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4411 
4412   const char* tag_as_string(int tag) const {
4413     switch (tag) {
4414       case 0: return "valid";
4415       case 1: return "zero";
4416       case 2: return "special";
4417       case 3: return "empty";
4418     }
4419     ShouldNotReachHere();
4420     return NULL;
4421   }
4422 
4423   void print() const {
4424     // print computation registers
4425     { int t = _status_word.top();
4426       for (int i = 0; i < number_of_registers; i++) {
4427         int j = (i - t) & register_mask;
4428         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4429         st(j)->print();
4430         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4431       }
4432     }
4433     printf("\n");
4434     // print control registers
4435     printf("ctrl = "); _control_word.print(); printf("\n");
4436     printf("stat = "); _status_word .print(); printf("\n");
4437     printf("tags = "); _tag_word    .print(); printf("\n");
4438   }
4439 
4440 };
4441 
4442 class Flag_Register {
4443  public:
4444   int32_t _value;
4445 
4446   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4447   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4448   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4449   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4450   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4451   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4452   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4453 
4454   void print() const {
4455     // flags
4456     char f[8];
4457     f[0] = (overflow       ()) ? 'O' : '-';
4458     f[1] = (direction      ()) ? 'D' : '-';
4459     f[2] = (sign           ()) ? 'S' : '-';
4460     f[3] = (zero           ()) ? 'Z' : '-';
4461     f[4] = (auxiliary_carry()) ? 'A' : '-';
4462     f[5] = (parity         ()) ? 'P' : '-';
4463     f[6] = (carry          ()) ? 'C' : '-';
4464     f[7] = '\x0';
4465     // output
4466     printf("%08x  flags = %s", _value, f);
4467   }
4468 
4469 };
4470 
4471 class IU_Register {
4472  public:
4473   int32_t _value;
4474 
4475   void print() const {
4476     printf("%08x  %11d", _value, _value);
4477   }
4478 
4479 };
4480 
4481 class IU_State {
4482  public:
4483   Flag_Register _eflags;
4484   IU_Register   _rdi;
4485   IU_Register   _rsi;
4486   IU_Register   _rbp;
4487   IU_Register   _rsp;
4488   IU_Register   _rbx;
4489   IU_Register   _rdx;
4490   IU_Register   _rcx;
4491   IU_Register   _rax;
4492 
4493   void print() const {
4494     // computation registers
4495     printf("rax  = "); _rax.print(); printf("\n");
4496     printf("rbx,  = "); _rbx.print(); printf("\n");
4497     printf("rcx  = "); _rcx.print(); printf("\n");
4498     printf("rdx  = "); _rdx.print(); printf("\n");
4499     printf("rdi  = "); _rdi.print(); printf("\n");
4500     printf("rsi  = "); _rsi.print(); printf("\n");
4501     printf("rbp  = "); _rbp.print(); printf("\n");
4502     printf("rsp  = "); _rsp.print(); printf("\n");
4503     printf("\n");
4504     // control registers
4505     printf("flgs = "); _eflags.print(); printf("\n");
4506   }
4507 };
4508 
4509 
4510 class CPU_State {
4511  public:
4512   FPU_State _fpu_state;
4513   IU_State  _iu_state;
4514 
4515   void print() const {
4516     printf("--------------------------------------------------\n");
4517     _iu_state .print();
4518     printf("\n");
4519     _fpu_state.print();
4520     printf("--------------------------------------------------\n");
4521   }
4522 
4523 };
4524 
4525 
4526 static void _print_CPU_state(CPU_State* state) {
4527   state->print();
4528 };
4529 
4530 
4531 void MacroAssembler::print_CPU_state() {
4532   push_CPU_state();
4533   push(rsp);                // pass CPU state
4534   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4535   addptr(rsp, wordSize);       // discard argument
4536   pop_CPU_state();
4537 }
4538 
4539 
4540 #ifndef _LP64
4541 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4542   static int counter = 0;
4543   FPU_State* fs = &state->_fpu_state;
4544   counter++;
4545   // For leaf calls, only verify that the top few elements remain empty.
4546   // We only need 1 empty at the top for C2 code.
  if (stack_depth < 0) {
    if (fs->tag_for_st(7) != 3) {
4549       printf("FPR7 not empty\n");
4550       state->print();
4551       assert(false, "error");
4552       return false;
4553     }
4554     return true;                // All other stack states do not matter
4555   }
4556 
4557   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4558          "bad FPU control word");
4559 
4560   // compute stack depth
4561   int i = 0;
4562   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4563   int d = i;
4564   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
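  // 'd' is the number of in-use registers counted down from the top of the
  // x87 stack; a well-formed stack has all in-use entries first, followed
  // only by empty tags.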
4565   // verify findings
4566   if (i != FPU_State::number_of_registers) {
4567     // stack not contiguous
4568     printf("%s: stack not contiguous at ST%d\n", s, i);
4569     state->print();
4570     assert(false, "error");
4571     return false;
4572   }
4573   // check if computed stack depth corresponds to expected stack depth
4574   if (stack_depth < 0) {
4575     // expected stack depth is -stack_depth or less
4576     if (d > -stack_depth) {
4577       // too many elements on the stack
4578       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4579       state->print();
4580       assert(false, "error");
4581       return false;
4582     }
4583   } else {
4584     // expected stack depth is stack_depth
4585     if (d != stack_depth) {
4586       // wrong stack depth
4587       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4588       state->print();
4589       assert(false, "error");
4590       return false;
4591     }
4592   }
4593   // everything is cool
4594   return true;
4595 }
4596 
4597 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4598   if (!VerifyFPU) return;
4599   push_CPU_state();
4600   push(rsp);                // pass CPU state
4601   ExternalAddress msg((address) s);
4602   // pass message string s
4603   pushptr(msg.addr());
4604   push(stack_depth);        // pass stack depth
4605   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4606   addptr(rsp, 3 * wordSize);   // discard arguments
4607   // check for error
4608   { Label L;
4609     testl(rax, rax);
4610     jcc(Assembler::notZero, L);
4611     int3();                  // break if error condition
4612     bind(L);
4613   }
4614   pop_CPU_state();
4615 }
#endif // !_LP64
4617 
4618 void MacroAssembler::restore_cpu_control_state_after_jni() {
4619   // Either restore the MXCSR register after returning from the JNI Call
4620   // or verify that it wasn't changed (with -Xcheck:jni flag).
4621   if (VM_Version::supports_sse()) {
4622     if (RestoreMXCSROnJNICalls) {
4623       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4624     } else if (CheckJNICalls) {
4625       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4626     }
4627   }
4628   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4629   vzeroupper();

#ifdef COMPILER2
  // Reset k1 to 0xffff.
4633   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4634     push(rcx);
4635     movl(rcx, 0xffff);
4636     kmovwl(k1, rcx);
4637     pop(rcx);
4638   }
4639 #endif // COMPILER2
4640 
4641 #ifndef _LP64
4642   // Either restore the x87 floating pointer control word after returning
4643   // from the JNI call or verify that it wasn't changed.
4644   if (CheckJNICalls) {
4645     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4646   }
#endif // !_LP64
4648 }
4649 
4650 // ((OopHandle)result).resolve();
4651 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4652   assert_different_registers(result, tmp);
4653 
  // Only 64-bit platforms support GCs that require a tmp register
4655   // Only IN_HEAP loads require a thread_tmp register
4656   // OopHandle::resolve is an indirection like jobject.
4657   access_load_at(T_OBJECT, IN_NATIVE,
4658                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4659 }
4660 
4661 // ((WeakHandle)result).resolve();
4662 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4663   assert_different_registers(rresult, rtmp);
4664   Label resolved;
4665 
4666   // A null weak handle resolves to null.
4667   cmpptr(rresult, 0);
4668   jcc(Assembler::equal, resolved);
4669 
  // Only 64-bit platforms support GCs that require a tmp register
4671   // Only IN_HEAP loads require a thread_tmp register
4672   // WeakHandle::resolve is an indirection like jweak.
4673   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4674                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4675   bind(resolved);
4676 }
4677 
4678 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4679   // get mirror
4680   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4681   load_method_holder(mirror, method);
4682   movptr(mirror, Address(mirror, mirror_offset));
4683   resolve_oop_handle(mirror, tmp);
4684 }
4685 
4686 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4687   load_method_holder(rresult, rmethod);
4688   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4689 }
4690 
4691 void MacroAssembler::load_method_holder(Register holder, Register method) {
4692   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4693   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4694   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4695 }
4696 
4697 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4698   assert_different_registers(src, tmp);
4699   assert_different_registers(dst, tmp);
4700 #ifdef _LP64
4701   if (UseCompressedClassPointers) {
4702     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4703     decode_klass_not_null(dst, tmp);
4704   } else
4705 #endif
4706     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4707 }
4708 
4709 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4710   assert_different_registers(src, tmp);
4711   assert_different_registers(dst, tmp);
4712 #ifdef _LP64
4713   if (UseCompressedClassPointers) {
4714     encode_klass_not_null(src, tmp);
4715     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4716   } else
4717 #endif
4718     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4719 }
4720 
4721 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4722                                     Register tmp1, Register thread_tmp) {
4723   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4724   decorators = AccessInternal::decorator_fixup(decorators);
4725   bool as_raw = (decorators & AS_RAW) != 0;
4726   if (as_raw) {
4727     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4728   } else {
4729     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4730   }
4731 }
4732 
4733 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4734                                      Register tmp1, Register tmp2) {
4735   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4736   decorators = AccessInternal::decorator_fixup(decorators);
4737   bool as_raw = (decorators & AS_RAW) != 0;
4738   if (as_raw) {
4739     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4740   } else {
4741     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4742   }
4743 }
4744 
4745 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4746                                    Register thread_tmp, DecoratorSet decorators) {
4747   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4748 }
4749 
// Doesn't do verification, generates fixed-size code
4751 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4752                                             Register thread_tmp, DecoratorSet decorators) {
4753   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4754 }
4755 
4756 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4757                                     Register tmp2, DecoratorSet decorators) {
4758   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4759 }
4760 
4761 // Used for storing NULLs.
4762 void MacroAssembler::store_heap_oop_null(Address dst) {
4763   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4764 }
4765 
4766 #ifdef _LP64
4767 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4768   if (UseCompressedClassPointers) {
4769     // Store to klass gap in destination
4770     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4771   }
4772 }
4773 
4774 #ifdef ASSERT
4775 void MacroAssembler::verify_heapbase(const char* msg) {
4776   assert (UseCompressedOops, "should be compressed");
4777   assert (Universe::heap() != NULL, "java heap should be initialized");
4778   if (CheckCompressedOops) {
4779     Label ok;
4780     push(rscratch1); // cmpptr trashes rscratch1
4781     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4782     jcc(Assembler::equal, ok);
4783     STOP(msg);
4784     bind(ok);
4785     pop(rscratch1);
4786   }
4787 }
4788 #endif
4789 
4790 // Algorithm must match oop.inline.hpp encode_heap_oop.
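// In effect: narrow = (oop == NULL) ? 0 : (oop - CompressedOops::base()) >> shift.
// The cmov below substitutes the heap base for a NULL oop so that NULL encodes
// to 0 without a branch.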
4791 void MacroAssembler::encode_heap_oop(Register r) {
4792 #ifdef ASSERT
4793   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4794 #endif
4795   verify_oop_msg(r, "broken oop in encode_heap_oop");
4796   if (CompressedOops::base() == NULL) {
4797     if (CompressedOops::shift() != 0) {
4798       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4799       shrq(r, LogMinObjAlignmentInBytes);
4800     }
4801     return;
4802   }
4803   testq(r, r);
4804   cmovq(Assembler::equal, r, r12_heapbase);
4805   subq(r, r12_heapbase);
4806   shrq(r, LogMinObjAlignmentInBytes);
4807 }
4808 
4809 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4810 #ifdef ASSERT
4811   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4812   if (CheckCompressedOops) {
4813     Label ok;
4814     testq(r, r);
4815     jcc(Assembler::notEqual, ok);
4816     STOP("null oop passed to encode_heap_oop_not_null");
4817     bind(ok);
4818   }
4819 #endif
4820   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4821   if (CompressedOops::base() != NULL) {
4822     subq(r, r12_heapbase);
4823   }
4824   if (CompressedOops::shift() != 0) {
4825     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4826     shrq(r, LogMinObjAlignmentInBytes);
4827   }
4828 }
4829 
4830 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4831 #ifdef ASSERT
4832   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4833   if (CheckCompressedOops) {
4834     Label ok;
4835     testq(src, src);
4836     jcc(Assembler::notEqual, ok);
4837     STOP("null oop passed to encode_heap_oop_not_null2");
4838     bind(ok);
4839   }
4840 #endif
4841   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4842   if (dst != src) {
4843     movq(dst, src);
4844   }
4845   if (CompressedOops::base() != NULL) {
4846     subq(dst, r12_heapbase);
4847   }
4848   if (CompressedOops::shift() != 0) {
4849     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4850     shrq(dst, LogMinObjAlignmentInBytes);
4851   }
4852 }
4853 
4854 void  MacroAssembler::decode_heap_oop(Register r) {
4855 #ifdef ASSERT
4856   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4857 #endif
4858   if (CompressedOops::base() == NULL) {
4859     if (CompressedOops::shift() != 0) {
4860       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4861       shlq(r, LogMinObjAlignmentInBytes);
4862     }
4863   } else {
4864     Label done;
4865     shlq(r, LogMinObjAlignmentInBytes);
4866     jccb(Assembler::equal, done);
4867     addq(r, r12_heapbase);
4868     bind(done);
4869   }
4870   verify_oop_msg(r, "broken oop in decode_heap_oop");
4871 }
4872 
4873 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4874   // Note: it will change flags
4875   assert (UseCompressedOops, "should only be used for compressed headers");
4876   assert (Universe::heap() != NULL, "java heap should be initialized");
4877   // Cannot assert, unverified entry point counts instructions (see .ad file)
4878   // vtableStubs also counts instructions in pd_code_size_limit.
4879   // Also do not verify_oop as this is called by verify_oop.
4880   if (CompressedOops::shift() != 0) {
4881     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4882     shlq(r, LogMinObjAlignmentInBytes);
4883     if (CompressedOops::base() != NULL) {
4884       addq(r, r12_heapbase);
4885     }
4886   } else {
4887     assert (CompressedOops::base() == NULL, "sanity");
4888   }
4889 }
4890 
4891 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4892   // Note: it will change flags
4893   assert (UseCompressedOops, "should only be used for compressed headers");
4894   assert (Universe::heap() != NULL, "java heap should be initialized");
4895   // Cannot assert, unverified entry point counts instructions (see .ad file)
4896   // vtableStubs also counts instructions in pd_code_size_limit.
4897   // Also do not verify_oop as this is called by verify_oop.
4898   if (CompressedOops::shift() != 0) {
4899     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4900     if (LogMinObjAlignmentInBytes == Address::times_8) {
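      // With an 8-byte alignment shift the whole decode, base + (narrow << 3),
      // fits in a single lea.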
4901       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4902     } else {
4903       if (dst != src) {
4904         movq(dst, src);
4905       }
4906       shlq(dst, LogMinObjAlignmentInBytes);
4907       if (CompressedOops::base() != NULL) {
4908         addq(dst, r12_heapbase);
4909       }
4910     }
4911   } else {
4912     assert (CompressedOops::base() == NULL, "sanity");
4913     if (dst != src) {
4914       movq(dst, src);
4915     }
4916   }
4917 }
4918 
4919 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4920   assert_different_registers(r, tmp);
4921   if (CompressedKlassPointers::base() != NULL) {
4922     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4923     subq(r, tmp);
4924   }
4925   if (CompressedKlassPointers::shift() != 0) {
4926     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4927     shrq(r, LogKlassAlignmentInBytes);
4928   }
4929 }
4930 
4931 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4932   assert_different_registers(src, dst);
4933   if (CompressedKlassPointers::base() != NULL) {
4934     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4935     addq(dst, src);
4936   } else {
4937     movptr(dst, src);
4938   }
4939   if (CompressedKlassPointers::shift() != 0) {
4940     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4941     shrq(dst, LogKlassAlignmentInBytes);
4942   }
4943 }
4944 
4945 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4946   assert_different_registers(r, tmp);
4947   // Note: it will change flags
4948   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4949   // Cannot assert, unverified entry point counts instructions (see .ad file)
4950   // vtableStubs also counts instructions in pd_code_size_limit.
4951   // Also do not verify_oop as this is called by verify_oop.
4952   if (CompressedKlassPointers::shift() != 0) {
4953     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4954     shlq(r, LogKlassAlignmentInBytes);
4955   }
4956   if (CompressedKlassPointers::base() != NULL) {
4957     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4958     addq(r, tmp);
4959   }
4960 }
4961 
4962 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
4963   assert_different_registers(src, dst);
4964   // Note: it will change flags
4965   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4966   // Cannot assert, unverified entry point counts instructions (see .ad file)
4967   // vtableStubs also counts instructions in pd_code_size_limit.
4968   // Also do not verify_oop as this is called by verify_oop.
4969 
4970   if (CompressedKlassPointers::base() == NULL &&
4971       CompressedKlassPointers::shift() == 0) {
4972     // The best case scenario is that there is no base or shift. Then it is already
4973     // a pointer that needs nothing but a register rename.
4974     movl(dst, src);
4975   } else {
4976     if (CompressedKlassPointers::base() != NULL) {
4977       mov64(dst, (int64_t)CompressedKlassPointers::base());
4978     } else {
4979       xorq(dst, dst);
4980     }
4981     if (CompressedKlassPointers::shift() != 0) {
4982       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4983       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
4984       leaq(dst, Address(dst, src, Address::times_8, 0));
4985     } else {
4986       addq(dst, src);
4987     }
4988   }
4989 }
4990 
4991 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4992   assert (UseCompressedOops, "should only be used for compressed headers");
4993   assert (Universe::heap() != NULL, "java heap should be initialized");
4994   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4995   int oop_index = oop_recorder()->find_index(obj);
4996   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4997   mov_narrow_oop(dst, oop_index, rspec);
4998 }
4999 
5000 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5001   assert (UseCompressedOops, "should only be used for compressed headers");
5002   assert (Universe::heap() != NULL, "java heap should be initialized");
5003   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5004   int oop_index = oop_recorder()->find_index(obj);
5005   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5006   mov_narrow_oop(dst, oop_index, rspec);
5007 }
5008 
5009 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5010   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5011   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5012   int klass_index = oop_recorder()->find_index(k);
5013   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5014   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5015 }
5016 
5017 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5018   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5019   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5020   int klass_index = oop_recorder()->find_index(k);
5021   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5022   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5023 }
5024 
5025 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5026   assert (UseCompressedOops, "should only be used for compressed headers");
5027   assert (Universe::heap() != NULL, "java heap should be initialized");
5028   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5029   int oop_index = oop_recorder()->find_index(obj);
5030   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5031   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5032 }
5033 
5034 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5035   assert (UseCompressedOops, "should only be used for compressed headers");
5036   assert (Universe::heap() != NULL, "java heap should be initialized");
5037   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5038   int oop_index = oop_recorder()->find_index(obj);
5039   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5040   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5041 }
5042 
5043 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5044   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5045   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5046   int klass_index = oop_recorder()->find_index(k);
5047   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5048   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5049 }
5050 
5051 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5052   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5053   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5054   int klass_index = oop_recorder()->find_index(k);
5055   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5056   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5057 }
5058 
5059 void MacroAssembler::reinit_heapbase() {
5060   if (UseCompressedOops) {
5061     if (Universe::heap() != NULL) {
5062       if (CompressedOops::base() == NULL) {
5063         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5064       } else {
5065         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5066       }
5067     } else {
5068       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5069     }
5070   }
5071 }
5072 
5073 #endif // _LP64
5074 
5075 // C2 compiled method's prolog code.
5076 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5077 
5078   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5079   // NativeJump::patch_verified_entry will be able to patch out the entry
5080   // code safely. The push to verify stack depth is ok at 5 bytes,
5081   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5082   // stack bang then we must use the 6 byte frame allocation even if
5083   // we have no frame. :-(
5084   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5085 
5086   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5087   // Remove word for return addr
5088   framesize -= wordSize;
5089   stack_bang_size -= wordSize;
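  // (Both sizes arrive including the return-address slot already pushed by the
  // caller's call instruction, hence the adjustment above.)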
5090 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
5095   // See bugs 4446381, 4468289, 4497237.
5096   if (stack_bang_size > 0) {
5097     generate_stack_overflow_check(stack_bang_size);
5098 
5099     // We always push rbp, so that on return to interpreter rbp, will be
5100     // restored correctly and we can correct the stack.
5101     push(rbp);
5102     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5103     if (PreserveFramePointer) {
5104       mov(rbp, rsp);
5105     }
5106     // Remove word for ebp
5107     framesize -= wordSize;
5108 
5109     // Create frame
5110     if (framesize) {
5111       subptr(rsp, framesize);
5112     }
5113   } else {
5114     // Create frame (force generation of a 4 byte immediate value)
5115     subptr_imm32(rsp, framesize);
5116 
5117     // Save RBP register now.
5118     framesize -= wordSize;
5119     movptr(Address(rsp, framesize), rbp);
5120     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5121     if (PreserveFramePointer) {
5122       movptr(rbp, rsp);
5123       if (framesize > 0) {
5124         addptr(rbp, framesize);
5125       }
5126     }
5127   }
5128 
  if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
5130     framesize -= wordSize;
5131     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5132   }
5133 
5134 #ifndef _LP64
5135   // If method sets FPU control word do it now
5136   if (fp_mode_24b) {
5137     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5138   }
5139   if (UseSSE >= 2 && VerifyFPU) {
5140     verify_FPU(0, "FPU stack must be clean on entry");
5141   }
5142 #endif
5143 
5144 #ifdef ASSERT
5145   if (VerifyStackAtCalls) {
5146     Label L;
5147     push(rax);
5148     mov(rax, rsp);
5149     andptr(rax, StackAlignmentInBytes-1);
5150     cmpptr(rax, StackAlignmentInBytes-wordSize);
5151     pop(rax);
5152     jcc(Assembler::equal, L);
5153     STOP("Stack is not properly aligned!");
5154     bind(L);
5155   }
5156 #endif
5157 
5158   if (!is_stub) {
5159     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5160     bs->nmethod_entry_barrier(this);
5161   }
5162 }
5163 
5164 #if COMPILER2_OR_JVMCI
5165 
5166 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5167 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5168   // cnt - number of qwords (8-byte words).
5169   // base - start address, qword aligned.
5170   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5171   bool use64byteVector = MaxVectorSize == 64 && AVX3Threshold == 0;
5172   if (use64byteVector) {
5173     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5174   } else if (MaxVectorSize >= 32) {
5175     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5176   } else {
5177     pxor(xtmp, xtmp);
5178   }
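  // Rough flow: clear in 64-byte strides while at least 8 qwords remain, then
  // one 32-byte step (or, on AVX-512, a single masked 64-byte store) for a
  // 4..7 qword remainder, and finally individual qwords or a masked 32-byte
  // store for the last few.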
5179   jmp(L_zero_64_bytes);
5180 
5181   BIND(L_loop);
5182   if (MaxVectorSize >= 32) {
5183     fill64(base, 0, xtmp, use64byteVector);
5184   } else {
5185     movdqu(Address(base,  0), xtmp);
5186     movdqu(Address(base, 16), xtmp);
5187     movdqu(Address(base, 32), xtmp);
5188     movdqu(Address(base, 48), xtmp);
5189   }
5190   addptr(base, 64);
5191 
5192   BIND(L_zero_64_bytes);
5193   subptr(cnt, 8);
5194   jccb(Assembler::greaterEqual, L_loop);
5195 
5196   // Copy trailing 64 bytes
5197   if (use64byteVector) {
5198     addptr(cnt, 8);
5199     jccb(Assembler::equal, L_end);
5200     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5201     jmp(L_end);
5202   } else {
5203     addptr(cnt, 4);
5204     jccb(Assembler::less, L_tail);
5205     if (MaxVectorSize >= 32) {
5206       vmovdqu(Address(base, 0), xtmp);
5207     } else {
5208       movdqu(Address(base,  0), xtmp);
5209       movdqu(Address(base, 16), xtmp);
5210     }
5211   }
5212   addptr(base, 32);
5213   subptr(cnt, 4);
5214 
5215   BIND(L_tail);
5216   addptr(cnt, 4);
5217   jccb(Assembler::lessEqual, L_end);
5218   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5219     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5220   } else {
5221     decrement(cnt);
5222 
5223     BIND(L_sloop);
5224     movq(Address(base, 0), xtmp);
5225     addptr(base, 8);
5226     decrement(cnt);
5227     jccb(Assembler::greaterEqual, L_sloop);
5228   }
5229   BIND(L_end);
5230 }
5231 
// Clear constant-sized memory using YMM/ZMM registers.
5233 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5234   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5235   bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
5236 
5237   int vector64_count = (cnt & (~0x7)) >> 3;
5238   cnt = cnt & 0x7;
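  // e.g. cnt == 19 qwords -> vector64_count == 2 full 64-byte stores plus a
  // 3-qword tail written below under a 0x7 k-register mask (one bit per qword).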
5239 
5240   // 64 byte initialization loop.
5241   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5242   for (int i = 0; i < vector64_count; i++) {
5243     fill64(base, i * 64, xtmp, use64byteVector);
5244   }
5245 
5246   // Clear remaining 64 byte tail.
5247   int disp = vector64_count * 64;
5248   if (cnt) {
5249     switch (cnt) {
5250       case 1:
5251         movq(Address(base, disp), xtmp);
5252         break;
5253       case 2:
5254         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5255         break;
5256       case 3:
5257         movl(rtmp, 0x7);
5258         kmovwl(mask, rtmp);
5259         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5260         break;
5261       case 4:
5262         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5263         break;
5264       case 5:
5265         if (use64byteVector) {
5266           movl(rtmp, 0x1F);
5267           kmovwl(mask, rtmp);
5268           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5269         } else {
5270           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5271           movq(Address(base, disp + 32), xtmp);
5272         }
5273         break;
5274       case 6:
5275         if (use64byteVector) {
5276           movl(rtmp, 0x3F);
5277           kmovwl(mask, rtmp);
5278           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5279         } else {
5280           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5281           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5282         }
5283         break;
5284       case 7:
5285         if (use64byteVector) {
5286           movl(rtmp, 0x7F);
5287           kmovwl(mask, rtmp);
5288           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5289         } else {
5290           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5291           movl(rtmp, 0x7);
5292           kmovwl(mask, rtmp);
5293           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5294         }
5295         break;
5296       default:
        fatal("Unexpected length: %d\n", cnt);
5298         break;
5299     }
5300   }
5301 }
5302 
5303 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5304                                bool is_large, KRegister mask) {
5305   // cnt      - number of qwords (8-byte words).
5306   // base     - start address, qword aligned.
5307   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5308   assert(base==rdi, "base register must be edi for rep stos");
5309   assert(tmp==rax,   "tmp register must be eax for rep stos");
5310   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
    "InitArrayShortSize should be a multiple of BytesPerLong");
5313 
5314   Label DONE;
5315   if (!is_large || !UseXMMForObjInit) {
5316     xorptr(tmp, tmp);
5317   }
5318 
5319   if (!is_large) {
5320     Label LOOP, LONG;
5321     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5322     jccb(Assembler::greater, LONG);
5323 
5324     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5325 
5326     decrement(cnt);
5327     jccb(Assembler::negative, DONE); // Zero length
5328 
5329     // Use individual pointer-sized stores for small counts:
5330     BIND(LOOP);
5331     movptr(Address(base, cnt, Address::times_ptr), tmp);
5332     decrement(cnt);
5333     jccb(Assembler::greaterEqual, LOOP);
5334     jmpb(DONE);
5335 
5336     BIND(LONG);
5337   }
5338 
5339   // Use longer rep-prefixed ops for non-small counts:
5340   if (UseFastStosb) {
5341     shlptr(cnt, 3); // convert to number of bytes
5342     rep_stosb();
5343   } else if (UseXMMForObjInit) {
5344     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5345   } else {
5346     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5347     rep_stos();
5348   }
5349 
5350   BIND(DONE);
5351 }
5352 
5353 #endif //COMPILER2_OR_JVMCI
5354 
5355 
5356 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5357                                    Register to, Register value, Register count,
5358                                    Register rtmp, XMMRegister xtmp) {
5359   ShortBranchVerifier sbv(this);
5360   assert_different_registers(to, value, count, rtmp);
5361   Label L_exit;
5362   Label L_fill_2_bytes, L_fill_4_bytes;
5363 
5364 #if defined(COMPILER2) && defined(_LP64)
  if (MaxVectorSize >= 32 &&
5366      VM_Version::supports_avx512vlbw() &&
5367      VM_Version::supports_bmi2()) {
5368     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5369     return;
5370   }
5371 #endif
5372 
5373   int shift = -1;
5374   switch (t) {
5375     case T_BYTE:
5376       shift = 2;
5377       break;
5378     case T_SHORT:
5379       shift = 1;
5380       break;
5381     case T_INT:
5382       shift = 0;
5383       break;
5384     default: ShouldNotReachHere();
5385   }
5386 
5387   if (t == T_BYTE) {
5388     andl(value, 0xff);
5389     movl(rtmp, value);
5390     shll(rtmp, 8);
5391     orl(value, rtmp);
5392   }
5393   if (t == T_SHORT) {
5394     andl(value, 0xffff);
5395   }
5396   if (t == T_BYTE || t == T_SHORT) {
5397     movl(rtmp, value);
5398     shll(rtmp, 16);
5399     orl(value, rtmp);
5400   }
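  // e.g. a T_BYTE value of 0xAB has been widened to 0xABABABAB and a T_SHORT
  // value of 0x1234 to 0x12341234, so each 32-bit store fills several elements.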
5401 
5402   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5403   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5404   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5405     Label L_skip_align2;
5406     // align source address at 4 bytes address boundary
5407     if (t == T_BYTE) {
5408       Label L_skip_align1;
5409       // One byte misalignment happens only for byte arrays
5410       testptr(to, 1);
5411       jccb(Assembler::zero, L_skip_align1);
5412       movb(Address(to, 0), value);
5413       increment(to);
5414       decrement(count);
5415       BIND(L_skip_align1);
5416     }
5417     // Two bytes misalignment happens only for byte and short (char) arrays
5418     testptr(to, 2);
5419     jccb(Assembler::zero, L_skip_align2);
5420     movw(Address(to, 0), value);
5421     addptr(to, 2);
5422     subl(count, 1<<(shift-1));
5423     BIND(L_skip_align2);
5424   }
5425   if (UseSSE < 2) {
5426     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5427     // Fill 32-byte chunks
5428     subl(count, 8 << shift);
5429     jcc(Assembler::less, L_check_fill_8_bytes);
5430     align(16);
5431 
5432     BIND(L_fill_32_bytes_loop);
5433 
5434     for (int i = 0; i < 32; i += 4) {
5435       movl(Address(to, i), value);
5436     }
5437 
5438     addptr(to, 32);
5439     subl(count, 8 << shift);
5440     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5441     BIND(L_check_fill_8_bytes);
5442     addl(count, 8 << shift);
5443     jccb(Assembler::zero, L_exit);
5444     jmpb(L_fill_8_bytes);
5445 
5446     //
5447     // length is too short, just fill qwords
5448     //
5449     BIND(L_fill_8_bytes_loop);
5450     movl(Address(to, 0), value);
5451     movl(Address(to, 4), value);
5452     addptr(to, 8);
5453     BIND(L_fill_8_bytes);
5454     subl(count, 1 << (shift + 1));
5455     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5456     // fall through to fill 4 bytes
5457   } else {
5458     Label L_fill_32_bytes;
5459     if (!UseUnalignedLoadStores) {
5460       // align to 8 bytes, we know we are 4 byte aligned to start
5461       testptr(to, 4);
5462       jccb(Assembler::zero, L_fill_32_bytes);
5463       movl(Address(to, 0), value);
5464       addptr(to, 4);
5465       subl(count, 1<<shift);
5466     }
5467     BIND(L_fill_32_bytes);
5468     {
5469       assert( UseSSE >= 2, "supported cpu only" );
5470       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5471       movdl(xtmp, value);
5472       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5473         Label L_check_fill_32_bytes;
5474         if (UseAVX > 2) {
5475           // Fill 64-byte chunks
5476           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5477 
5478           // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
5479           cmpl(count, AVX3Threshold);
5480           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5481 
5482           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5483 
5484           subl(count, 16 << shift);
5485           jccb(Assembler::less, L_check_fill_32_bytes);
5486           align(16);
5487 
5488           BIND(L_fill_64_bytes_loop_avx3);
5489           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5490           addptr(to, 64);
5491           subl(count, 16 << shift);
5492           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5493           jmpb(L_check_fill_32_bytes);
5494 
5495           BIND(L_check_fill_64_bytes_avx2);
5496         }
5497         // Fill 64-byte chunks
5498         Label L_fill_64_bytes_loop;
5499         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5500 
5501         subl(count, 16 << shift);
5502         jcc(Assembler::less, L_check_fill_32_bytes);
5503         align(16);
5504 
5505         BIND(L_fill_64_bytes_loop);
5506         vmovdqu(Address(to, 0), xtmp);
5507         vmovdqu(Address(to, 32), xtmp);
5508         addptr(to, 64);
5509         subl(count, 16 << shift);
5510         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5511 
5512         BIND(L_check_fill_32_bytes);
5513         addl(count, 8 << shift);
5514         jccb(Assembler::less, L_check_fill_8_bytes);
5515         vmovdqu(Address(to, 0), xtmp);
5516         addptr(to, 32);
5517         subl(count, 8 << shift);
5518 
5519         BIND(L_check_fill_8_bytes);
5520         // clean upper bits of YMM registers
5521         movdl(xtmp, value);
5522         pshufd(xtmp, xtmp, 0);
5523       } else {
5524         // Fill 32-byte chunks
5525         pshufd(xtmp, xtmp, 0);
5526 
5527         subl(count, 8 << shift);
5528         jcc(Assembler::less, L_check_fill_8_bytes);
5529         align(16);
5530 
5531         BIND(L_fill_32_bytes_loop);
5532 
5533         if (UseUnalignedLoadStores) {
5534           movdqu(Address(to, 0), xtmp);
5535           movdqu(Address(to, 16), xtmp);
5536         } else {
5537           movq(Address(to, 0), xtmp);
5538           movq(Address(to, 8), xtmp);
5539           movq(Address(to, 16), xtmp);
5540           movq(Address(to, 24), xtmp);
5541         }
5542 
5543         addptr(to, 32);
5544         subl(count, 8 << shift);
5545         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5546 
5547         BIND(L_check_fill_8_bytes);
5548       }
5549       addl(count, 8 << shift);
5550       jccb(Assembler::zero, L_exit);
5551       jmpb(L_fill_8_bytes);
5552 
5553       //
5554       // length is too short, just fill qwords
5555       //
5556       BIND(L_fill_8_bytes_loop);
5557       movq(Address(to, 0), xtmp);
5558       addptr(to, 8);
5559       BIND(L_fill_8_bytes);
5560       subl(count, 1 << (shift + 1));
5561       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5562     }
5563   }
5564   // fill trailing 4 bytes
5565   BIND(L_fill_4_bytes);
5566   testl(count, 1<<shift);
5567   jccb(Assembler::zero, L_fill_2_bytes);
5568   movl(Address(to, 0), value);
5569   if (t == T_BYTE || t == T_SHORT) {
5570     Label L_fill_byte;
5571     addptr(to, 4);
5572     BIND(L_fill_2_bytes);
5573     // fill trailing 2 bytes
5574     testl(count, 1<<(shift-1));
5575     jccb(Assembler::zero, L_fill_byte);
5576     movw(Address(to, 0), value);
5577     if (t == T_BYTE) {
5578       addptr(to, 2);
5579       BIND(L_fill_byte);
5580       // fill trailing byte
5581       testl(count, 1);
5582       jccb(Assembler::zero, L_exit);
5583       movb(Address(to, 0), value);
5584     } else {
5585       BIND(L_fill_byte);
5586     }
5587   } else {
5588     BIND(L_fill_2_bytes);
5589   }
5590   BIND(L_exit);
5591 }
5592 
5593 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
5594   switch(type) {
5595     case T_BYTE:
5596     case T_BOOLEAN:
5597       evpbroadcastb(dst, src, vector_len);
5598       break;
5599     case T_SHORT:
5600     case T_CHAR:
5601       evpbroadcastw(dst, src, vector_len);
5602       break;
5603     case T_INT:
5604     case T_FLOAT:
5605       evpbroadcastd(dst, src, vector_len);
5606       break;
5607     case T_LONG:
5608     case T_DOUBLE:
5609       evpbroadcastq(dst, src, vector_len);
5610       break;
5611     default:
5612       fatal("Unhandled type : %s", type2name(type));
5613       break;
5614   }
5615 }
5616 
// encode char[] to byte[] in ISO_8859_1 or ASCII
//   @IntrinsicCandidate
//   private static int implEncodeISOArray(byte[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = StringUTF16.getChar(sa, sp++);
//       if (c > '\u00FF')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
//
//   @IntrinsicCandidate
//   private static int implEncodeAsciiArray(char[] sa, int sp,
//                                           byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = sa[sp++];
//       if (c >= '\u0080')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
5643 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5644   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5645   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5646   Register tmp5, Register result, bool ascii) {
5647 
5648   // rsi: src
5649   // rdi: dst
5650   // rdx: len
5651   // rcx: tmp5
5652   // rax: result
5653   ShortBranchVerifier sbv(this);
5654   assert_different_registers(src, dst, len, tmp5, result);
5655   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5656 
5657   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5658   int short_mask = ascii ? 0xff80 : 0xff00;
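  // A set bit under the mask flags a char outside the target range: 0xff00
  // catches chars above 0xFF (not Latin-1), while 0xff80 also catches
  // 0x80..0xFF so that only 7-bit ASCII passes.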
5659 
5660   // set result
5661   xorl(result, result);
5662   // check for zero length
5663   testl(len, len);
5664   jcc(Assembler::zero, L_done);
5665 
5666   movl(result, len);
5667 
5668   // Setup pointers
5669   lea(src, Address(src, len, Address::times_2)); // char[]
5670   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5671   negptr(len);
5672 
5673   if (UseSSE42Intrinsics || UseAVX >= 2) {
5674     Label L_copy_8_chars, L_copy_8_chars_exit;
5675     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5676 
5677     if (UseAVX >= 2) {
5678       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5679       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5680       movdl(tmp1Reg, tmp5);
5681       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5682       jmp(L_chars_32_check);
5683 
5684       bind(L_copy_32_chars);
5685       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5686       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5687       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5688       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5689       jccb(Assembler::notZero, L_copy_32_chars_exit);
5690       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5691       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5692       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5693 
5694       bind(L_chars_32_check);
5695       addptr(len, 32);
5696       jcc(Assembler::lessEqual, L_copy_32_chars);
5697 
5698       bind(L_copy_32_chars_exit);
5699       subptr(len, 16);
5700       jccb(Assembler::greater, L_copy_16_chars_exit);
5701 
5702     } else if (UseSSE42Intrinsics) {
5703       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5704       movdl(tmp1Reg, tmp5);
5705       pshufd(tmp1Reg, tmp1Reg, 0);
5706       jmpb(L_chars_16_check);
5707     }
5708 
5709     bind(L_copy_16_chars);
5710     if (UseAVX >= 2) {
5711       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5712       vptest(tmp2Reg, tmp1Reg);
5713       jcc(Assembler::notZero, L_copy_16_chars_exit);
5714       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5715       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5716     } else {
5717       if (UseAVX > 0) {
5718         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5719         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5720         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5721       } else {
5722         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5723         por(tmp2Reg, tmp3Reg);
5724         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5725         por(tmp2Reg, tmp4Reg);
5726       }
5727       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5728       jccb(Assembler::notZero, L_copy_16_chars_exit);
5729       packuswb(tmp3Reg, tmp4Reg);
5730     }
5731     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5732 
5733     bind(L_chars_16_check);
5734     addptr(len, 16);
5735     jcc(Assembler::lessEqual, L_copy_16_chars);
5736 
5737     bind(L_copy_16_chars_exit);
5738     if (UseAVX >= 2) {
5739       // clean upper bits of YMM registers
5740       vpxor(tmp2Reg, tmp2Reg);
5741       vpxor(tmp3Reg, tmp3Reg);
5742       vpxor(tmp4Reg, tmp4Reg);
5743       movdl(tmp1Reg, tmp5);
5744       pshufd(tmp1Reg, tmp1Reg, 0);
5745     }
5746     subptr(len, 8);
5747     jccb(Assembler::greater, L_copy_8_chars_exit);
5748 
5749     bind(L_copy_8_chars);
5750     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5751     ptest(tmp3Reg, tmp1Reg);
5752     jccb(Assembler::notZero, L_copy_8_chars_exit);
5753     packuswb(tmp3Reg, tmp1Reg);
5754     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5755     addptr(len, 8);
5756     jccb(Assembler::lessEqual, L_copy_8_chars);
5757 
5758     bind(L_copy_8_chars_exit);
5759     subptr(len, 8);
5760     jccb(Assembler::zero, L_done);
5761   }
5762 
5763   bind(L_copy_1_char);
5764   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5765   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
5766   jccb(Assembler::notZero, L_copy_1_char_exit);
5767   movb(Address(dst, len, Address::times_1, 0), tmp5);
5768   addptr(len, 1);
5769   jccb(Assembler::less, L_copy_1_char);
5770 
5771   bind(L_copy_1_char_exit);
5772   addptr(result, len); // len is negative count of not processed elements
5773 
5774   bind(L_done);
5775 }
5776 
5777 #ifdef _LP64
5778 /**
5779  * Helper for multiply_to_len().
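 * Adds src1 and src2 into the 128-bit value dest_hi:dest_lo, propagating any
 * carry out of the low word into the high word.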
5780  */
5781 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5782   addq(dest_lo, src1);
5783   adcq(dest_hi, 0);
5784   addq(dest_lo, src2);
5785   adcq(dest_hi, 0);
5786 }
5787 
5788 /**
5789  * Multiply 64 bit by 64 bit first loop.
5790  */
5791 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5792                                            Register y, Register y_idx, Register z,
5793                                            Register carry, Register product,
5794                                            Register idx, Register kdx) {
5795   //
5796   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5798   //    huge_128 product = y[idx] * x[xstart] + carry;
5799   //    z[kdx] = (jlong)product;
5800   //    carry  = (jlong)(product >>> 64);
5801   //  }
5802   //  z[xstart] = carry;
5803   //
5804 
5805   Label L_first_loop, L_first_loop_exit;
5806   Label L_one_x, L_one_y, L_multiply;
5807 
5808   decrementl(xstart);
5809   jcc(Assembler::negative, L_one_x);
5810 
5811   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5812   rorq(x_xstart, 32); // convert big-endian to little-endian
5813 
5814   bind(L_first_loop);
5815   decrementl(idx);
5816   jcc(Assembler::negative, L_first_loop_exit);
5817   decrementl(idx);
5818   jcc(Assembler::negative, L_one_y);
5819   movq(y_idx, Address(y, idx, Address::times_4,  0));
5820   rorq(y_idx, 32); // convert big-endian to little-endian
5821   bind(L_multiply);
5822   movq(product, x_xstart);
5823   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5824   addq(product, carry);
5825   adcq(rdx, 0);
5826   subl(kdx, 2);
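  // z[] is a big-endian int array: store the low 32 bits of the product at
  // z[kdx+1] and the high 32 bits at z[kdx].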
5827   movl(Address(z, kdx, Address::times_4,  4), product);
5828   shrq(product, 32);
5829   movl(Address(z, kdx, Address::times_4,  0), product);
5830   movq(carry, rdx);
5831   jmp(L_first_loop);
5832 
5833   bind(L_one_y);
5834   movl(y_idx, Address(y,  0));
5835   jmp(L_multiply);
5836 
5837   bind(L_one_x);
5838   movl(x_xstart, Address(x,  0));
5839   jmp(L_first_loop);
5840 
5841   bind(L_first_loop_exit);
5842 }
5843 
5844 /**
5845  * Multiply 64 bit by 64 bit and add 128 bit.
5846  */
5847 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5848                                             Register yz_idx, Register idx,
5849                                             Register carry, Register product, int offset) {
5850   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5851   //     z[kdx] = (jlong)product;
5852 
5853   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5854   rorq(yz_idx, 32); // convert big-endian to little-endian
5855   movq(product, x_xstart);
5856   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5857   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5858   rorq(yz_idx, 32); // convert big-endian to little-endian
5859 
5860   add2_with_carry(rdx, product, carry, yz_idx);
5861 
5862   movl(Address(z, idx, Address::times_4,  offset+4), product);
5863   shrq(product, 32);
5864   movl(Address(z, idx, Address::times_4,  offset), product);
5865 
5866 }
5867 
5868 /**
5869  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5870  */
5871 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5872                                              Register yz_idx, Register idx, Register jdx,
5873                                              Register carry, Register product,
5874                                              Register carry2) {
5875   //   jlong carry, x[], y[], z[];
5876   //   int kdx = ystart+1;
5877   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5878   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5879   //     z[kdx+idx+1] = (jlong)product;
5880   //     jlong carry2  = (jlong)(product >>> 64);
5881   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5882   //     z[kdx+idx] = (jlong)product;
5883   //     carry  = (jlong)(product >>> 64);
5884   //   }
5885   //   idx += 2;
5886   //   if (idx > 0) {
5887   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5888   //     z[kdx+idx] = (jlong)product;
5889   //     carry  = (jlong)(product >>> 64);
5890   //   }
5891   //
5892 
5893   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5894 
5895   movl(jdx, idx);
5896   andl(jdx, 0xFFFFFFFC);
5897   shrl(jdx, 2);
5898 
5899   bind(L_third_loop);
5900   subl(jdx, 1);
5901   jcc(Assembler::negative, L_third_loop_exit);
5902   subl(idx, 4);
5903 
5904   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5905   movq(carry2, rdx);
5906 
5907   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5908   movq(carry, rdx);
5909   jmp(L_third_loop);
5910 
5911   bind (L_third_loop_exit);
5912 
5913   andl (idx, 0x3);
5914   jcc(Assembler::zero, L_post_third_loop_done);
5915 
5916   Label L_check_1;
5917   subl(idx, 2);
5918   jcc(Assembler::negative, L_check_1);
5919 
5920   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5921   movq(carry, rdx);
5922 
5923   bind (L_check_1);
5924   addl (idx, 0x2);
5925   andl (idx, 0x1);
5926   subl(idx, 1);
5927   jcc(Assembler::negative, L_post_third_loop_done);
5928 
5929   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5930   movq(product, x_xstart);
5931   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5932   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5933 
5934   add2_with_carry(rdx, product, yz_idx, carry);
5935 
5936   movl(Address(z, idx, Address::times_4,  0), product);
5937   shrq(product, 32);
5938 
5939   shlq(rdx, 32);
5940   orq(product, rdx);
5941   movq(carry, product);
5942 
5943   bind(L_post_third_loop_done);
5944 }
5945 
5946 /**
5947  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5948  *
5949  */
5950 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
5951                                                   Register carry, Register carry2,
5952                                                   Register idx, Register jdx,
5953                                                   Register yz_idx1, Register yz_idx2,
5954                                                   Register tmp, Register tmp3, Register tmp4) {
5955   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
5956 
5957   //   jlong carry, x[], y[], z[];
5958   //   int kdx = ystart+1;
5959   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5960   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
5961   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5962   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
5963   //     carry  = (jlong)(tmp4 >>> 64);
5964   //     z[kdx+idx+1] = (jlong)tmp3;
5965   //     z[kdx+idx] = (jlong)tmp4;
5966   //   }
5967   //   idx += 2;
5968   //   if (idx > 0) {
5969   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
5970   //     z[kdx+idx] = (jlong)yz_idx1;
5971   //     carry  = (jlong)(yz_idx1 >>> 64);
5972   //   }
5973   //
5974 
5975   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5976 
5977   movl(jdx, idx);
5978   andl(jdx, 0xFFFFFFFC);
5979   shrl(jdx, 2);
5980 
5981   bind(L_third_loop);
5982   subl(jdx, 1);
5983   jcc(Assembler::negative, L_third_loop_exit);
5984   subl(idx, 4);
5985 
5986   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
5987   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5988   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
5989   rorxq(yz_idx2, yz_idx2, 32);
5990 
5991   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
5992   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
5993 
5994   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
5995   rorxq(yz_idx1, yz_idx1, 32);
5996   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5997   rorxq(yz_idx2, yz_idx2, 32);
5998 
5999   if (VM_Version::supports_adx()) {
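    // ADX provides two independent carry chains: adcx uses only CF and adox
    // uses only OF, so the two accumulations below can be interleaved.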
6000     adcxq(tmp3, carry);
6001     adoxq(tmp3, yz_idx1);
6002 
6003     adcxq(tmp4, tmp);
6004     adoxq(tmp4, yz_idx2);
6005 
6006     movl(carry, 0); // does not affect flags
6007     adcxq(carry2, carry);
6008     adoxq(carry2, carry);
6009   } else {
6010     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6011     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6012   }
6013   movq(carry, carry2);
6014 
6015   movl(Address(z, idx, Address::times_4, 12), tmp3);
6016   shrq(tmp3, 32);
6017   movl(Address(z, idx, Address::times_4,  8), tmp3);
6018 
6019   movl(Address(z, idx, Address::times_4,  4), tmp4);
6020   shrq(tmp4, 32);
6021   movl(Address(z, idx, Address::times_4,  0), tmp4);
6022 
6023   jmp(L_third_loop);
6024 
6025   bind (L_third_loop_exit);
6026 
6027   andl (idx, 0x3);
6028   jcc(Assembler::zero, L_post_third_loop_done);
6029 
6030   Label L_check_1;
6031   subl(idx, 2);
6032   jcc(Assembler::negative, L_check_1);
6033 
6034   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6035   rorxq(yz_idx1, yz_idx1, 32);
6036   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6037   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6038   rorxq(yz_idx2, yz_idx2, 32);
6039 
6040   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6041 
6042   movl(Address(z, idx, Address::times_4,  4), tmp3);
6043   shrq(tmp3, 32);
6044   movl(Address(z, idx, Address::times_4,  0), tmp3);
6045   movq(carry, tmp4);
6046 
6047   bind (L_check_1);
6048   addl (idx, 0x2);
6049   andl (idx, 0x1);
6050   subl(idx, 1);
6051   jcc(Assembler::negative, L_post_third_loop_done);
6052   movl(tmp4, Address(y, idx, Address::times_4,  0));
6053   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6054   movl(tmp4, Address(z, idx, Address::times_4,  0));
6055 
6056   add2_with_carry(carry2, tmp3, tmp4, carry);
6057 
6058   movl(Address(z, idx, Address::times_4,  0), tmp3);
6059   shrq(tmp3, 32);
6060 
6061   shlq(carry2, 32);
6062   orq(tmp3, carry2);
6063   movq(carry, tmp3);
6064 
6065   bind(L_post_third_loop_done);
6066 }
6067 
6068 /**
6069  * Code for BigInteger::multiplyToLen() intrinsic.
6070  *
6071  * rdi: x
6072  * rax: xlen
6073  * rsi: y
6074  * rcx: ylen
6075  * r8:  z
6076  * r11: zlen
6077  * r12: tmp1
6078  * r13: tmp2
6079  * r14: tmp3
6080  * r15: tmp4
6081  * rbx: tmp5
6082  *
6083  */
6084 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6085                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6086   ShortBranchVerifier sbv(this);
6087   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6088 
6089   push(tmp1);
6090   push(tmp2);
6091   push(tmp3);
6092   push(tmp4);
6093   push(tmp5);
6094 
6095   push(xlen);
6096   push(zlen);
6097 
6098   const Register idx = tmp1;
6099   const Register kdx = tmp2;
6100   const Register xstart = tmp3;
6101 
6102   const Register y_idx = tmp4;
6103   const Register carry = tmp5;
6104   const Register product  = xlen;
6105   const Register x_xstart = zlen;  // reuse register
6106 
6107   // First Loop.
6108   //
6109   //  final static long LONG_MASK = 0xffffffffL;
6110   //  int xstart = xlen - 1;
6111   //  int ystart = ylen - 1;
6112   //  long carry = 0;
6113   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6114   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6115   //    z[kdx] = (int)product;
6116   //    carry = product >>> 32;
6117   //  }
6118   //  z[xstart] = (int)carry;
6119   //
6120 
6121   movl(idx, ylen);      // idx = ylen;
6122   movl(kdx, zlen);      // kdx = xlen+ylen;
6123   xorq(carry, carry);   // carry = 0;
6124 
6125   Label L_done;
6126 
6127   movl(xstart, xlen);
6128   decrementl(xstart);
6129   jcc(Assembler::negative, L_done);
6130 
6131   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6132 
6133   Label L_second_loop;
6134   testl(kdx, kdx);
6135   jcc(Assembler::zero, L_second_loop);
6136 
6137   Label L_carry;
6138   subl(kdx, 1);
6139   jcc(Assembler::zero, L_carry);
6140 
6141   movl(Address(z, kdx, Address::times_4,  0), carry);
6142   shrq(carry, 32);
6143   subl(kdx, 1);
6144 
6145   bind(L_carry);
6146   movl(Address(z, kdx, Address::times_4,  0), carry);
6147 
6148   // Second and third (nested) loops.
6149   //
6150   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6151   //   carry = 0;
6152   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6153   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6154   //                    (z[k] & LONG_MASK) + carry;
6155   //     z[k] = (int)product;
6156   //     carry = product >>> 32;
6157   //   }
6158   //   z[i] = (int)carry;
6159   // }
6160   //
6161   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6162 
6163   const Register jdx = tmp1;
6164 
6165   bind(L_second_loop);
6166   xorl(carry, carry);    // carry = 0;
6167   movl(jdx, ylen);       // j = ystart+1
6168 
6169   subl(xstart, 1);       // i = xstart-1;
6170   jcc(Assembler::negative, L_done);
6171 
6172   push (z);
6173 
6174   Label L_last_x;
6175   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6176   subl(xstart, 1);       // i = xstart-1;
6177   jcc(Assembler::negative, L_last_x);
6178 
6179   if (UseBMI2Instructions) {
6180     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6181     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6182   } else {
6183     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6184     rorq(x_xstart, 32);  // convert big-endian to little-endian
6185   }
6186 
6187   Label L_third_loop_prologue;
6188   bind(L_third_loop_prologue);
6189 
6190   push (x);
6191   push (xstart);
6192   push (ylen);
6193 
6194 
6195   if (UseBMI2Instructions) {
6196     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6197   } else { // !UseBMI2Instructions
6198     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6199   }
6200 
6201   pop(ylen);
6202   pop(xlen);
6203   pop(x);
6204   pop(z);
6205 
6206   movl(tmp3, xlen);
6207   addl(tmp3, 1);
6208   movl(Address(z, tmp3, Address::times_4,  0), carry);
6209   subl(tmp3, 1);
6210   jccb(Assembler::negative, L_done);
6211 
6212   shrq(carry, 32);
6213   movl(Address(z, tmp3, Address::times_4,  0), carry);
6214   jmp(L_second_loop);
6215 
6216   // Next infrequent code is moved outside loops.
6217   bind(L_last_x);
6218   if (UseBMI2Instructions) {
6219     movl(rdx, Address(x,  0));
6220   } else {
6221     movl(x_xstart, Address(x,  0));
6222   }
6223   jmp(L_third_loop_prologue);
6224 
6225   bind(L_done);
6226 
6227   pop(zlen);
6228   pop(xlen);
6229 
6230   pop(tmp5);
6231   pop(tmp4);
6232   pop(tmp3);
6233   pop(tmp2);
6234   pop(tmp1);
6235 }
6236 
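/**
 * Helper for the ArraysSupport.vectorizedMismatch() intrinsic.
 *
 * A rough Java-level sketch of the contract implemented below (illustrative
 * only; elements are 1 << log2_array_indxscale bytes wide):
 *
 *   for (int i = 0; i < length; i++) {
 *     if (a[i] != b[i]) return i;   // index of the first mismatching element
 *   }
 *   return -1;                      // no mismatch over the given length
 */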
6237 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6238   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6239   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6240   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6241   Label VECTOR8_TAIL, VECTOR4_TAIL;
6242   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6243   Label SAME_TILL_END, DONE;
6244   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6245 
6246   //scale is in rcx in both Win64 and Unix
6247   ShortBranchVerifier sbv(this);
6248 
6249   shlq(length);
6250   xorq(result, result);
6251 
6252   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6253       VM_Version::supports_avx512vlbw()) {
6254     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6255 
6256     cmpq(length, 64);
6257     jcc(Assembler::less, VECTOR32_TAIL);
6258 
6259     movq(tmp1, length);
6260     andq(tmp1, 0x3F);      // tail count
6261     andq(length, ~(0x3F)); //vector count
6262 
6263     bind(VECTOR64_LOOP);
6264     // AVX512 code to compare 64 byte vectors.
6265     evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
6266     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6267     kortestql(k7, k7);
6268     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6269     addq(result, 64);
6270     subq(length, 64);
6271     jccb(Assembler::notZero, VECTOR64_LOOP);
6272 
6274     testq(tmp1, tmp1);
6275     jcc(Assembler::zero, SAME_TILL_END);
6276 
6277     //bind(VECTOR64_TAIL);
6278     // AVX512 code to compare up to 63 byte vectors.
6279     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6280     shlxq(tmp2, tmp2, tmp1);
6281     notq(tmp2);
6282     kmovql(k3, tmp2);
6283 
6284     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6285     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6286 
6287     ktestql(k7, k3);
6288     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6289 
6290     bind(VECTOR64_NOT_EQUAL);
6291     kmovql(tmp1, k7);
6292     notq(tmp1);
6293     tzcntq(tmp1, tmp1);
6294     addq(result, tmp1);
6295     shrq(result);
6296     jmp(DONE);
6297     bind(VECTOR32_TAIL);
6298   }
6299 
6300   cmpq(length, 8);
6301   jcc(Assembler::equal, VECTOR8_LOOP);
6302   jcc(Assembler::less, VECTOR4_TAIL);
6303 
6304   if (UseAVX >= 2) {
6305     Label VECTOR16_TAIL, VECTOR32_LOOP;
6306 
6307     cmpq(length, 16);
6308     jcc(Assembler::equal, VECTOR16_LOOP);
6309     jcc(Assembler::less, VECTOR8_LOOP);
6310 
6311     cmpq(length, 32);
6312     jccb(Assembler::less, VECTOR16_TAIL);
6313 
6314     subq(length, 32);
6315     bind(VECTOR32_LOOP);
6316     vmovdqu(rymm0, Address(obja, result));
6317     vmovdqu(rymm1, Address(objb, result));
6318     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6319     vptest(rymm2, rymm2);
6320     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6321     addq(result, 32);
6322     subq(length, 32);
6323     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6324     addq(length, 32);
6325     jcc(Assembler::equal, SAME_TILL_END);
6326     // falling through if less than 32 bytes left; close the branch here.
6327 
6328     bind(VECTOR16_TAIL);
6329     cmpq(length, 16);
6330     jccb(Assembler::less, VECTOR8_TAIL);
6331     bind(VECTOR16_LOOP);
6332     movdqu(rymm0, Address(obja, result));
6333     movdqu(rymm1, Address(objb, result));
6334     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6335     ptest(rymm2, rymm2);
6336     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6337     addq(result, 16);
6338     subq(length, 16);
6339     jcc(Assembler::equal, SAME_TILL_END);
6340     //falling through if less than 16 bytes left
6341   } else {//regular intrinsics
6342 
6343     cmpq(length, 16);
6344     jccb(Assembler::less, VECTOR8_TAIL);
6345 
6346     subq(length, 16);
6347     bind(VECTOR16_LOOP);
6348     movdqu(rymm0, Address(obja, result));
6349     movdqu(rymm1, Address(objb, result));
6350     pxor(rymm0, rymm1);
6351     ptest(rymm0, rymm0);
6352     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6353     addq(result, 16);
6354     subq(length, 16);
6355     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6356     addq(length, 16);
6357     jcc(Assembler::equal, SAME_TILL_END);
6358     //falling through if less than 16 bytes left
6359   }
6360 
6361   bind(VECTOR8_TAIL);
6362   cmpq(length, 8);
6363   jccb(Assembler::less, VECTOR4_TAIL);
6364   bind(VECTOR8_LOOP);
6365   movq(tmp1, Address(obja, result));
6366   movq(tmp2, Address(objb, result));
6367   xorq(tmp1, tmp2);
6368   testq(tmp1, tmp1);
6369   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6370   addq(result, 8);
6371   subq(length, 8);
6372   jcc(Assembler::equal, SAME_TILL_END);
6373   //falling through if less than 8 bytes left
6374 
6375   bind(VECTOR4_TAIL);
6376   cmpq(length, 4);
6377   jccb(Assembler::less, BYTES_TAIL);
6378   bind(VECTOR4_LOOP);
6379   movl(tmp1, Address(obja, result));
6380   xorl(tmp1, Address(objb, result));
6381   testl(tmp1, tmp1);
6382   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6383   addq(result, 4);
6384   subq(length, 4);
6385   jcc(Assembler::equal, SAME_TILL_END);
6386   //falling through if less than 4 bytes left
6387 
6388   bind(BYTES_TAIL);
6389   bind(BYTES_LOOP);
6390   load_unsigned_byte(tmp1, Address(obja, result));
6391   load_unsigned_byte(tmp2, Address(objb, result));
6392   xorl(tmp1, tmp2);
6393   testl(tmp1, tmp1);
6394   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6395   decq(length);
6396   jcc(Assembler::zero, SAME_TILL_END);
6397   incq(result);
6398   load_unsigned_byte(tmp1, Address(obja, result));
6399   load_unsigned_byte(tmp2, Address(objb, result));
6400   xorl(tmp1, tmp2);
6401   testl(tmp1, tmp1);
6402   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6403   decq(length);
6404   jcc(Assembler::zero, SAME_TILL_END);
6405   incq(result);
6406   load_unsigned_byte(tmp1, Address(obja, result));
6407   load_unsigned_byte(tmp2, Address(objb, result));
6408   xorl(tmp1, tmp2);
6409   testl(tmp1, tmp1);
6410   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6411   jmp(SAME_TILL_END);
6412 
6413   if (UseAVX >= 2) {
6414     bind(VECTOR32_NOT_EQUAL);
6415     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6416     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6417     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6418     vpmovmskb(tmp1, rymm0);
6419     bsfq(tmp1, tmp1);
6420     addq(result, tmp1);
6421     shrq(result);
6422     jmp(DONE);
6423   }
6424 
6425   bind(VECTOR16_NOT_EQUAL);
6426   if (UseAVX >= 2) {
6427     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6428     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6429     pxor(rymm0, rymm2);
6430   } else {
6431     pcmpeqb(rymm2, rymm2);
6432     pxor(rymm0, rymm1);
6433     pcmpeqb(rymm0, rymm1);
6434     pxor(rymm0, rymm2);
6435   }
6436   pmovmskb(tmp1, rymm0);
6437   bsfq(tmp1, tmp1);
6438   addq(result, tmp1);
6439   shrq(result);
6440   jmpb(DONE);
6441 
6442   bind(VECTOR8_NOT_EQUAL);
6443   bind(VECTOR4_NOT_EQUAL);
6444   bsfq(tmp1, tmp1);
6445   shrq(tmp1, 3);
6446   addq(result, tmp1);
6447   bind(BYTES_NOT_EQUAL);
6448   shrq(result);
6449   jmpb(DONE);
6450 
6451   bind(SAME_TILL_END);
6452   mov64(result, -1);
6453 
6454   bind(DONE);
6455 }
6456 
6457 //Helper functions for square_to_len()
6458 
6459 /**
6460  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6461  * Preserves x and z and modifies rest of the registers.
6462  */
6463 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6464   // Perform square and right shift by 1
6465   // Handle odd xlen case first, then for even xlen do the following
6466   // jlong carry = 0;
6467   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6468   //     huge_128 product = x[j:j+1] * x[j:j+1];
6469   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6470   //     z[i+2:i+3] = (jlong)(product >>> 1);
6471   //     carry = (jlong)product;
6472   // }
6473 
6474   xorq(tmp5, tmp5);     // carry
6475   xorq(rdxReg, rdxReg);
6476   xorl(tmp1, tmp1);     // index for x
6477   xorl(tmp4, tmp4);     // index for z
6478 
6479   Label L_first_loop, L_first_loop_exit;
6480 
6481   testl(xlen, 1);
6482   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6483 
6484   // Square and right shift by 1 the odd element using 32 bit multiply
6485   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6486   imulq(raxReg, raxReg);
6487   shrq(raxReg, 1);
6488   adcq(tmp5, 0);
6489   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6490   incrementl(tmp1);
6491   addl(tmp4, 2);
6492 
6493   // Square and  right shift by 1 the rest using 64 bit multiply
6494   bind(L_first_loop);
6495   cmpptr(tmp1, xlen);
6496   jccb(Assembler::equal, L_first_loop_exit);
6497 
6498   // Square
6499   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6500   rorq(raxReg, 32);    // convert big-endian to little-endian
6501   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6502 
6503   // Right shift by 1 and save carry
6504   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6505   rcrq(rdxReg, 1);
6506   rcrq(raxReg, 1);
6507   adcq(tmp5, 0);
6508 
6509   // Store result in z
6510   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6511   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6512 
6513   // Update indices for x and z
6514   addl(tmp1, 2);
6515   addl(tmp4, 4);
6516   jmp(L_first_loop);
6517 
6518   bind(L_first_loop_exit);
6519 }
6520 
6521 
6522 /**
6523  * Perform the following multiply add operation using BMI2 instructions
6524  * carry:sum = sum + op1*op2 + carry
6525  * op2 should be in rdx
6526  * op2 is preserved, all other registers are modified
6527  */
6528 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6529   // assert op2 is rdx
6530   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6531   addq(sum, carry);       // sum += carry
6532   adcq(tmp2, 0);          // fold the carry-out into the high half
6533   addq(sum, op1);         // sum += low 64 bits of op1*op2
6534   adcq(tmp2, 0);          // fold the carry-out into the high half
6535   movq(carry, tmp2);      // the high 64 bits become the new carry
6536 }
6537 
6538 /**
6539  * Perform the following multiply add operation:
6540  * carry:sum = sum + op1*op2 + carry
6541  * Preserves op1, op2 and modifies rest of registers
6542  */
6543 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6544   // rdx:rax = op1 * op2
6545   movq(raxReg, op2);
6546   mulq(op1);
6547 
6548   //  rdx:rax = sum + carry + rdx:rax
6549   addq(sum, carry);
6550   adcq(rdxReg, 0);
6551   addq(sum, raxReg);
6552   adcq(rdxReg, 0);
6553 
6554   // carry:sum = rdx:sum
6555   movq(carry, rdxReg);
6556 }
6557 
6558 /**
6559  * Add 64 bit long carry into z[] with carry propagation.
6560  * Preserves z and carry register values and modifies rest of registers.
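 *
 * Conceptually (an illustrative sketch): add carry into the 64-bit slice at
 * z[zlen-2..zlen-1], then, while the addition carries out, keep adding 1 into
 * the next more-significant 64-bit slice of z[].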
6561  *
6562  */
6563 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6564   Label L_fourth_loop, L_fourth_loop_exit;
6565 
6566   movl(tmp1, 1);
6567   subl(zlen, 2);
6568   addq(Address(z, zlen, Address::times_4, 0), carry);
6569 
6570   bind(L_fourth_loop);
6571   jccb(Assembler::carryClear, L_fourth_loop_exit);
6572   subl(zlen, 2);
6573   jccb(Assembler::negative, L_fourth_loop_exit);
6574   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6575   jmp(L_fourth_loop);
6576   bind(L_fourth_loop_exit);
6577 }
6578 
6579 /**
6580  * Shift z[] left by 1 bit.
6581  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6582  *
6583  */
6584 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6585 
6586   Label L_fifth_loop, L_fifth_loop_exit;
6587 
6588   // Fifth loop
6589   // Perform primitiveLeftShift(z, zlen, 1)
6590 
6591   const Register prev_carry = tmp1;
6592   const Register new_carry = tmp4;
6593   const Register value = tmp2;
6594   const Register zidx = tmp3;
6595 
6596   // int zidx, carry;
6597   // long value;
6598   // carry = 0;
6599   // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
6600   //    (carry:value)  = (z[zidx] << 1) | carry ;
6601   //    z[zidx] = value;
6602   // }
6603 
6604   movl(zidx, zlen);
6605   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6606 
6607   bind(L_fifth_loop);
6608   decl(zidx);  // Use decl to preserve carry flag
6609   decl(zidx);
6610   jccb(Assembler::negative, L_fifth_loop_exit);
6611 
6612   if (UseBMI2Instructions) {
6613      movq(value, Address(z, zidx, Address::times_4, 0));
6614      rclq(value, 1);
6615      rorxq(value, value, 32);
6616      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6617   }
6618   else {
6619     // clear new_carry
6620     xorl(new_carry, new_carry);
6621 
6622     // Shift z[i] by 1, or in previous carry and save new carry
6623     movq(value, Address(z, zidx, Address::times_4, 0));
6624     shlq(value, 1);
6625     adcl(new_carry, 0);
6626 
6627     orq(value, prev_carry);
6628     rorq(value, 0x20);
6629     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6630 
6631     // Set previous carry = new carry
6632     movl(prev_carry, new_carry);
6633   }
6634   jmp(L_fifth_loop);
6635 
6636   bind(L_fifth_loop_exit);
6637 }
6638 
6639 
6640 /**
6641  * Code for BigInteger::squareToLen() intrinsic
6642  *
6643  * rdi: x
6644  * rsi: len
6645  * r8:  z
6646  * rcx: zlen
6647  * r12: tmp1
6648  * r13: tmp2
6649  * r14: tmp3
6650  * r15: tmp4
6651  * rbx: tmp5
6652  *
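 * Overall scheme (a sketch, mirroring BigInteger.squareToLen): store the
 * squares of x[] right-shifted by one bit, add in the off-diagonal products
 * x[i]*x[j] (i < j), shift all of z[] left by one bit to restore the factor
 * of two on the cross terms, and finally OR the low bit of x[len-1] back
 * into z[zlen-1].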
6653  */
6654 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6655 
6656   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6657   push(tmp1);
6658   push(tmp2);
6659   push(tmp3);
6660   push(tmp4);
6661   push(tmp5);
6662 
6663   // First loop
6664   // Store the squares, right shifted one bit (i.e., divided by 2).
6665   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6666 
6667   // Add in off-diagonal sums.
6668   //
6669   // Second, third (nested) and fourth loops.
6670   // zlen +=2;
6671   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6672   //    carry = 0;
6673   //    long op2 = x[xidx:xidx+1];
6674   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6675   //       k -= 2;
6676   //       long op1 = x[j:j+1];
6677   //       long sum = z[k:k+1];
6678   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6679   //       z[k:k+1] = sum;
6680   //    }
6681   //    add_one_64(z, k, carry, tmp_regs);
6682   // }
6683 
6684   const Register carry = tmp5;
6685   const Register sum = tmp3;
6686   const Register op1 = tmp4;
6687   Register op2 = tmp2;
6688 
6689   push(zlen);
6690   push(len);
6691   addl(zlen,2);
6692   bind(L_second_loop);
6693   xorq(carry, carry);
6694   subl(zlen, 4);
6695   subl(len, 2);
6696   push(zlen);
6697   push(len);
6698   cmpl(len, 0);
6699   jccb(Assembler::lessEqual, L_second_loop_exit);
6700 
6701   // Multiply an array by one 64 bit long.
6702   if (UseBMI2Instructions) {
6703     op2 = rdxReg;
6704     movq(op2, Address(x, len, Address::times_4,  0));
6705     rorxq(op2, op2, 32);
6706   }
6707   else {
6708     movq(op2, Address(x, len, Address::times_4,  0));
6709     rorq(op2, 32);
6710   }
6711 
6712   bind(L_third_loop);
6713   decrementl(len);
6714   jccb(Assembler::negative, L_third_loop_exit);
6715   decrementl(len);
6716   jccb(Assembler::negative, L_last_x);
6717 
6718   movq(op1, Address(x, len, Address::times_4,  0));
6719   rorq(op1, 32);
6720 
6721   bind(L_multiply);
6722   subl(zlen, 2);
6723   movq(sum, Address(z, zlen, Address::times_4,  0));
6724 
6725   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
6726   if (UseBMI2Instructions) {
6727     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6728   }
6729   else {
6730     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6731   }
6732 
6733   movq(Address(z, zlen, Address::times_4, 0), sum);
6734 
6735   jmp(L_third_loop);
6736   bind(L_third_loop_exit);
6737 
6738   // Fourth loop
6739   // Add 64 bit long carry into z with carry propagation.
6740   // Uses offsetted zlen.
6741   add_one_64(z, zlen, carry, tmp1);
6742 
6743   pop(len);
6744   pop(zlen);
6745   jmp(L_second_loop);
6746 
6747   // Next infrequent code is moved outside loops.
6748   bind(L_last_x);
6749   movl(op1, Address(x, 0));
6750   jmp(L_multiply);
6751 
6752   bind(L_second_loop_exit);
6753   pop(len);
6754   pop(zlen);
6755   pop(len);
6756   pop(zlen);
6757 
6758   // Fifth loop
6759   // Shift z left 1 bit.
6760   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6761 
6762   // z[zlen-1] |= x[len-1] & 1;
6763   movl(tmp3, Address(x, len, Address::times_4, -4));
6764   andl(tmp3, 1);
6765   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6766 
6767   pop(tmp5);
6768   pop(tmp4);
6769   pop(tmp3);
6770   pop(tmp2);
6771   pop(tmp1);
6772 }
6773 
6774 /**
6775  * Helper function for mul_add()
6776  * Multiply the in[] by int k and add to out[] starting at offset offs using
6777  * 128 bit by 32 bit multiply and return the carry in tmp5.
6778  * Only quad int aligned length of in[] is operated on in this function.
6779  * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
6780  * This function preserves out, in and k registers.
6781  * len and offset point to the appropriate index in "in" and "out" respectively.
6782  * tmp5 has the carry.
6783  * other registers are temporary and are modified.
6784  *
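 * Roughly (an illustrative sketch), each iteration consumes four ints of in[]
 * and the matching four ints of out[] as two big-endian 64-bit halves,
 * computes out += in * k + carry via multiply_add_64[_bmi2], and threads the
 * running carry through tmp5.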
6785  */
6786 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6787   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6788   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6789 
6790   Label L_first_loop, L_first_loop_exit;
6791 
6792   movl(tmp1, len);
6793   shrl(tmp1, 2);
6794 
6795   bind(L_first_loop);
6796   subl(tmp1, 1);
6797   jccb(Assembler::negative, L_first_loop_exit);
6798 
6799   subl(len, 4);
6800   subl(offset, 4);
6801 
6802   Register op2 = tmp2;
6803   const Register sum = tmp3;
6804   const Register op1 = tmp4;
6805   const Register carry = tmp5;
6806 
6807   if (UseBMI2Instructions) {
6808     op2 = rdxReg;
6809   }
6810 
6811   movq(op1, Address(in, len, Address::times_4,  8));
6812   rorq(op1, 32);
6813   movq(sum, Address(out, offset, Address::times_4,  8));
6814   rorq(sum, 32);
6815   if (UseBMI2Instructions) {
6816     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6817   }
6818   else {
6819     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6820   }
6821   // Store back in big endian from little endian
6822   rorq(sum, 0x20);
6823   movq(Address(out, offset, Address::times_4,  8), sum);
6824 
6825   movq(op1, Address(in, len, Address::times_4,  0));
6826   rorq(op1, 32);
6827   movq(sum, Address(out, offset, Address::times_4,  0));
6828   rorq(sum, 32);
6829   if (UseBMI2Instructions) {
6830     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6831   }
6832   else {
6833     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6834   }
6835   // Store back in big endian from little endian
6836   rorq(sum, 0x20);
6837   movq(Address(out, offset, Address::times_4,  0), sum);
6838 
6839   jmp(L_first_loop);
6840   bind(L_first_loop_exit);
6841 }
6842 
6843 /**
6844  * Code for BigInteger::mulAdd() intrinsic
6845  *
6846  * rdi: out
6847  * rsi: in
6848  * r11: offs (out.length - offset)
6849  * rcx: len
6850  * r8:  k
6851  * r12: tmp1
6852  * r13: tmp2
6853  * r14: tmp3
6854  * r15: tmp4
6855  * rbx: tmp5
6856  * Multiply the in[] by word k and add to out[], return the carry in rax
6857  */
6858 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6859    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6860    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6861 
6862   Label L_carry, L_last_in, L_done;
6863 
6864 // carry = 0;
6865 // for (int j=len-1; j >= 0; j--) {
6866 //    long product = (in[j] & LONG_MASK) * kLong +
6867 //                   (out[offs] & LONG_MASK) + carry;
6868 //    out[offs--] = (int)product;
6869 //    carry = product >>> 32;
6870 // }
6871 //
6872   push(tmp1);
6873   push(tmp2);
6874   push(tmp3);
6875   push(tmp4);
6876   push(tmp5);
6877 
6878   Register op2 = tmp2;
6879   const Register sum = tmp3;
6880   const Register op1 = tmp4;
6881   const Register carry =  tmp5;
6882 
6883   if (UseBMI2Instructions) {
6884     op2 = rdxReg;
6885     movl(op2, k);
6886   }
6887   else {
6888     movl(op2, k);
6889   }
6890 
6891   xorq(carry, carry);
6892 
6893   //First loop
6894 
6895   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6896   //The carry is in tmp5
6897   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6898 
6899   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6900   decrementl(len);
6901   jccb(Assembler::negative, L_carry);
6902   decrementl(len);
6903   jccb(Assembler::negative, L_last_in);
6904 
6905   movq(op1, Address(in, len, Address::times_4,  0));
6906   rorq(op1, 32);
6907 
6908   subl(offs, 2);
6909   movq(sum, Address(out, offs, Address::times_4,  0));
6910   rorq(sum, 32);
6911 
6912   if (UseBMI2Instructions) {
6913     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6914   }
6915   else {
6916     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6917   }
6918 
6919   // Store back in big endian from little endian
6920   rorq(sum, 0x20);
6921   movq(Address(out, offs, Address::times_4,  0), sum);
6922 
6923   testl(len, len);
6924   jccb(Assembler::zero, L_carry);
6925 
6926   //Multiply the last in[] entry, if any
6927   bind(L_last_in);
6928   movl(op1, Address(in, 0));
6929   movl(sum, Address(out, offs, Address::times_4,  -4));
6930 
6931   movl(raxReg, k);
6932   mull(op1); //tmp4 * eax -> edx:eax
6933   addl(sum, carry);
6934   adcl(rdxReg, 0);
6935   addl(sum, raxReg);
6936   adcl(rdxReg, 0);
6937   movl(carry, rdxReg);
6938 
6939   movl(Address(out, offs, Address::times_4,  -4), sum);
6940 
6941   bind(L_carry);
6942   //return tmp5/carry as carry in rax
6943   movl(rax, carry);
6944 
6945   bind(L_done);
6946   pop(tmp5);
6947   pop(tmp4);
6948   pop(tmp3);
6949   pop(tmp2);
6950   pop(tmp1);
6951 }
6952 #endif
6953 
6954 /**
6955  * Emits code to update CRC-32 with a byte value according to constants in table
6956  *
6957  * @param [in,out]crc   Register containing the crc.
6958  * @param [in]val       Register containing the byte to fold into the CRC.
6959  * @param [in]table     Register containing the table of crc constants.
6960  *
6961  * uint32_t crc;
6962  * val = crc_table[(val ^ crc) & 0xFF];
6963  * crc = val ^ (crc >> 8);
6964  *
6965  */
6966 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6967   xorl(val, crc);
6968   andl(val, 0xFF);
6969   shrl(crc, 8); // unsigned shift
6970   xorl(crc, Address(table, val, Address::times_4, 0));
6971 }
6972 
6973 /**
6974  * Fold 128-bit data chunk
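 *
 * A sketch of the fold step (described loosely): each 64-bit half of the
 * 128-bit accumulator xcrc is carry-less multiplied by the matching half of
 * the constant xK (a power of x modulo the CRC polynomial), and the two
 * products are xored together with the next 16 bytes of input.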
6975  */
6976 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
6977   if (UseAVX > 0) {
6978     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
6979     vpclmulldq(xcrc, xK, xcrc); // [63:0]
6980     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
6981     pxor(xcrc, xtmp);
6982   } else {
6983     movdqa(xtmp, xcrc);
6984     pclmulhdq(xtmp, xK);   // [123:64]
6985     pclmulldq(xcrc, xK);   // [63:0]
6986     pxor(xcrc, xtmp);
6987     movdqu(xtmp, Address(buf, offset));
6988     pxor(xcrc, xtmp);
6989   }
6990 }
6991 
6992 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
6993   if (UseAVX > 0) {
6994     vpclmulhdq(xtmp, xK, xcrc);
6995     vpclmulldq(xcrc, xK, xcrc);
6996     pxor(xcrc, xbuf);
6997     pxor(xcrc, xtmp);
6998   } else {
6999     movdqa(xtmp, xcrc);
7000     pclmulhdq(xtmp, xK);
7001     pclmulldq(xcrc, xK);
7002     pxor(xcrc, xbuf);
7003     pxor(xcrc, xtmp);
7004   }
7005 }
7006 
7007 /**
7008  * 8-bit folds to compute 32-bit CRC
7009  *
7010  * uint64_t xcrc;
7011  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7012  */
7013 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7014   movdl(tmp, xcrc);
7015   andl(tmp, 0xFF);
7016   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7017   psrldq(xcrc, 1); // unsigned shift one byte
7018   pxor(xcrc, xtmp);
7019 }
7020 
7021 /**
7022  * uint32_t crc;
7023  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7024  */
7025 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7026   movl(tmp, crc);
7027   andl(tmp, 0xFF);
7028   shrl(crc, 8);
7029   xorl(crc, Address(table, tmp, Address::times_4, 0));
7030 }
7031 
7032 /**
7033  * @param crc   register containing existing CRC (32-bit)
7034  * @param buf   register pointing to input byte buffer (byte*)
7035  * @param len   register containing number of bytes
7036  * @param table register that will contain address of CRC table
7037  * @param tmp   scratch register
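 *
 * Overall flow (a summary of the code below): the buffer is first byte-aligned
 * to 16, then 512 bits are folded per iteration across four 128-bit streams
 * using carry-less multiplication, the four streams are folded back into one
 * 128-bit value, eight 8-bit table folds collapse that into the 32-bit crc,
 * and any remaining tail bytes are processed one at a time.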
7038  */
7039 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7040   assert_different_registers(crc, buf, len, table, tmp, rax);
7041 
7042   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7043   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7044 
7045   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7046   // context for the registers used, where all instructions below are using 128-bit mode
7047   // On EVEX without VL and BW, these instructions will all be AVX.
7048   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7049   notl(crc); // ~crc
7050   cmpl(len, 16);
7051   jcc(Assembler::less, L_tail);
7052 
7053   // Align buffer to 16 bytes
7054   movl(tmp, buf);
7055   andl(tmp, 0xF);
7056   jccb(Assembler::zero, L_aligned);
7057   subl(tmp,  16);
7058   addl(len, tmp);
7059 
7060   align(4);
7061   BIND(L_align_loop);
7062   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7063   update_byte_crc32(crc, rax, table);
7064   increment(buf);
7065   incrementl(tmp);
7066   jccb(Assembler::less, L_align_loop);
7067 
7068   BIND(L_aligned);
7069   movl(tmp, len); // save
7070   shrl(len, 4);
7071   jcc(Assembler::zero, L_tail_restore);
7072 
7073   // Fold crc into first bytes of vector
7074   movdqa(xmm1, Address(buf, 0));
7075   movdl(rax, xmm1);
7076   xorl(crc, rax);
7077   if (VM_Version::supports_sse4_1()) {
7078     pinsrd(xmm1, crc, 0);
7079   } else {
7080     pinsrw(xmm1, crc, 0);
7081     shrl(crc, 16);
7082     pinsrw(xmm1, crc, 1);
7083   }
7084   addptr(buf, 16);
7085   subl(len, 4); // len > 0
7086   jcc(Assembler::less, L_fold_tail);
7087 
7088   movdqa(xmm2, Address(buf,  0));
7089   movdqa(xmm3, Address(buf, 16));
7090   movdqa(xmm4, Address(buf, 32));
7091   addptr(buf, 48);
7092   subl(len, 3);
7093   jcc(Assembler::lessEqual, L_fold_512b);
7094 
7095   // Fold total 512 bits of polynomial on each iteration,
7096   // 128 bits per each of 4 parallel streams.
7097   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7098 
7099   align32();
7100   BIND(L_fold_512b_loop);
7101   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7102   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7103   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7104   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7105   addptr(buf, 64);
7106   subl(len, 4);
7107   jcc(Assembler::greater, L_fold_512b_loop);
7108 
7109   // Fold 512 bits to 128 bits.
7110   BIND(L_fold_512b);
7111   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7112   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7113   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7114   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7115 
7116   // Fold the rest of 128 bits data chunks
7117   BIND(L_fold_tail);
7118   addl(len, 3);
7119   jccb(Assembler::lessEqual, L_fold_128b);
7120   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7121 
7122   BIND(L_fold_tail_loop);
7123   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7124   addptr(buf, 16);
7125   decrementl(len);
7126   jccb(Assembler::greater, L_fold_tail_loop);
7127 
7128   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7129   BIND(L_fold_128b);
7130   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7131   if (UseAVX > 0) {
7132     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7133     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7134     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7135   } else {
7136     movdqa(xmm2, xmm0);
7137     pclmulqdq(xmm2, xmm1, 0x1);
7138     movdqa(xmm3, xmm0);
7139     pand(xmm3, xmm2);
7140     pclmulqdq(xmm0, xmm3, 0x1);
7141   }
7142   psrldq(xmm1, 8);
7143   psrldq(xmm2, 4);
7144   pxor(xmm0, xmm1);
7145   pxor(xmm0, xmm2);
7146 
7147   // 8 8-bit folds to compute 32-bit CRC.
7148   for (int j = 0; j < 4; j++) {
7149     fold_8bit_crc32(xmm0, table, xmm1, rax);
7150   }
7151   movdl(crc, xmm0); // mov 32 bits to general register
7152   for (int j = 0; j < 4; j++) {
7153     fold_8bit_crc32(crc, table, rax);
7154   }
7155 
7156   BIND(L_tail_restore);
7157   movl(len, tmp); // restore
7158   BIND(L_tail);
7159   andl(len, 0xf);
7160   jccb(Assembler::zero, L_exit);
7161 
7162   // Fold the rest of bytes
7163   align(4);
7164   BIND(L_tail_loop);
7165   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7166   update_byte_crc32(crc, rax, table);
7167   increment(buf);
7168   decrementl(len);
7169   jccb(Assembler::greater, L_tail_loop);
7170 
7171   BIND(L_exit);
7172   notl(crc); // ~c
7173 }
7174 
7175 #ifdef _LP64
7176 // Helper function for AVX 512 CRC32
7177 // Fold 512-bit data chunks
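// Loosely, this mirrors fold_128bit_crc32 lane-wise on a 512-bit register:
// each 128-bit lane's two halves are carry-less multiplied by the matching
// halves of the constant xK and xored with the next 64 bytes of data.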
7178 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7179                                              Register pos, int offset) {
7180   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7181   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7182   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7183   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7184   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7185 }
7186 
7187 // Helper function for AVX 512 CRC32
7188 // Compute CRC32 for < 256B buffers
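// Inputs of at least 32B are seeded with the crc and sent to the shared 16B
// reduction loop; shorter inputs are handled with partial loads (staged on the
// stack and aligned with a shuffle mask when under 16B) before the final
// 128-bit and Barrett reduction steps.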
7189 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
7190                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7191                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7192 
7193   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7194   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7195   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7196 
7197   // check if there is enough buffer to be able to fold 16B at a time
7198   cmpl(len, 32);
7199   jcc(Assembler::less, L_less_than_32);
7200 
7201   // if there is, load the constants
7202   movdqu(xmm10, Address(key, 1 * 16));    //rk1 and rk2 in xmm10
7203   movdl(xmm0, crc);                        // get the initial crc value
7204   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7205   pxor(xmm7, xmm0);
7206 
7207   // update the buffer pointer
7208   addl(pos, 16);
7209   // update the counter; subtract 32 instead of 16 to save one instruction from the loop
7210   subl(len, 32);
7211   jmp(L_16B_reduction_loop);
7212 
7213   bind(L_less_than_32);
7214   // mov initial crc to the return value; this is necessary for zero-length buffers.
7215   movl(rax, crc);
7216   testl(len, len);
7217   jcc(Assembler::equal, L_cleanup);
7218 
7219   movdl(xmm0, crc);                        //get the initial crc value
7220 
7221   cmpl(len, 16);
7222   jcc(Assembler::equal, L_exact_16_left);
7223   jcc(Assembler::less, L_less_than_16_left);
7224 
7225   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7226   pxor(xmm7, xmm0);                       //xor the initial crc value
7227   addl(pos, 16);
7228   subl(len, 16);
7229   movdqu(xmm10, Address(key, 1 * 16));    // rk1 and rk2 in xmm10
7230   jmp(L_get_last_two_xmms);
7231 
7232   bind(L_less_than_16_left);
7233   // use stack space to load data less than 16 bytes; zero out the 16B in memory first.
7234   pxor(xmm1, xmm1);
7235   movptr(tmp1, rsp);
7236   movdqu(Address(tmp1, 0 * 16), xmm1);
7237 
7238   cmpl(len, 4);
7239   jcc(Assembler::less, L_only_less_than_4);
7240 
7241   //backup the counter value
7242   movl(tmp2, len);
7243   cmpl(len, 8);
7244   jcc(Assembler::less, L_less_than_8_left);
7245 
7246   //load 8 Bytes
7247   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7248   movq(Address(tmp1, 0 * 16), rax);
7249   addptr(tmp1, 8);
7250   subl(len, 8);
7251   addl(pos, 8);
7252 
7253   bind(L_less_than_8_left);
7254   cmpl(len, 4);
7255   jcc(Assembler::less, L_less_than_4_left);
7256 
7257   //load 4 Bytes
7258   movl(rax, Address(buf, pos, Address::times_1, 0));
7259   movl(Address(tmp1, 0 * 16), rax);
7260   addptr(tmp1, 4);
7261   subl(len, 4);
7262   addl(pos, 4);
7263 
7264   bind(L_less_than_4_left);
7265   cmpl(len, 2);
7266   jcc(Assembler::less, L_less_than_2_left);
7267 
7268   // load 2 Bytes
7269   movw(rax, Address(buf, pos, Address::times_1, 0));
7270   movl(Address(tmp1, 0 * 16), rax);
7271   addptr(tmp1, 2);
7272   subl(len, 2);
7273   addl(pos, 2);
7274 
7275   bind(L_less_than_2_left);
7276   cmpl(len, 1);
7277   jcc(Assembler::less, L_zero_left);
7278 
7279   // load 1 Byte
7280   movb(rax, Address(buf, pos, Address::times_1, 0));
7281   movb(Address(tmp1, 0 * 16), rax);
7282 
7283   bind(L_zero_left);
7284   movdqu(xmm7, Address(rsp, 0));
7285   pxor(xmm7, xmm0);                       //xor the initial crc value
7286 
7287   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7288   movdqu(xmm0, Address(rax, tmp2));
7289   pshufb(xmm7, xmm0);
7290   jmp(L_128_done);
7291 
7292   bind(L_exact_16_left);
7293   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7294   pxor(xmm7, xmm0);                       //xor the initial crc value
7295   jmp(L_128_done);
7296 
7297   bind(L_only_less_than_4);
7298   cmpl(len, 3);
7299   jcc(Assembler::less, L_only_less_than_3);
7300 
7301   // load 3 Bytes
7302   movb(rax, Address(buf, pos, Address::times_1, 0));
7303   movb(Address(tmp1, 0), rax);
7304 
7305   movb(rax, Address(buf, pos, Address::times_1, 1));
7306   movb(Address(tmp1, 1), rax);
7307 
7308   movb(rax, Address(buf, pos, Address::times_1, 2));
7309   movb(Address(tmp1, 2), rax);
7310 
7311   movdqu(xmm7, Address(rsp, 0));
7312   pxor(xmm7, xmm0);                     //xor the initial crc value
7313 
7314   pslldq(xmm7, 0x5);
7315   jmp(L_barrett);
7316   bind(L_only_less_than_3);
7317   cmpl(len, 2);
7318   jcc(Assembler::less, L_only_less_than_2);
7319 
7320   // load 2 Bytes
7321   movb(rax, Address(buf, pos, Address::times_1, 0));
7322   movb(Address(tmp1, 0), rax);
7323 
7324   movb(rax, Address(buf, pos, Address::times_1, 1));
7325   movb(Address(tmp1, 1), rax);
7326 
7327   movdqu(xmm7, Address(rsp, 0));
7328   pxor(xmm7, xmm0);                     //xor the initial crc value
7329 
7330   pslldq(xmm7, 0x6);
7331   jmp(L_barrett);
7332 
7333   bind(L_only_less_than_2);
7334   //load 1 Byte
7335   movb(rax, Address(buf, pos, Address::times_1, 0));
7336   movb(Address(tmp1, 0), rax);
7337 
7338   movdqu(xmm7, Address(rsp, 0));
7339   pxor(xmm7, xmm0);                     //xor the initial crc value
7340 
7341   pslldq(xmm7, 0x7);
7342 }
7343 
7344 /**
7345 * Compute CRC32 using AVX512 instructions
7346 * @param crc   register containing existing CRC (32-bit)
7347 * @param buf   register pointing to input byte buffer (byte*)
7348 * @param len   register containing number of bytes
7349 * @param tmp1  scratch register
7350 * @param tmp2  scratch register
7351 * @return rax  result register
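*
* Strategy sketch: buffers of at least 256B are folded 256B (then 128B) at a
* time across 512-bit lanes with carry-less multiplies, collapsed to a single
* 128-bit value, reduced 16B at a time, and finished with a Barrett reduction;
* shorter buffers take the kernel_crc32_avx512_256B path.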
7352 */
7353 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
7354   assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
7355 
7356   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7357   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7358   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7359   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7360   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7361 
7362   const Register pos = r12;
7363   push(r12);
7364   subptr(rsp, 16 * 2 + 8);
7365 
7366   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7367   // context for the registers used, where all instructions below are using 128-bit mode
7368   // On EVEX without VL and BW, these instructions will all be AVX.
7369   lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
7370   notl(crc);
7371   movl(pos, 0);
7372 
7373   // check if smaller than 256B
7374   cmpl(len, 256);
7375   jcc(Assembler::less, L_less_than_256);
7376 
7377   // load the initial crc value
7378   movdl(xmm10, crc);
7379 
7380   // receive the initial 64B data, xor the initial crc value
7381   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7382   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7383   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7384   evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7385 
7386   subl(len, 256);
7387   cmpl(len, 256);
7388   jcc(Assembler::less, L_fold_128_B_loop);
7389 
7390   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7391   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7392   evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7393   subl(len, 256);
7394 
7395   bind(L_fold_256_B_loop);
7396   addl(pos, 256);
7397   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7398   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7399   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7400   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7401 
7402   subl(len, 256);
7403   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7404 
7405   // Fold 256 into 128
7406   addl(pos, 256);
7407   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7408   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7409   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7410 
7411   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7412   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7413   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7414 
7415   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7416   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7417 
7418   addl(len, 128);
7419   jmp(L_fold_128_B_register);
7420 
7421   // At this point in the code, there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
7422   // loop will fold 128B at a time until we have 128 + y bytes of buffer left
7423 
7424   // fold 128B at a time. This section of the code folds 8 xmm registers in parallel
7425   bind(L_fold_128_B_loop);
7426   addl(pos, 128);
7427   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7428   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7429 
7430   subl(len, 128);
7431   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7432 
7433   addl(pos, 128);
7434 
7435   // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
7436   // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
7437   bind(L_fold_128_B_register);
7438   evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7439   evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7440   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7441   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7442   // save last that has no multiplicand
7443   vextracti64x2(xmm7, xmm4, 3);
7444 
7445   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7446   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7447   // Needed later in reduction loop
7448   movdqu(xmm10, Address(key, 1 * 16));
7449   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7450   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7451 
7452   // Swap 1,0,3,2 - 01 00 11 10
7453   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7454   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7455   vextracti128(xmm5, xmm8, 1);
7456   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7457 
7458   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7459   // instead of a cmp instruction, we use the negative flag with the jl instruction
7460   addl(len, 128 - 16);
7461   jcc(Assembler::less, L_final_reduction_for_128);
7462 
7463   bind(L_16B_reduction_loop);
7464   vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7465   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7466   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7467   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7468   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7469   addl(pos, 16);
7470   subl(len, 16);
7471   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7472 
7473   bind(L_final_reduction_for_128);
7474   addl(len, 16);
7475   jcc(Assembler::equal, L_128_done);
7476 
7477   bind(L_get_last_two_xmms);
7478   movdqu(xmm2, xmm7);
7479   addl(pos, len);
7480   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7481   subl(pos, len);
7482 
7483   // get rid of the extra data that was loaded before
7484   // load the shift constant
7485   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7486   movdqu(xmm0, Address(rax, len));
7487   addl(rax, len);
7488 
7489   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7490   //Change mask to 512
7491   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7492   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7493 
7494   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7495   vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7496   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7497   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7498   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7499 
7500   bind(L_128_done);
7501   // compute crc of a 128-bit value
7502   movdqu(xmm10, Address(key, 3 * 16));
7503   movdqu(xmm0, xmm7);
7504 
7505   // 64b fold
7506   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7507   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7508   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7509 
7510   // 32b fold
7511   movdqu(xmm0, xmm7);
7512   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7513   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7514   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7515   jmp(L_barrett);
7516 
7517   bind(L_less_than_256);
7518   kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7519 
7520   //barrett reduction
7521   bind(L_barrett);
7522   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7523   movdqu(xmm1, xmm7);
7524   movdqu(xmm2, xmm7);
7525   movdqu(xmm10, Address(key, 4 * 16));
7526 
7527   pclmulqdq(xmm7, xmm10, 0x0);
7528   pxor(xmm7, xmm2);
7529   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7530   movdqu(xmm2, xmm7);
7531   pclmulqdq(xmm7, xmm10, 0x10);
7532   pxor(xmm7, xmm2);
7533   pxor(xmm7, xmm1);
7534   pextrd(crc, xmm7, 2);
7535 
7536   bind(L_cleanup);
7537   notl(crc); // ~c
7538   addptr(rsp, 16 * 2 + 8);
7539   pop(r12);
7540 }
7541 
7542 // S. Gueron / Information Processing Letters 112 (2012) 184
7543 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7544 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7545 // Output: the 64-bit carry-less product of B * CONST
7546 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7547                                      Register tmp1, Register tmp2, Register tmp3) {
7548   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7549   if (n > 0) {
7550     addq(tmp3, n * 256 * 8);
7551   }
7552   //    Q1 = TABLEExt[n][B & 0xFF];
7553   movl(tmp1, in);
7554   andl(tmp1, 0x000000FF);
7555   shll(tmp1, 3);
7556   addq(tmp1, tmp3);
7557   movq(tmp1, Address(tmp1, 0));
7558 
7559   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7560   movl(tmp2, in);
7561   shrl(tmp2, 8);
7562   andl(tmp2, 0x000000FF);
7563   shll(tmp2, 3);
7564   addq(tmp2, tmp3);
7565   movq(tmp2, Address(tmp2, 0));
7566 
7567   shlq(tmp2, 8);
7568   xorq(tmp1, tmp2);
7569 
7570   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7571   movl(tmp2, in);
7572   shrl(tmp2, 16);
7573   andl(tmp2, 0x000000FF);
7574   shll(tmp2, 3);
7575   addq(tmp2, tmp3);
7576   movq(tmp2, Address(tmp2, 0));
7577 
7578   shlq(tmp2, 16);
7579   xorq(tmp1, tmp2);
7580 
7581   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7582   shrl(in, 24);
7583   andl(in, 0x000000FF);
7584   shll(in, 3);
7585   addq(in, tmp3);
7586   movq(in, Address(in, 0));
7587 
7588   shlq(in, 24);
7589   xorq(in, tmp1);
7590   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7591 }
7592 
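// Carry-less multiply in_out by a CRC32C folding constant: with PCLMULQDQ the
// constant value passed in is used directly and the 64-bit product lands back
// in in_out; otherwise the table-driven crc32c_ipl_alg4 fallback (indexed by
// the same parameter) is used.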
7593 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7594                                       Register in_out,
7595                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7596                                       XMMRegister w_xtmp2,
7597                                       Register tmp1,
7598                                       Register n_tmp2, Register n_tmp3) {
7599   if (is_pclmulqdq_supported) {
7600     movdl(w_xtmp1, in_out); // modified blindly
7601 
7602     movl(tmp1, const_or_pre_comp_const_index);
7603     movdl(w_xtmp2, tmp1);
7604     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7605 
7606     movdq(in_out, w_xtmp1);
7607   } else {
7608     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7609   }
7610 }
7611 
7612 // Recombination Alternative 2: No bit-reflections
7613 // T1 = (CRC_A * U1) << 1
7614 // T2 = (CRC_B * U2) << 1
7615 // C1 = T1 >> 32
7616 // C2 = T2 >> 32
7617 // T1 = T1 & 0xFFFFFFFF
7618 // T2 = T2 & 0xFFFFFFFF
7619 // T1 = CRC32(0, T1)
7620 // T2 = CRC32(0, T2)
7621 // C1 = C1 ^ T1
7622 // C2 = C2 ^ T2
7623 // CRC = C1 ^ C2 ^ CRC_C
7624 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7625                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7626                                      Register tmp1, Register tmp2,
7627                                      Register n_tmp3) {
7628   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7629   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7630   shlq(in_out, 1);
7631   movl(tmp1, in_out);
7632   shrq(in_out, 32);
7633   xorl(tmp2, tmp2);
7634   crc32(tmp2, tmp1, 4);
7635   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7636   shlq(in1, 1);
7637   movl(tmp1, in1);
7638   shrq(in1, 32);
7639   xorl(tmp2, tmp2);
7640   crc32(tmp2, tmp1, 4);
7641   xorl(in1, tmp2);
7642   xorl(in_out, in1);
7643   xorl(in_out, in2);
7644 }
7645 
7646 // Set N to predefined value
7647 // Subtract from the length of the buffer
7648 // execute in a loop:
7649 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7650 // for i = 1 to N do
7651 //  CRC_A = CRC32(CRC_A, A[i])
7652 //  CRC_B = CRC32(CRC_B, B[i])
7653 //  CRC_C = CRC32(CRC_C, C[i])
7654 // end for
7655 // Recombine
7656 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7657                                        Register in_out1, Register in_out2, Register in_out3,
7658                                        Register tmp1, Register tmp2, Register tmp3,
7659                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7660                                        Register tmp4, Register tmp5,
7661                                        Register n_tmp6) {
7662   Label L_processPartitions;
7663   Label L_processPartition;
7664   Label L_exit;
7665 
7666   bind(L_processPartitions);
7667   cmpl(in_out1, 3 * size);
7668   jcc(Assembler::less, L_exit);
7669     xorl(tmp1, tmp1);
7670     xorl(tmp2, tmp2);
7671     movq(tmp3, in_out2);
7672     addq(tmp3, size);
7673 
7674     bind(L_processPartition);
7675       crc32(in_out3, Address(in_out2, 0), 8);
7676       crc32(tmp1, Address(in_out2, size), 8);
7677       crc32(tmp2, Address(in_out2, size * 2), 8);
7678       addq(in_out2, 8);
7679       cmpq(in_out2, tmp3);
7680       jcc(Assembler::less, L_processPartition);
7681     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7682             w_xtmp1, w_xtmp2, w_xtmp3,
7683             tmp4, tmp5,
7684             n_tmp6);
7685     addq(in_out2, 2 * size);
7686     subl(in_out1, 3 * size);
7687     jmp(L_processPartitions);
7688 
7689   bind(L_exit);
7690 }
7691 #else
7692 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7693                                      Register tmp1, Register tmp2, Register tmp3,
7694                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7695   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7696   if (n > 0) {
7697     addl(tmp3, n * 256 * 8);
7698   }
7699   //    Q1 = TABLEExt[n][B & 0xFF];
7700   movl(tmp1, in_out);
7701   andl(tmp1, 0x000000FF);
7702   shll(tmp1, 3);
7703   addl(tmp1, tmp3);
7704   movq(xtmp1, Address(tmp1, 0));
7705 
7706   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7707   movl(tmp2, in_out);
7708   shrl(tmp2, 8);
7709   andl(tmp2, 0x000000FF);
7710   shll(tmp2, 3);
7711   addl(tmp2, tmp3);
7712   movq(xtmp2, Address(tmp2, 0));
7713 
7714   psllq(xtmp2, 8);
7715   pxor(xtmp1, xtmp2);
7716 
7717   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7718   movl(tmp2, in_out);
7719   shrl(tmp2, 16);
7720   andl(tmp2, 0x000000FF);
7721   shll(tmp2, 3);
7722   addl(tmp2, tmp3);
7723   movq(xtmp2, Address(tmp2, 0));
7724 
7725   psllq(xtmp2, 16);
7726   pxor(xtmp1, xtmp2);
7727 
7728   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7729   shrl(in_out, 24);
7730   andl(in_out, 0x000000FF);
7731   shll(in_out, 3);
7732   addl(in_out, tmp3);
7733   movq(xtmp2, Address(in_out, 0));
7734 
7735   psllq(xtmp2, 24);
7736   pxor(xtmp1, xtmp2); // Result in CXMM
7737   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7738 }
7739 
7740 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7741                                       Register in_out,
7742                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7743                                       XMMRegister w_xtmp2,
7744                                       Register tmp1,
7745                                       Register n_tmp2, Register n_tmp3) {
7746   if (is_pclmulqdq_supported) {
7747     movdl(w_xtmp1, in_out);
7748 
7749     movl(tmp1, const_or_pre_comp_const_index);
7750     movdl(w_xtmp2, tmp1);
7751     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7752     // Keep result in XMM since GPR is 32 bit in length
7753   } else {
7754     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7755   }
7756 }
7757 
7758 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7759                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7760                                      Register tmp1, Register tmp2,
7761                                      Register n_tmp3) {
7762   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7763   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7764 
7765   psllq(w_xtmp1, 1);
7766   movdl(tmp1, w_xtmp1);
7767   psrlq(w_xtmp1, 32);
7768   movdl(in_out, w_xtmp1);
7769 
7770   xorl(tmp2, tmp2);
7771   crc32(tmp2, tmp1, 4);
7772   xorl(in_out, tmp2);
7773 
7774   psllq(w_xtmp2, 1);
7775   movdl(tmp1, w_xtmp2);
7776   psrlq(w_xtmp2, 32);
7777   movdl(in1, w_xtmp2);
7778 
7779   xorl(tmp2, tmp2);
7780   crc32(tmp2, tmp1, 4);
7781   xorl(in1, tmp2);
7782   xorl(in_out, in1);
7783   xorl(in_out, in2);
7784 }
7785 
7786 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7787                                        Register in_out1, Register in_out2, Register in_out3,
7788                                        Register tmp1, Register tmp2, Register tmp3,
7789                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7790                                        Register tmp4, Register tmp5,
7791                                        Register n_tmp6) {
7792   Label L_processPartitions;
7793   Label L_processPartition;
7794   Label L_exit;
7795 
7796   bind(L_processPartitions);
7797   cmpl(in_out1, 3 * size);
7798   jcc(Assembler::less, L_exit);
7799     xorl(tmp1, tmp1);
7800     xorl(tmp2, tmp2);
7801     movl(tmp3, in_out2);
7802     addl(tmp3, size);
7803 
7804     bind(L_processPartition);
7805       crc32(in_out3, Address(in_out2, 0), 4);
7806       crc32(tmp1, Address(in_out2, size), 4);
7807       crc32(tmp2, Address(in_out2, size*2), 4);
7808       crc32(in_out3, Address(in_out2, 0+4), 4);
7809       crc32(tmp1, Address(in_out2, size+4), 4);
7810       crc32(tmp2, Address(in_out2, size*2+4), 4);
7811       addl(in_out2, 8);
7812       cmpl(in_out2, tmp3);
7813       jcc(Assembler::less, L_processPartition);
7814 
7815         push(tmp3);
7816         push(in_out1);
7817         push(in_out2);
7818         tmp4 = tmp3;
7819         tmp5 = in_out1;
7820         n_tmp6 = in_out2;
7821 
7822       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7823             w_xtmp1, w_xtmp2, w_xtmp3,
7824             tmp4, tmp5,
7825             n_tmp6);
7826 
7827         pop(in_out2);
7828         pop(in_out1);
7829         pop(tmp3);
7830 
7831     addl(in_out2, 2 * size);
7832     subl(in_out1, 3 * size);
7833     jmp(L_processPartitions);
7834 
7835   bind(L_exit);
7836 }
7837 #endif // LP64
7838 
7839 #ifdef _LP64
7840 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7841 // Input: A buffer I of L bytes.
7842 // Output: the CRC32C value of the buffer.
7843 // Notations:
7844 // Write L = 24N + r, with N = floor (L/24).
7845 // r = L mod 24 (0 <= r < 24).
7846 // Consider I as the concatenation of A|B|C|R, where A, B and C each
7847 // consist of N quadwords, and R consists of r bytes.
7848 // A[j] = I [8j+7 : 8j],             j = 0, 1, ..., N-1
7849 // B[j] = I [8N + 8j+7 : 8N + 8j],   j = 0, 1, ..., N-1
7850 // C[j] = I [16N + 8j+7 : 16N + 8j], j = 0, 1, ..., N-1
7851 // if r > 0, R[j] = I [24N + j],     j = 0, 1, ..., r-1
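     //
     // Illustrative top-level C sketch of the algorithm (hedged: process_chunks(),
     // crc32c_u32(), crc32c_u8() and load_u32() are made-up helpers; the real
     // chunk sizes are CRC32C_HIGH/MIDDLE/LOW and the chunk pass is
     // crc32c_proc_chunk above):
     //
     //   uint32_t crc32c_alg2(uint32_t crc, const uint8_t* buf, size_t len) {
     //     crc = process_chunks(crc, &buf, &len, CRC32C_HIGH);
     //     crc = process_chunks(crc, &buf, &len, CRC32C_MIDDLE);
     //     crc = process_chunks(crc, &buf, &len, CRC32C_LOW);
     //     const uint8_t* end = buf + (len & ~(size_t)7);
     //     while (buf < end) { crc = crc32c_u32(crc, load_u32(buf)); buf += 4; }
     //     for (len &= 7; len > 0; len--) { crc = crc32c_u8(crc, *buf++); }
     //     return crc;
     //   }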
7852 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7853                                           Register tmp1, Register tmp2, Register tmp3,
7854                                           Register tmp4, Register tmp5, Register tmp6,
7855                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7856                                           bool is_pclmulqdq_supported) {
7857   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7858   Label L_wordByWord;
7859   Label L_byteByByteProlog;
7860   Label L_byteByByte;
7861   Label L_exit;
7862 
7863   if (is_pclmulqdq_supported ) {
7864     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7865     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7866 
7867     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7868     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7869 
7870     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7871     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7872     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7873   } else {
7874     const_or_pre_comp_const_index[0] = 1;
7875     const_or_pre_comp_const_index[1] = 0;
7876 
7877     const_or_pre_comp_const_index[2] = 3;
7878     const_or_pre_comp_const_index[3] = 2;
7879 
7880     const_or_pre_comp_const_index[4] = 5;
7881     const_or_pre_comp_const_index[5] = 4;
7882    }
7883   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7884                     in2, in1, in_out,
7885                     tmp1, tmp2, tmp3,
7886                     w_xtmp1, w_xtmp2, w_xtmp3,
7887                     tmp4, tmp5,
7888                     tmp6);
7889   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7890                     in2, in1, in_out,
7891                     tmp1, tmp2, tmp3,
7892                     w_xtmp1, w_xtmp2, w_xtmp3,
7893                     tmp4, tmp5,
7894                     tmp6);
7895   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7896                     in2, in1, in_out,
7897                     tmp1, tmp2, tmp3,
7898                     w_xtmp1, w_xtmp2, w_xtmp3,
7899                     tmp4, tmp5,
7900                     tmp6);
7901   movl(tmp1, in2);
7902   andl(tmp1, 0x00000007);
7903   negl(tmp1);
7904   addl(tmp1, in2);
7905   addq(tmp1, in1);
7906 
7907   BIND(L_wordByWord);
7908   cmpq(in1, tmp1);
7909   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7910     crc32(in_out, Address(in1, 0), 4);
7911     addq(in1, 4);
7912     jmp(L_wordByWord);
7913 
7914   BIND(L_byteByByteProlog);
7915   andl(in2, 0x00000007);
7916   movl(tmp2, 1);
7917 
7918   BIND(L_byteByByte);
7919   cmpl(tmp2, in2);
7920   jccb(Assembler::greater, L_exit);
7921     crc32(in_out, Address(in1, 0), 1);
7922     incq(in1);
7923     incl(tmp2);
7924     jmp(L_byteByByte);
7925 
7926   BIND(L_exit);
7927 }
7928 #else
7929 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7930                                           Register tmp1, Register  tmp2, Register tmp3,
7931                                           Register tmp4, Register  tmp5, Register tmp6,
7932                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7933                                           bool is_pclmulqdq_supported) {
7934   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7935   Label L_wordByWord;
7936   Label L_byteByByteProlog;
7937   Label L_byteByByte;
7938   Label L_exit;
7939 
7940   if (is_pclmulqdq_supported) {
7941     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7942     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7943 
7944     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7945     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7946 
7947     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7948     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7949   } else {
7950     const_or_pre_comp_const_index[0] = 1;
7951     const_or_pre_comp_const_index[1] = 0;
7952 
7953     const_or_pre_comp_const_index[2] = 3;
7954     const_or_pre_comp_const_index[3] = 2;
7955 
7956     const_or_pre_comp_const_index[4] = 5;
7957     const_or_pre_comp_const_index[5] = 4;
7958   }
7959   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7960                     in2, in1, in_out,
7961                     tmp1, tmp2, tmp3,
7962                     w_xtmp1, w_xtmp2, w_xtmp3,
7963                     tmp4, tmp5,
7964                     tmp6);
7965   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7966                     in2, in1, in_out,
7967                     tmp1, tmp2, tmp3,
7968                     w_xtmp1, w_xtmp2, w_xtmp3,
7969                     tmp4, tmp5,
7970                     tmp6);
7971   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7972                     in2, in1, in_out,
7973                     tmp1, tmp2, tmp3,
7974                     w_xtmp1, w_xtmp2, w_xtmp3,
7975                     tmp4, tmp5,
7976                     tmp6);
7977   movl(tmp1, in2);
7978   andl(tmp1, 0x00000007);
7979   negl(tmp1);
7980   addl(tmp1, in2);
7981   addl(tmp1, in1);
7982 
7983   BIND(L_wordByWord);
7984   cmpl(in1, tmp1);
7985   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7986     crc32(in_out, Address(in1,0), 4);
7987     addl(in1, 4);
7988     jmp(L_wordByWord);
7989 
7990   BIND(L_byteByByteProlog);
7991   andl(in2, 0x00000007);
7992   movl(tmp2, 1);
7993 
7994   BIND(L_byteByByte);
7995   cmpl(tmp2, in2);
7996   jccb(Assembler::greater, L_exit);
7997     movb(tmp1, Address(in1, 0));
7998     crc32(in_out, tmp1, 1);
7999     incl(in1);
8000     incl(tmp2);
8001     jmp(L_byteByByte);
8002 
8003   BIND(L_exit);
8004 }
8005 #endif // LP64
8006 #undef BIND
8007 #undef BLOCK_COMMENT
8008 
8009 // Compress char[] array to byte[].
8010 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8011 //   @IntrinsicCandidate
8012 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8013 //     for (int i = 0; i < len; i++) {
8014 //       int c = src[srcOff++];
8015 //       if (c >>> 8 != 0) {
8016 //         return 0;
8017 //       }
8018 //       dst[dstOff++] = (byte)c;
8019 //     }
8020 //     return len;
8021 //   }
8022 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8023   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8024   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8025   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8026   Label copy_chars_loop, return_length, return_zero, done;
8027 
8028   // rsi: src
8029   // rdi: dst
8030   // rdx: len
8031   // rcx: tmp5
8032   // rax: result
8033 
8034   // rsi holds start addr of source char[] to be compressed
8035   // rdi holds start addr of destination byte[]
8036   // rdx holds length
8037 
8038   assert(len != result, "");
8039 
8040   // save length for return
8041   push(len);
8042 
8043   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8044     VM_Version::supports_avx512vlbw() &&
8045     VM_Version::supports_bmi2()) {
8046 
8047     Label copy_32_loop, copy_loop_tail, below_threshold;
8048 
8049     // alignment
8050     Label post_alignment;
8051 
8052     // if the length of the string is less than 32, handle it the old-fashioned way
8053     testl(len, -32);
8054     jcc(Assembler::zero, below_threshold);
8055 
8056     // First check whether a character is compressible (<= 0xFF).
8057     // Create mask to test for Unicode chars inside zmm vector
8058     movl(result, 0x00FF);
8059     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8060 
8061     testl(len, -64);
8062     jcc(Assembler::zero, post_alignment);
8063 
8064     movl(tmp5, dst);
8065     andl(tmp5, (32 - 1));
8066     negl(tmp5);
8067     andl(tmp5, (32 - 1));
8068 
8069     // bail out when there is nothing to be done
8070     testl(tmp5, 0xFFFFFFFF);
8071     jcc(Assembler::zero, post_alignment);
8072 
8073     // ~(~0 << len), where len is the # of remaining elements to process
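         // e.g. with 5 elements left: ~(~0 << 5) = ~0xFFFFFFE0 = 0x1F, selecting the low 5 word lanes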
8074     movl(result, 0xFFFFFFFF);
8075     shlxl(result, result, tmp5);
8076     notl(result);
8077     kmovdl(mask2, result);
8078 
8079     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8080     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8081     ktestd(mask1, mask2);
8082     jcc(Assembler::carryClear, return_zero);
8083 
8084     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8085 
8086     addptr(src, tmp5);
8087     addptr(src, tmp5);
8088     addptr(dst, tmp5);
8089     subl(len, tmp5);
8090 
8091     bind(post_alignment);
8092     // end of alignment
8093 
8094     movl(tmp5, len);
8095     andl(tmp5, (32 - 1));    // tail count (in chars)
8096     andl(len, ~(32 - 1));    // vector count (in chars)
8097     jcc(Assembler::zero, copy_loop_tail);
8098 
8099     lea(src, Address(src, len, Address::times_2));
8100     lea(dst, Address(dst, len, Address::times_1));
8101     negptr(len);
8102 
8103     bind(copy_32_loop);
8104     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
8105     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8106     kortestdl(mask1, mask1);
8107     jcc(Assembler::carryClear, return_zero);
8108 
8109     // All elements in the currently processed chunk are valid candidates for
8110     // compression. Write the truncated byte elements to memory.
8111     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8112     addptr(len, 32);
8113     jcc(Assembler::notZero, copy_32_loop);
8114 
8115     bind(copy_loop_tail);
8116     // bail out when there is nothing to be done
8117     testl(tmp5, 0xFFFFFFFF);
8118     jcc(Assembler::zero, return_length);
8119 
8120     movl(len, tmp5);
8121 
8122     // ~(~0 << len), where len is the # of remaining elements to process
8123     movl(result, 0xFFFFFFFF);
8124     shlxl(result, result, len);
8125     notl(result);
8126 
8127     kmovdl(mask2, result);
8128 
8129     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8130     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8131     ktestd(mask1, mask2);
8132     jcc(Assembler::carryClear, return_zero);
8133 
8134     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8135     jmp(return_length);
8136 
8137     bind(below_threshold);
8138   }
8139 
8140   if (UseSSE42Intrinsics) {
8141     Label copy_32_loop, copy_16, copy_tail;
8142 
8143     movl(result, len);
8144 
8145     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8146 
8147     // vectored compression
8148     andl(len, 0xfffffff0);    // vector count (in chars)
8149     andl(result, 0x0000000f);    // tail count (in chars)
8150     testl(len, len);
8151     jcc(Assembler::zero, copy_16);
8152 
8153     // compress 16 chars per iter
8154     movdl(tmp1Reg, tmp5);
8155     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8156     pxor(tmp4Reg, tmp4Reg);
8157 
8158     lea(src, Address(src, len, Address::times_2));
8159     lea(dst, Address(dst, len, Address::times_1));
8160     negptr(len);
8161 
8162     bind(copy_32_loop);
8163     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8164     por(tmp4Reg, tmp2Reg);
8165     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8166     por(tmp4Reg, tmp3Reg);
8167     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8168     jcc(Assembler::notZero, return_zero);
8169     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8170     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8171     addptr(len, 16);
8172     jcc(Assembler::notZero, copy_32_loop);
8173 
8174     // compress next vector of 8 chars (if any)
8175     bind(copy_16);
8176     movl(len, result);
8177     andl(len, 0xfffffff8);    // vector count (in chars)
8178     andl(result, 0x00000007);    // tail count (in chars)
8179     testl(len, len);
8180     jccb(Assembler::zero, copy_tail);
8181 
8182     movdl(tmp1Reg, tmp5);
8183     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8184     pxor(tmp3Reg, tmp3Reg);
8185 
8186     movdqu(tmp2Reg, Address(src, 0));
8187     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8188     jccb(Assembler::notZero, return_zero);
8189     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8190     movq(Address(dst, 0), tmp2Reg);
8191     addptr(src, 16);
8192     addptr(dst, 8);
8193 
8194     bind(copy_tail);
8195     movl(len, result);
8196   }
8197   // compress 1 char per iter
8198   testl(len, len);
8199   jccb(Assembler::zero, return_length);
8200   lea(src, Address(src, len, Address::times_2));
8201   lea(dst, Address(dst, len, Address::times_1));
8202   negptr(len);
8203 
8204   bind(copy_chars_loop);
8205   load_unsigned_short(result, Address(src, len, Address::times_2));
8206   testl(result, 0xff00);      // check if Unicode char
8207   jccb(Assembler::notZero, return_zero);
8208   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
8209   increment(len);
8210   jcc(Assembler::notZero, copy_chars_loop);
8211 
8212   // if compression succeeded, return length
8213   bind(return_length);
8214   pop(result);
8215   jmpb(done);
8216 
8217   // if compression failed, return 0
8218   bind(return_zero);
8219   xorl(result, result);
8220   addptr(rsp, wordSize);
8221 
8222   bind(done);
8223 }
8224 
8225 // Inflate byte[] array to char[].
8226 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8227 //   @IntrinsicCandidate
8228 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8229 //     for (int i = 0; i < len; i++) {
8230 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8231 //     }
8232 //   }
8233 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8234   XMMRegister tmp1, Register tmp2, KRegister mask) {
8235   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8236   // rsi: src
8237   // rdi: dst
8238   // rdx: len
8239   // rcx: tmp2
8240 
8241   // rsi holds start addr of source byte[] to be inflated
8242   // rdi holds start addr of destination char[]
8243   // rdx holds length
8244   assert_different_registers(src, dst, len, tmp2);
8245   movl(tmp2, len);
8246   if ((UseAVX > 2) && // AVX512
8247     VM_Version::supports_avx512vlbw() &&
8248     VM_Version::supports_bmi2()) {
8249 
8250     Label copy_32_loop, copy_tail;
8251     Register tmp3_aliased = len;
8252 
8253     // if the length of the string is less than 16, handle it the old-fashioned way
8254     testl(len, -16);
8255     jcc(Assembler::zero, below_threshold);
8256 
8257     testl(len, -1 * AVX3Threshold);
8258     jcc(Assembler::zero, avx3_threshold);
8259 
8260     // In order to use only one arithmetic operation for the main loop we use
8261     // this pre-calculation
8262     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8263     andl(len, -32);     // vector count
8264     jccb(Assembler::zero, copy_tail);
8265 
8266     lea(src, Address(src, len, Address::times_1));
8267     lea(dst, Address(dst, len, Address::times_2));
8268     negptr(len);
8269 
8270 
8271     // inflate 32 chars per iter
8272     bind(copy_32_loop);
8273     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8274     evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
8275     addptr(len, 32);
8276     jcc(Assembler::notZero, copy_32_loop);
8277 
8278     bind(copy_tail);
8279     // bail out when there is nothing to be done
8280     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8281     jcc(Assembler::zero, done);
8282 
8283     // ~(~0 << length), where length is the # of remaining elements to process
8284     movl(tmp3_aliased, -1);
8285     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8286     notl(tmp3_aliased);
8287     kmovdl(mask, tmp3_aliased);
8288     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8289     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8290 
8291     jmp(done);
8292     bind(avx3_threshold);
8293   }
8294   if (UseSSE42Intrinsics) {
8295     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8296 
8297     if (UseAVX > 1) {
8298       andl(tmp2, (16 - 1));
8299       andl(len, -16);
8300       jccb(Assembler::zero, copy_new_tail);
8301     } else {
8302       andl(tmp2, 0x00000007);   // tail count (in chars)
8303       andl(len, 0xfffffff8);    // vector count (in chars)
8304       jccb(Assembler::zero, copy_tail);
8305     }
8306 
8307     // vectored inflation
8308     lea(src, Address(src, len, Address::times_1));
8309     lea(dst, Address(dst, len, Address::times_2));
8310     negptr(len);
8311 
8312     if (UseAVX > 1) {
8313       bind(copy_16_loop);
8314       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8315       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8316       addptr(len, 16);
8317       jcc(Assembler::notZero, copy_16_loop);
8318 
8319       bind(below_threshold);
8320       bind(copy_new_tail);
8321       movl(len, tmp2);
8322       andl(tmp2, 0x00000007);
8323       andl(len, 0xFFFFFFF8);
8324       jccb(Assembler::zero, copy_tail);
8325 
8326       pmovzxbw(tmp1, Address(src, 0));
8327       movdqu(Address(dst, 0), tmp1);
8328       addptr(src, 8);
8329       addptr(dst, 2 * 8);
8330 
8331       jmp(copy_tail, true);
8332     }
8333 
8334     // inflate 8 chars per iter
8335     bind(copy_8_loop);
8336     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8337     movdqu(Address(dst, len, Address::times_2), tmp1);
8338     addptr(len, 8);
8339     jcc(Assembler::notZero, copy_8_loop);
8340 
8341     bind(copy_tail);
8342     movl(len, tmp2);
8343 
8344     cmpl(len, 4);
8345     jccb(Assembler::less, copy_bytes);
8346 
8347     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8348     pmovzxbw(tmp1, tmp1);
8349     movq(Address(dst, 0), tmp1);
8350     subptr(len, 4);
8351     addptr(src, 4);
8352     addptr(dst, 8);
8353 
8354     bind(copy_bytes);
8355   } else {
8356     bind(below_threshold);
8357   }
8358 
8359   testl(len, len);
8360   jccb(Assembler::zero, done);
8361   lea(src, Address(src, len, Address::times_1));
8362   lea(dst, Address(dst, len, Address::times_2));
8363   negptr(len);
8364 
8365   // inflate 1 char per iter
8366   bind(copy_chars_loop);
8367   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8368   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8369   increment(len);
8370   jcc(Assembler::notZero, copy_chars_loop);
8371 
8372   bind(done);
8373 }
8374 
8375 
8376 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8377   switch(type) {
8378     case T_BYTE:
8379     case T_BOOLEAN:
8380       evmovdqub(dst, kmask, src, false, vector_len);
8381       break;
8382     case T_CHAR:
8383     case T_SHORT:
8384       evmovdquw(dst, kmask, src, false, vector_len);
8385       break;
8386     case T_INT:
8387     case T_FLOAT:
8388       evmovdqul(dst, kmask, src, false, vector_len);
8389       break;
8390     case T_LONG:
8391     case T_DOUBLE:
8392       evmovdquq(dst, kmask, src, false, vector_len);
8393       break;
8394     default:
8395       fatal("Unexpected type argument %s", type2name(type));
8396       break;
8397   }
8398 }
8399 
8400 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8401   switch(type) {
8402     case T_BYTE:
8403     case T_BOOLEAN:
8404       evmovdqub(dst, kmask, src, true, vector_len);
8405       break;
8406     case T_CHAR:
8407     case T_SHORT:
8408       evmovdquw(dst, kmask, src, true, vector_len);
8409       break;
8410     case T_INT:
8411     case T_FLOAT:
8412       evmovdqul(dst, kmask, src, true, vector_len);
8413       break;
8414     case T_LONG:
8415     case T_DOUBLE:
8416       evmovdquq(dst, kmask, src, true, vector_len);
8417       break;
8418     default:
8419       fatal("Unexpected type argument %s", type2name(type));
8420       break;
8421   }
8422 }
8423 
8424 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
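       // Logical NOT of the low `masklen` bits of the mask. For masklen 2 and 4
       // the byte-wide knot also flips the unused high bits, so the result is
       // ANDed with 0x3 / 0xF to keep those bits zero.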
8425   switch(masklen) {
8426     case 2:
8427        knotbl(dst, src);
8428        movl(rtmp, 3);
8429        kmovbl(ktmp, rtmp);
8430        kandbl(dst, ktmp, dst);
8431        break;
8432     case 4:
8433        knotbl(dst, src);
8434        movl(rtmp, 15);
8435        kmovbl(ktmp, rtmp);
8436        kandbl(dst, ktmp, dst);
8437        break;
8438     case 8:
8439        knotbl(dst, src);
8440        break;
8441     case 16:
8442        knotwl(dst, src);
8443        break;
8444     case 32:
8445        knotdl(dst, src);
8446        break;
8447     case 64:
8448        knotql(dst, src);
8449        break;
8450     default:
8451       fatal("Unexpected vector length %d", masklen);
8452       break;
8453   }
8454 }
8455 
8456 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8457   switch(type) {
8458     case T_BOOLEAN:
8459     case T_BYTE:
8460        kandbl(dst, src1, src2);
8461        break;
8462     case T_CHAR:
8463     case T_SHORT:
8464        kandwl(dst, src1, src2);
8465        break;
8466     case T_INT:
8467     case T_FLOAT:
8468        kanddl(dst, src1, src2);
8469        break;
8470     case T_LONG:
8471     case T_DOUBLE:
8472        kandql(dst, src1, src2);
8473        break;
8474     default:
8475       fatal("Unexpected type argument %s", type2name(type));
8476       break;
8477   }
8478 }
8479 
8480 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8481   switch(type) {
8482     case T_BOOLEAN:
8483     case T_BYTE:
8484        korbl(dst, src1, src2);
8485        break;
8486     case T_CHAR:
8487     case T_SHORT:
8488        korwl(dst, src1, src2);
8489        break;
8490     case T_INT:
8491     case T_FLOAT:
8492        kordl(dst, src1, src2);
8493        break;
8494     case T_LONG:
8495     case T_DOUBLE:
8496        korql(dst, src1, src2);
8497        break;
8498     default:
8499       fatal("Unexpected type argument %s", type2name(type));
8500       break;
8501   }
8502 }
8503 
8504 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8505   switch(type) {
8506     case T_BOOLEAN:
8507     case T_BYTE:
8508        kxorbl(dst, src1, src2);
8509        break;
8510     case T_CHAR:
8511     case T_SHORT:
8512        kxorwl(dst, src1, src2);
8513        break;
8514     case T_INT:
8515     case T_FLOAT:
8516        kxordl(dst, src1, src2);
8517        break;
8518     case T_LONG:
8519     case T_DOUBLE:
8520        kxorql(dst, src1, src2);
8521        break;
8522     default:
8523       fatal("Unexpected type argument %s", type2name(type));
8524       break;
8525   }
8526 }
8527 
8528 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8529   switch(type) {
8530     case T_BOOLEAN:
8531     case T_BYTE:
8532       evpermb(dst, mask, nds, src, merge, vector_len); break;
8533     case T_CHAR:
8534     case T_SHORT:
8535       evpermw(dst, mask, nds, src, merge, vector_len); break;
8536     case T_INT:
8537     case T_FLOAT:
8538       evpermd(dst, mask, nds, src, merge, vector_len); break;
8539     case T_LONG:
8540     case T_DOUBLE:
8541       evpermq(dst, mask, nds, src, merge, vector_len); break;
8542     default:
8543       fatal("Unexpected type argument %s", type2name(type)); break;
8544   }
8545 }
8546 
8547 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8548   switch(type) {
8549     case T_BOOLEAN:
8550     case T_BYTE:
8551       evpermb(dst, mask, nds, src, merge, vector_len); break;
8552     case T_CHAR:
8553     case T_SHORT:
8554       evpermw(dst, mask, nds, src, merge, vector_len); break;
8555     case T_INT:
8556     case T_FLOAT:
8557       evpermd(dst, mask, nds, src, merge, vector_len); break;
8558     case T_LONG:
8559     case T_DOUBLE:
8560       evpermq(dst, mask, nds, src, merge, vector_len); break;
8561     default:
8562       fatal("Unexpected type argument %s", type2name(type)); break;
8563   }
8564 }
8565 
8566 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8567   switch(type) {
8568     case T_BYTE:
8569       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8570     case T_SHORT:
8571       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8572     case T_INT:
8573       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8574     case T_LONG:
8575       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8576     default:
8577       fatal("Unexpected type argument %s", type2name(type)); break;
8578   }
8579 }
8580 
8581 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8582   switch(type) {
8583     case T_BYTE:
8584       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8585     case T_SHORT:
8586       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8587     case T_INT:
8588       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8589     case T_LONG:
8590       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8591     default:
8592       fatal("Unexpected type argument %s", type2name(type)); break;
8593   }
8594 }
8595 
8596 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8597   switch(type) {
8598     case T_BYTE:
8599       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8600     case T_SHORT:
8601       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8602     case T_INT:
8603       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8604     case T_LONG:
8605       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8606     default:
8607       fatal("Unexpected type argument %s", type2name(type)); break;
8608   }
8609 }
8610 
8611 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8612   switch(type) {
8613     case T_BYTE:
8614       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8615     case T_SHORT:
8616       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8617     case T_INT:
8618       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8619     case T_LONG:
8620       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8621     default:
8622       fatal("Unexpected type argument %s", type2name(type)); break;
8623   }
8624 }
8625 
8626 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8627   switch(type) {
8628     case T_INT:
8629       evpxord(dst, mask, nds, src, merge, vector_len); break;
8630     case T_LONG:
8631       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8632     default:
8633       fatal("Unexpected type argument %s", type2name(type)); break;
8634   }
8635 }
8636 
8637 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8638   switch(type) {
8639     case T_INT:
8640       evpxord(dst, mask, nds, src, merge, vector_len); break;
8641     case T_LONG:
8642       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8643     default:
8644       fatal("Unexpected type argument %s", type2name(type)); break;
8645   }
8646 }
8647 
8648 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8649   switch(type) {
8650     case T_INT:
8651       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8652     case T_LONG:
8653       evporq(dst, mask, nds, src, merge, vector_len); break;
8654     default:
8655       fatal("Unexpected type argument %s", type2name(type)); break;
8656   }
8657 }
8658 
8659 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8660   switch(type) {
8661     case T_INT:
8662       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8663     case T_LONG:
8664       evporq(dst, mask, nds, src, merge, vector_len); break;
8665     default:
8666       fatal("Unexpected type argument %s", type2name(type)); break;
8667   }
8668 }
8669 
8670 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8671   switch(type) {
8672     case T_INT:
8673       evpandd(dst, mask, nds, src, merge, vector_len); break;
8674     case T_LONG:
8675       evpandq(dst, mask, nds, src, merge, vector_len); break;
8676     default:
8677       fatal("Unexpected type argument %s", type2name(type)); break;
8678   }
8679 }
8680 
8681 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8682   switch(type) {
8683     case T_INT:
8684       evpandd(dst, mask, nds, src, merge, vector_len); break;
8685     case T_LONG:
8686       evpandq(dst, mask, nds, src, merge, vector_len); break;
8687     default:
8688       fatal("Unexpected type argument %s", type2name(type)); break;
8689   }
8690 }
8691 
8692 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) {
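        // Sets dst to 1 if src1 and src2 have any mask bit in common (any lane
        // true), 0 otherwise. Masks shorter than 8 bits are widened because the
        // smallest k-register test operates on a full byte.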
8693    masklen = masklen < 8 ? 8 : masklen;
8694    ktest(masklen, src1, src2);
8695    setb(Assembler::notZero, dst);
8696    movzbl(dst, dst);
8697 }
8698 
8699 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) {
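       // Sets dst to 1 when every mask bit selected by src2 is also set in src1
       // (all lanes true), 0 otherwise; for masks shorter than 8 bits the unused
       // high bits are excluded via the inverted src2 held in kscratch.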
8700   if (masklen < 8) {
8701     knotbl(kscratch, src2);
8702     kortestbl(src1, kscratch);
8703     setb(Assembler::carrySet, dst);
8704     movzbl(dst, dst);
8705   } else {
8706     ktest(masklen, src1, src2);
8707     setb(Assembler::carrySet, dst);
8708     movzbl(dst, dst);
8709   }
8710 }
8711 
8712 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
8713   switch(masklen) {
8714     case 8:
8715        kortestbl(src1, src2);
8716        break;
8717     case 16:
8718        kortestwl(src1, src2);
8719        break;
8720     case 32:
8721        kortestdl(src1, src2);
8722        break;
8723     case 64:
8724        kortestql(src1, src2);
8725        break;
8726     default:
8727       fatal("Unexpected mask length %d", masklen);
8728       break;
8729   }
8730 }
8731 
8732 
8733 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
8734   switch(masklen)  {
8735     case 8:
8736        ktestbl(src1, src2);
8737        break;
8738     case 16:
8739        ktestwl(src1, src2);
8740        break;
8741     case 32:
8742        ktestdl(src1, src2);
8743        break;
8744     case 64:
8745        ktestql(src1, src2);
8746        break;
8747     default:
8748       fatal("Unexpected mask length %d", masklen);
8749       break;
8750   }
8751 }
8752 
8753 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8754   switch(type) {
8755     case T_INT:
8756       evprold(dst, mask, src, shift, merge, vlen_enc); break;
8757     case T_LONG:
8758       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
8759     default:
8760       fatal("Unexpected type argument %s", type2name(type)); break;
8762   }
8763 }
8764 
8765 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8766   switch(type) {
8767     case T_INT:
8768       evprord(dst, mask, src, shift, merge, vlen_enc); break;
8769     case T_LONG:
8770       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
8771     default:
8772       fatal("Unexpected type argument %s", type2name(type)); break;
8773   }
8774 }
8775 
8776 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8777   switch(type) {
8778     case T_INT:
8779       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
8780     case T_LONG:
8781       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
8782     default:
8783       fatal("Unexpected type argument %s", type2name(type)); break;
8784   }
8785 }
8786 
8787 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8788   switch(type) {
8789     case T_INT:
8790       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
8791     case T_LONG:
8792       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
8793     default:
8794       fatal("Unexpected type argument %s", type2name(type)); break;
8795   }
8796 }
8797 #if COMPILER2_OR_JVMCI
8798 
8799 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
8800                                  Register length, Register temp, int vec_enc) {
8801   // Computing mask for predicated vector store.
8802   movptr(temp, -1);
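       // bzhi zeroes the bits of temp at positions >= length, leaving
       // temp = (1 << length) - 1 (for length < 64), i.e. a mask selecting the
       // low `length` lanes.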
8803   bzhiq(temp, temp, length);
8804   kmov(mask, temp);
8805   evmovdqu(bt, mask, dst, xmm, vec_enc);
8806 }
8807 
8808 // Set memory operation for length "less than" 64 bytes.
8809 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
8810                                        XMMRegister xmm, KRegister mask, Register length,
8811                                        Register temp, bool use64byteVector) {
8812   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8813   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8814   if (!use64byteVector) {
8815     fill32(dst, disp, xmm);
8816     subptr(length, 32 >> shift);
8817     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
8818   } else {
8819     assert(MaxVectorSize == 64, "vector length != 64");
8820     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
8821   }
8822 }
8823 
8824 
8825 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
8826                                        XMMRegister xmm, KRegister mask, Register length,
8827                                        Register temp) {
8828   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8829   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8830   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
8831 }
8832 
8833 
8834 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
8835   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8836   vmovdqu(Address(dst, disp), xmm);
8837 }
8838 
8839 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8840   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8841   BasicType type[] = {T_BYTE,  T_SHORT,  T_INT,   T_LONG};
8842   if (!use64byteVector) {
8843     fill32(dst, disp, xmm);
8844     fill32(dst, disp + 32, xmm);
8845   } else {
8846     evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
8847   }
8848 }
8849 
8850 #ifdef _LP64
8851 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
8852                                         Register count, Register rtmp, XMMRegister xtmp) {
8853   Label L_exit;
8854   Label L_fill_start;
8855   Label L_fill_64_bytes;
8856   Label L_fill_96_bytes;
8857   Label L_fill_128_bytes;
8858   Label L_fill_128_bytes_loop;
8859   Label L_fill_128_loop_header;
8860   Label L_fill_128_bytes_loop_header;
8861   Label L_fill_128_bytes_loop_pre_header;
8862   Label L_fill_zmm_sequence;
8863 
8864   int shift = -1;
8865   switch(type) {
8866     case T_BYTE:  shift = 0;
8867       break;
8868     case T_SHORT: shift = 1;
8869       break;
8870     case T_INT:   shift = 2;
8871       break;
8872     /* Uncomment when LONG fill stubs are supported.
8873     case T_LONG:  shift = 3;
8874       break;
8875     */
8876     default:
8877       fatal("Unhandled type: %s\n", type2name(type));
8878   }
8879 
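       // Fill strategy: requests of at most 128 bytes are handled straight-line
       // with (masked) 32/64-byte stores; larger requests first align the
       // destination to a 32-byte boundary with a masked store and then loop,
       // filling 128 bytes per iteration. With MaxVectorSize == 64, byte counts
       // above AVX3Threshold (or all counts when AVX3Threshold is 0) use the
       // analogous 64-byte ZMM sequence further below.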
8880   if (AVX3Threshold != 0  || MaxVectorSize == 32) {
8881 
8882     if (MaxVectorSize == 64) {
8883       cmpq(count, AVX3Threshold >> shift);
8884       jcc(Assembler::greater, L_fill_zmm_sequence);
8885     }
8886 
8887     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
8888 
8889     bind(L_fill_start);
8890 
8891     cmpq(count, 32 >> shift);
8892     jccb(Assembler::greater, L_fill_64_bytes);
8893     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
8894     jmp(L_exit);
8895 
8896     bind(L_fill_64_bytes);
8897     cmpq(count, 64 >> shift);
8898     jccb(Assembler::greater, L_fill_96_bytes);
8899     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
8900     jmp(L_exit);
8901 
8902     bind(L_fill_96_bytes);
8903     cmpq(count, 96 >> shift);
8904     jccb(Assembler::greater, L_fill_128_bytes);
8905     fill64(to, 0, xtmp);
8906     subq(count, 64 >> shift);
8907     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
8908     jmp(L_exit);
8909 
8910     bind(L_fill_128_bytes);
8911     cmpq(count, 128 >> shift);
8912     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
8913     fill64(to, 0, xtmp);
8914     fill32(to, 64, xtmp);
8915     subq(count, 96 >> shift);
8916     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
8917     jmp(L_exit);
8918 
8919     bind(L_fill_128_bytes_loop_pre_header);
8920     {
8921       mov(rtmp, to);
8922       andq(rtmp, 31);
8923       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
8924       negq(rtmp);
8925       addq(rtmp, 32);
8926       mov64(r8, -1L);
8927       bzhiq(r8, r8, rtmp);
8928       kmovql(k2, r8);
8929       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit);
8930       addq(to, rtmp);
8931       shrq(rtmp, shift);
8932       subq(count, rtmp);
8933     }
8934 
8935     cmpq(count, 128 >> shift);
8936     jcc(Assembler::less, L_fill_start);
8937 
8938     bind(L_fill_128_bytes_loop_header);
8939     subq(count, 128 >> shift);
8940 
8941     align32();
8942     bind(L_fill_128_bytes_loop);
8943       fill64(to, 0, xtmp);
8944       fill64(to, 64, xtmp);
8945       addq(to, 128);
8946       subq(count, 128 >> shift);
8947       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
8948 
8949     addq(count, 128 >> shift);
8950     jcc(Assembler::zero, L_exit);
8951     jmp(L_fill_start);
8952   }
8953 
8954   if (MaxVectorSize == 64) {
8955     // Sequence using 64 byte ZMM register.
8956     Label L_fill_128_bytes_zmm;
8957     Label L_fill_192_bytes_zmm;
8958     Label L_fill_192_bytes_loop_zmm;
8959     Label L_fill_192_bytes_loop_header_zmm;
8960     Label L_fill_192_bytes_loop_pre_header_zmm;
8961     Label L_fill_start_zmm_sequence;
8962 
8963     bind(L_fill_zmm_sequence);
8964     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
8965 
8966     bind(L_fill_start_zmm_sequence);
8967     cmpq(count, 64 >> shift);
8968     jccb(Assembler::greater, L_fill_128_bytes_zmm);
8969     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
8970     jmp(L_exit);
8971 
8972     bind(L_fill_128_bytes_zmm);
8973     cmpq(count, 128 >> shift);
8974     jccb(Assembler::greater, L_fill_192_bytes_zmm);
8975     fill64(to, 0, xtmp, true);
8976     subq(count, 64 >> shift);
8977     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
8978     jmp(L_exit);
8979 
8980     bind(L_fill_192_bytes_zmm);
8981     cmpq(count, 192 >> shift);
8982     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
8983     fill64(to, 0, xtmp, true);
8984     fill64(to, 64, xtmp, true);
8985     subq(count, 128 >> shift);
8986     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
8987     jmp(L_exit);
8988 
8989     bind(L_fill_192_bytes_loop_pre_header_zmm);
8990     {
8991       movq(rtmp, to);
8992       andq(rtmp, 63);
8993       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
8994       negq(rtmp);
8995       addq(rtmp, 64);
8996       mov64(r8, -1L);
8997       bzhiq(r8, r8, rtmp);
8998       kmovql(k2, r8);
8999       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit);
9000       addq(to, rtmp);
9001       shrq(rtmp, shift);
9002       subq(count, rtmp);
9003     }
9004 
9005     cmpq(count, 192 >> shift);
9006     jcc(Assembler::less, L_fill_start_zmm_sequence);
9007 
9008     bind(L_fill_192_bytes_loop_header_zmm);
9009     subq(count, 192 >> shift);
9010 
9011     align32();
9012     bind(L_fill_192_bytes_loop_zmm);
9013       fill64(to, 0, xtmp, true);
9014       fill64(to, 64, xtmp, true);
9015       fill64(to, 128, xtmp, true);
9016       addq(to, 192);
9017       subq(count, 192 >> shift);
9018       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9019 
9020     addq(count, 192 >> shift);
9021     jcc(Assembler::zero, L_exit);
9022     jmp(L_fill_start_zmm_sequence);
9023   }
9024   bind(L_exit);
9025 }
9026 #endif
9027 #endif //COMPILER2_OR_JVMCI
9028 
9029 
9030 #ifdef _LP64
9031 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9032   Label done;
9033   cvttss2sil(dst, src);
9034   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9035   cmpl(dst, 0x80000000); // float_sign_flip
9036   jccb(Assembler::notEqual, done);
9037   subptr(rsp, 8);
9038   movflt(Address(rsp, 0), src);
9039   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9040   pop(dst);
9041   bind(done);
9042 }
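     // Note: for float/double -> int/long, cvttss2si/cvttsd2si return the
     // "integer indefinite" value (0x80000000, or 0x8000000000000000 for the
     // 64-bit forms) when the input is NaN or out of range, whereas the JLS
     // requires NaN -> 0 and saturation to MIN_VALUE/MAX_VALUE. Comparing the
     // result against the sign-flip constant detects that case and the fixup
     // stub recomputes the JLS-conforming result; convert_d2i, convert_f2l and
     // convert_d2l below follow the same pattern.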
9043 
9044 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9045   Label done;
9046   cvttsd2sil(dst, src);
9047   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9048   cmpl(dst, 0x80000000); // float_sign_flip
9049   jccb(Assembler::notEqual, done);
9050   subptr(rsp, 8);
9051   movdbl(Address(rsp, 0), src);
9052   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9053   pop(dst);
9054   bind(done);
9055 }
9056 
9057 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9058   Label done;
9059   cvttss2siq(dst, src);
9060   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9061   jccb(Assembler::notEqual, done);
9062   subptr(rsp, 8);
9063   movflt(Address(rsp, 0), src);
9064   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9065   pop(dst);
9066   bind(done);
9067 }
9068 
9069 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9070   Label done;
9071   cvttsd2siq(dst, src);
9072   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9073   jccb(Assembler::notEqual, done);
9074   subptr(rsp, 8);
9075   movdbl(Address(rsp, 0), src);
9076   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9077   pop(dst);
9078   bind(done);
9079 }
9080 
9081 void MacroAssembler::cache_wb(Address line)
9082 {
9083   // 64 bit cpus always support clflush
9084   assert(VM_Version::supports_clflush(), "clflush should be available");
9085   bool optimized = VM_Version::supports_clflushopt();
9086   bool no_evict = VM_Version::supports_clwb();
9087 
9088   // prefer clwb (writeback without evict) otherwise
9089   // prefer clflushopt (potentially parallel writeback with evict)
9090   // otherwise fallback on clflush (serial writeback with evict)
9091 
9092   if (optimized) {
9093     if (no_evict) {
9094       clwb(line);
9095     } else {
9096       clflushopt(line);
9097     }
9098   } else {
9099     // no need for fence when using CLFLUSH
9100     clflush(line);
9101   }
9102 }
9103 
9104 void MacroAssembler::cache_wbsync(bool is_pre)
9105 {
9106   assert(VM_Version::supports_clflush(), "clflush should be available");
9107   bool optimized = VM_Version::supports_clflushopt();
9108   bool no_evict = VM_Version::supports_clwb();
9109 
9110   // pick the correct implementation
9111 
9112   if (!is_pre && (optimized || no_evict)) {
9113     // need an sfence for post flush when using clflushopt or clwb
9114     // otherwise no need for any synchronization
9115 
9116     sfence();
9117   }
9118 }
9119 
9120 #endif // _LP64
9121 
9122 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9123   switch (cond) {
9124     // Note some conditions are synonyms for others
9125     case Assembler::zero:         return Assembler::notZero;
9126     case Assembler::notZero:      return Assembler::zero;
9127     case Assembler::less:         return Assembler::greaterEqual;
9128     case Assembler::lessEqual:    return Assembler::greater;
9129     case Assembler::greater:      return Assembler::lessEqual;
9130     case Assembler::greaterEqual: return Assembler::less;
9131     case Assembler::below:        return Assembler::aboveEqual;
9132     case Assembler::belowEqual:   return Assembler::above;
9133     case Assembler::above:        return Assembler::belowEqual;
9134     case Assembler::aboveEqual:   return Assembler::below;
9135     case Assembler::overflow:     return Assembler::noOverflow;
9136     case Assembler::noOverflow:   return Assembler::overflow;
9137     case Assembler::negative:     return Assembler::positive;
9138     case Assembler::positive:     return Assembler::negative;
9139     case Assembler::parity:       return Assembler::noParity;
9140     case Assembler::noParity:     return Assembler::parity;
9141   }
9142   ShouldNotReachHere(); return Assembler::overflow;
9143 }
9144 
9145 SkipIfEqual::SkipIfEqual(
9146     MacroAssembler* masm, const bool* flag_addr, bool value) {
9147   _masm = masm;
9148   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9149   _masm->jcc(Assembler::equal, _label);
9150 }
9151 
9152 SkipIfEqual::~SkipIfEqual() {
9153   _masm->bind(_label);
9154 }
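     //
     // Typical use (illustrative sketch only; the flag name is just an example):
     //
     //   {
     //     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
     //     // ... code emitted here is jumped over at runtime when the flag
     //     //     equals `false` ...
     //   } // destructor binds the skip label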
9155 
9156 // 32-bit Windows has its own fast-path implementation
9157 // of get_thread
9158 #if !defined(WIN32) || defined(_LP64)
9159 
9160 // This is simply a call to Thread::current()
9161 void MacroAssembler::get_thread(Register thread) {
9162   if (thread != rax) {
9163     push(rax);
9164   }
9165   LP64_ONLY(push(rdi);)
9166   LP64_ONLY(push(rsi);)
9167   push(rdx);
9168   push(rcx);
9169 #ifdef _LP64
9170   push(r8);
9171   push(r9);
9172   push(r10);
9173   push(r11);
9174 #endif
9175 
9176   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9177 
9178 #ifdef _LP64
9179   pop(r11);
9180   pop(r10);
9181   pop(r9);
9182   pop(r8);
9183 #endif
9184   pop(rcx);
9185   pop(rdx);
9186   LP64_ONLY(pop(rsi);)
9187   LP64_ONLY(pop(rdi);)
9188   if (thread != rax) {
9189     mov(thread, rax);
9190     pop(rax);
9191   }
9192 }
9193 
9194 #endif // !WIN32 || _LP64
--- EOF ---