1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/flags/flagSetting.hpp"
  44 #include "runtime/interfaceSupport.inline.hpp"
  45 #include "runtime/jniHandles.hpp"
  46 #include "runtime/objectMonitor.hpp"
  47 #include "runtime/os.hpp"
  48 #include "runtime/safepoint.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/thread.hpp"
  53 #include "utilities/macros.hpp"
  54 #include "crc32c.h"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that have distinct implementations for 32 and 64 bit,
  94 // unless the difference is trivial (a line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 
 119 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 120   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 121 }
 122 
 123 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 124   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 125 }
 126 
 127 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 128   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 129 }
 130 
 131 void MacroAssembler::extend_sign(Register hi, Register lo) {
 132   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 133   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 134     cdql();
 135   } else {
 136     movl(hi, lo);
 137     sarl(hi, 31);
 138   }
 139 }
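     // Illustrative note: extend_sign fills hi with copies of lo's sign bit,
     // e.g. lo = 7 gives hi = 0, while lo = -5 (0xFFFFFFFB) gives hi = 0xFFFFFFFF,
     // which is exactly what cdql produces in rdx:rax.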
 140 
 141 void MacroAssembler::jC2(Register tmp, Label& L) {
 142   // set parity bit if FPU flag C2 is set (via rax)
 143   save_rax(tmp);
 144   fwait(); fnstsw_ax();
 145   sahf();
 146   restore_rax(tmp);
 147   // branch
 148   jcc(Assembler::parity, L);
 149 }
 150 
 151 void MacroAssembler::jnC2(Register tmp, Label& L) {
 152   // set parity bit if FPU flag C2 is set (via rax)
 153   save_rax(tmp);
 154   fwait(); fnstsw_ax();
 155   sahf();
 156   restore_rax(tmp);
 157   // branch
 158   jcc(Assembler::noParity, L);
 159 }
 160 
 161 // 32bit can do a case table jump in one instruction but we no longer allow the base
 162 // to be installed in the Address class
 163 void MacroAssembler::jump(ArrayAddress entry) {
 164   jmp(as_Address(entry));
 165 }
 166 
 167 // Note: y_lo will be destroyed
 168 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 169   // Long compare for Java (semantics as described in JVM spec.)
 170   Label high, low, done;
 171 
 172   cmpl(x_hi, y_hi);
 173   jcc(Assembler::less, low);
 174   jcc(Assembler::greater, high);
 175   // x_hi is the return register
 176   xorl(x_hi, x_hi);
 177   cmpl(x_lo, y_lo);
 178   jcc(Assembler::below, low);
 179   jcc(Assembler::equal, done);
 180 
 181   bind(high);
 182   xorl(x_hi, x_hi);
 183   increment(x_hi);
 184   jmp(done);
 185 
 186   bind(low);
 187   xorl(x_hi, x_hi);
 188   decrementl(x_hi);
 189 
 190   bind(done);
 191 }
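     // Illustrative note: on return x_hi holds the Java lcmp result:
     // -1 if x < y, 0 if x == y, +1 if x > y (signed compare on the high words,
     // unsigned compare on the low words, as the code above shows).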
 192 
 193 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 194     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 195 }
 196 
 197 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 198   // leal(dst, as_Address(adr));
 199   // see note in movl as to why we must use a move
 200   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 201 }
 202 
 203 void MacroAssembler::leave() {
 204   mov(rsp, rbp);
 205   pop(rbp);
 206 }
 207 
 208 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 209   // Multiplication of two Java long values stored on the stack
 210   // as illustrated below. Result is in rdx:rax.
 211   //
 212   // rsp ---> [  ??  ] \               \
 213   //            ....    | y_rsp_offset  |
 214   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 215   //          [ y_hi ]                  | (in bytes)
 216   //            ....                    |
 217   //          [ x_lo ]                 /
 218   //          [ x_hi ]
 219   //            ....
 220   //
 221   // Basic idea: lo(result) = lo(x_lo * y_lo)
 222   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 223   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 224   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 225   Label quick;
 226   // load x_hi, y_hi and check if quick
 227   // multiplication is possible
 228   movl(rbx, x_hi);
 229   movl(rcx, y_hi);
 230   movl(rax, rbx);
 231   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 232   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 233   // do full multiplication
 234   // 1st step
 235   mull(y_lo);                                    // x_hi * y_lo
 236   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 237   // 2nd step
 238   movl(rax, x_lo);
 239   mull(rcx);                                     // x_lo * y_hi
 240   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 241   // 3rd step
 242   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 243   movl(rax, x_lo);
 244   mull(y_lo);                                    // x_lo * y_lo
 245   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 246 }
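     // Explanatory note: the x_hi * y_hi term (and the high halves of the two
     // cross products) would only contribute to bits 64 and above, which are
     // discarded in a 64-bit result, so they are never computed.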
 247 
 248 void MacroAssembler::lneg(Register hi, Register lo) {
 249   negl(lo);
 250   adcl(hi, 0);
 251   negl(hi);
 252 }
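     // Explanatory note: this computes -(hi:lo) as lo' = -lo and
     // hi' = -(hi + carry), where the carry is 1 whenever lo was non-zero,
     // i.e. the usual two's complement negation of a 64-bit value split
     // across two 32-bit registers.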
 253 
 254 void MacroAssembler::lshl(Register hi, Register lo) {
 255   // Java shift left long support (semantics as described in JVM spec., p.305)
 256   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 257   // shift value is in rcx !
 258   assert(hi != rcx, "must not use rcx");
 259   assert(lo != rcx, "must not use rcx");
 260   const Register s = rcx;                        // shift count
 261   const int      n = BitsPerWord;
 262   Label L;
 263   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 264   cmpl(s, n);                                    // if (s < n)
 265   jcc(Assembler::less, L);                       // else (s >= n)
 266   movl(hi, lo);                                  // x := x << n
 267   xorl(lo, lo);
 268   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 269   bind(L);                                       // s (mod n) < n
 270   shldl(hi, lo);                                 // x := x << s
 271   shll(lo);
 272 }
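     // Illustrative note: for a shift count of 40 the code takes the s >= 32
     // path: hi = lo, lo = 0, and the final shld/shl by (40 mod 32) = 8 leaves
     // hi = original_lo << 8 and lo = 0, matching (x << 32) << 8.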
 273 
 274 
 275 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 276   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 277   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 278   assert(hi != rcx, "must not use rcx");
 279   assert(lo != rcx, "must not use rcx");
 280   const Register s = rcx;                        // shift count
 281   const int      n = BitsPerWord;
 282   Label L;
 283   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 284   cmpl(s, n);                                    // if (s < n)
 285   jcc(Assembler::less, L);                       // else (s >= n)
 286   movl(lo, hi);                                  // x := x >> n
 287   if (sign_extension) sarl(hi, 31);
 288   else                xorl(hi, hi);
 289   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 290   bind(L);                                       // s (mod n) < n
 291   shrdl(lo, hi);                                 // x := x >> s
 292   if (sign_extension) sarl(hi);
 293   else                shrl(hi);
 294 }
 295 
 296 void MacroAssembler::movoop(Register dst, jobject obj) {
 297   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 298 }
 299 
 300 void MacroAssembler::movoop(Address dst, jobject obj) {
 301   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 302 }
 303 
 304 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 305   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 309   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 313   // scratch register is not used,
 314   // it is defined to match parameters of 64-bit version of this method.
 315   if (src.is_lval()) {
 316     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 317   } else {
 318     movl(dst, as_Address(src));
 319   }
 320 }
 321 
 322 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 323   movl(as_Address(dst), src);
 324 }
 325 
 326 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 327   movl(dst, as_Address(src));
 328 }
 329 
 330 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 331 void MacroAssembler::movptr(Address dst, intptr_t src) {
 332   movl(dst, src);
 333 }
 334 
 335 
 336 void MacroAssembler::pop_callee_saved_registers() {
 337   pop(rcx);
 338   pop(rdx);
 339   pop(rdi);
 340   pop(rsi);
 341 }
 342 
 343 void MacroAssembler::push_callee_saved_registers() {
 344   push(rsi);
 345   push(rdi);
 346   push(rdx);
 347   push(rcx);
 348 }
 349 
 350 void MacroAssembler::pushoop(jobject obj) {
 351   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 352 }
 353 
 354 void MacroAssembler::pushklass(Metadata* obj) {
 355   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 356 }
 357 
 358 void MacroAssembler::pushptr(AddressLiteral src) {
 359   if (src.is_lval()) {
 360     push_literal32((int32_t)src.target(), src.rspec());
 361   } else {
 362     pushl(as_Address(src));
 363   }
 364 }
 365 
 366 static void pass_arg0(MacroAssembler* masm, Register arg) {
 367   masm->push(arg);
 368 }
 369 
 370 static void pass_arg1(MacroAssembler* masm, Register arg) {
 371   masm->push(arg);
 372 }
 373 
 374 static void pass_arg2(MacroAssembler* masm, Register arg) {
 375   masm->push(arg);
 376 }
 377 
 378 static void pass_arg3(MacroAssembler* masm, Register arg) {
 379   masm->push(arg);
 380 }
 381 
 382 #ifndef PRODUCT
 383 extern "C" void findpc(intptr_t x);
 384 #endif
 385 
 386 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 387   // In order to get locks to work, we need to fake an in_VM state
 388   JavaThread* thread = JavaThread::current();
 389   JavaThreadState saved_state = thread->thread_state();
 390   thread->set_thread_state(_thread_in_vm);
 391   if (ShowMessageBoxOnError) {
 392     JavaThread* thread = JavaThread::current();
 393     JavaThreadState saved_state = thread->thread_state();
 394     thread->set_thread_state(_thread_in_vm);
 395     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 396       ttyLocker ttyl;
 397       BytecodeCounter::print();
 398     }
 399     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 400     // This is the value of eip which points to where verify_oop will return.
 401     if (os::message_box(msg, "Execution stopped, print registers?")) {
 402       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 403       BREAKPOINT;
 404     }
 405   }
 406   fatal("DEBUG MESSAGE: %s", msg);
 407 }
 408 
 409 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 410   ttyLocker ttyl;
 411   FlagSetting fs(Debugging, true);
 412   tty->print_cr("eip = 0x%08x", eip);
 413 #ifndef PRODUCT
 414   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 415     tty->cr();
 416     findpc(eip);
 417     tty->cr();
 418   }
 419 #endif
 420 #define PRINT_REG(rax) \
 421   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 422   PRINT_REG(rax);
 423   PRINT_REG(rbx);
 424   PRINT_REG(rcx);
 425   PRINT_REG(rdx);
 426   PRINT_REG(rdi);
 427   PRINT_REG(rsi);
 428   PRINT_REG(rbp);
 429   PRINT_REG(rsp);
 430 #undef PRINT_REG
 431   // Print some words near the top of the stack.
 432   int* dump_sp = (int*) rsp;
 433   for (int col1 = 0; col1 < 8; col1++) {
 434     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 435     os::print_location(tty, *dump_sp++);
 436   }
 437   for (int row = 0; row < 16; row++) {
 438     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 439     for (int col = 0; col < 8; col++) {
 440       tty->print(" 0x%08x", *dump_sp++);
 441     }
 442     tty->cr();
 443   }
 444   // Print some instructions around pc:
 445   Disassembler::decode((address)eip-64, (address)eip);
 446   tty->print_cr("--------");
 447   Disassembler::decode((address)eip, (address)eip+32);
 448 }
 449 
 450 void MacroAssembler::stop(const char* msg) {
 451   ExternalAddress message((address)msg);
 452   // push address of message
 453   pushptr(message.addr());
 454   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 455   pusha();                                            // push registers
 456   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 457   hlt();
 458 }
 459 
 460 void MacroAssembler::warn(const char* msg) {
 461   push_CPU_state();
 462 
 463   ExternalAddress message((address) msg);
 464   // push address of message
 465   pushptr(message.addr());
 466 
 467   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 468   addl(rsp, wordSize);       // discard argument
 469   pop_CPU_state();
 470 }
 471 
 472 void MacroAssembler::print_state() {
 473   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 474   pusha();                                            // push registers
 475 
 476   push_CPU_state();
 477   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 478   pop_CPU_state();
 479 
 480   popa();
 481   addl(rsp, wordSize);
 482 }
 483 
 484 #else // _LP64
 485 
 486 // 64 bit versions
 487 
 488 Address MacroAssembler::as_Address(AddressLiteral adr) {
 489   // amd64 always does this as a pc-rel
 490   // we can be absolute or disp based on the instruction type
 491   // jmp/call are displacements; others are absolute
 492   assert(!adr.is_lval(), "must be rval");
 493   assert(reachable(adr), "must be");
 494   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 495 
 496 }
 497 
 498 Address MacroAssembler::as_Address(ArrayAddress adr) {
 499   AddressLiteral base = adr.base();
 500   lea(rscratch1, base);
 501   Address index = adr.index();
 502   assert(index._disp == 0, "must not have disp"); // maybe it can?
 503   Address array(rscratch1, index._index, index._scale, index._disp);
 504   return array;
 505 }
 506 
 507 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 508   Label L, E;
 509 
 510 #ifdef _WIN64
 511   // Windows always allocates space for its register args
 512   assert(num_args <= 4, "only register arguments supported");
 513   subq(rsp,  frame::arg_reg_save_area_bytes);
 514 #endif
 515 
 516   // Align stack if necessary
 517   testl(rsp, 15);
 518   jcc(Assembler::zero, L);
 519 
 520   subq(rsp, 8);
 521   {
 522     call(RuntimeAddress(entry_point));
 523   }
 524   addq(rsp, 8);
 525   jmp(E);
 526 
 527   bind(L);
 528   {
 529     call(RuntimeAddress(entry_point));
 530   }
 531 
 532   bind(E);
 533 
 534 #ifdef _WIN64
 535   // restore stack pointer
 536   addq(rsp, frame::arg_reg_save_area_bytes);
 537 #endif
 538 
 539 }
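     // Explanatory note on the alignment dance above: the x86-64 ABI requires
     // rsp to be 16-byte aligned at a call instruction.  testl(rsp, 15) checks
     // the current alignment; if rsp is misaligned (typically off by 8), an
     // extra 8-byte adjustment is made around the call, otherwise the call is
     // made directly.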
 540 
 541 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 542   assert(!src2.is_lval(), "should use cmpptr");
 543 
 544   if (reachable(src2)) {
 545     cmpq(src1, as_Address(src2));
 546   } else {
 547     lea(rscratch1, src2);
 548     Assembler::cmpq(src1, Address(rscratch1, 0));
 549   }
 550 }
 551 
 552 int MacroAssembler::corrected_idivq(Register reg) {
 553   // Full implementation of Java ldiv and lrem; checks for special
 554   // case as described in JVM spec., p.243 & p.271.  The function
 555   // returns the (pc) offset of the idivq instruction - may be needed
 556   // for implicit exceptions.
 557   //
 558   //         normal case                           special case
 559   //
 560   // input : rax: dividend                         min_long
 561   //         reg: divisor   (may not be eax/edx)   -1
 562   //
 563   // output: rax: quotient  (= rax idiv reg)       min_long
 564   //         rdx: remainder (= rax irem reg)       0
 565   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 566   static const int64_t min_long = 0x8000000000000000;
 567   Label normal_case, special_case;
 568 
 569   // check for special case
 570   cmp64(rax, ExternalAddress((address) &min_long));
 571   jcc(Assembler::notEqual, normal_case);
 572   xorl(rdx, rdx); // prepare rdx for possible special case (where
 573                   // remainder = 0)
 574   cmpq(reg, -1);
 575   jcc(Assembler::equal, special_case);
 576 
 577   // handle normal case
 578   bind(normal_case);
 579   cdqq();
 580   int idivq_offset = offset();
 581   idivq(reg);
 582 
 583   // normal and special case exit
 584   bind(special_case);
 585 
 586   return idivq_offset;
 587 }
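     // Explanatory note: the special case exists because min_long / -1
     // overflows (the mathematically correct quotient 2^63 is not representable)
     // and the idivq instruction would raise a fault for it; Java instead defines
     // the result as quotient = min_long, remainder = 0, which is what the
     // special-case path produces.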
 588 
 589 void MacroAssembler::decrementq(Register reg, int value) {
 590   if (value == min_jint) { subq(reg, value); return; }
 591   if (value <  0) { incrementq(reg, -value); return; }
 592   if (value == 0) {                        ; return; }
 593   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 594   /* else */      { subq(reg, value)       ; return; }
 595 }
 596 
 597 void MacroAssembler::decrementq(Address dst, int value) {
 598   if (value == min_jint) { subq(dst, value); return; }
 599   if (value <  0) { incrementq(dst, -value); return; }
 600   if (value == 0) {                        ; return; }
 601   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 602   /* else */      { subq(dst, value)       ; return; }
 603 }
 604 
 605 void MacroAssembler::incrementq(AddressLiteral dst) {
 606   if (reachable(dst)) {
 607     incrementq(as_Address(dst));
 608   } else {
 609     lea(rscratch1, dst);
 610     incrementq(Address(rscratch1, 0));
 611   }
 612 }
 613 
 614 void MacroAssembler::incrementq(Register reg, int value) {
 615   if (value == min_jint) { addq(reg, value); return; }
 616   if (value <  0) { decrementq(reg, -value); return; }
 617   if (value == 0) {                        ; return; }
 618   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 619   /* else */      { addq(reg, value)       ; return; }
 620 }
 621 
 622 void MacroAssembler::incrementq(Address dst, int value) {
 623   if (value == min_jint) { addq(dst, value); return; }
 624   if (value <  0) { decrementq(dst, -value); return; }
 625   if (value == 0) {                        ; return; }
 626   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 627   /* else */      { addq(dst, value)       ; return; }
 628 }
 629 
 630 // 32bit can do a case table jump in one instruction but we no longer allow the base
 631 // to be installed in the Address class
 632 void MacroAssembler::jump(ArrayAddress entry) {
 633   lea(rscratch1, entry.base());
 634   Address dispatch = entry.index();
 635   assert(dispatch._base == noreg, "must be");
 636   dispatch._base = rscratch1;
 637   jmp(dispatch);
 638 }
 639 
 640 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 641   ShouldNotReachHere(); // 64bit doesn't use two regs
 642   cmpq(x_lo, y_lo);
 643 }
 644 
 645 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 646     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 647 }
 648 
 649 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 650   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 651   movptr(dst, rscratch1);
 652 }
 653 
 654 void MacroAssembler::leave() {
 655   // %%% is this really better? Why not on 32bit too?
 656   emit_int8((unsigned char)0xC9); // LEAVE
 657 }
 658 
 659 void MacroAssembler::lneg(Register hi, Register lo) {
 660   ShouldNotReachHere(); // 64bit doesn't use two regs
 661   negq(lo);
 662 }
 663 
 664 void MacroAssembler::movoop(Register dst, jobject obj) {
 665   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 666 }
 667 
 668 void MacroAssembler::movoop(Address dst, jobject obj) {
 669   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 670   movq(dst, rscratch1);
 671 }
 672 
 673 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 674   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 675 }
 676 
 677 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 678   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 679   movq(dst, rscratch1);
 680 }
 681 
 682 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 683   if (src.is_lval()) {
 684     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 685   } else {
 686     if (reachable(src)) {
 687       movq(dst, as_Address(src));
 688     } else {
 689       lea(scratch, src);
 690       movq(dst, Address(scratch, 0));
 691     }
 692   }
 693 }
 694 
 695 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 696   movq(as_Address(dst), src);
 697 }
 698 
 699 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 700   movq(dst, as_Address(src));
 701 }
 702 
 703 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 704 void MacroAssembler::movptr(Address dst, intptr_t src) {
 705   if (is_simm32(src)) {
 706     movptr(dst, checked_cast<int32_t>(src));
 707   } else {
 708     mov64(rscratch1, src);
 709     movq(dst, rscratch1);
 710   }
 711 }
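     // Explanatory note: x86-64 has no 64-bit immediate-to-memory store; a store
     // immediate only takes a sign-extended 32-bit value.  Constants that fit in
     // a signed 32-bit immediate (e.g. NULL_WORD or small offsets) go directly,
     // anything larger is materialized in rscratch1 first.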
 712 
 713 // These are mostly for initializing NULL
 714 void MacroAssembler::movptr(Address dst, int32_t src) {
 715   movslq(dst, src);
 716 }
 717 
 718 void MacroAssembler::movptr(Register dst, int32_t src) {
 719   mov64(dst, (intptr_t)src);
 720 }
 721 
 722 void MacroAssembler::pushoop(jobject obj) {
 723   movoop(rscratch1, obj);
 724   push(rscratch1);
 725 }
 726 
 727 void MacroAssembler::pushklass(Metadata* obj) {
 728   mov_metadata(rscratch1, obj);
 729   push(rscratch1);
 730 }
 731 
 732 void MacroAssembler::pushptr(AddressLiteral src) {
 733   lea(rscratch1, src);
 734   if (src.is_lval()) {
 735     push(rscratch1);
 736   } else {
 737     pushq(Address(rscratch1, 0));
 738   }
 739 }
 740 
 741 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 742   reset_last_Java_frame(r15_thread, clear_fp);
 743 }
 744 
 745 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 746                                          Register last_java_fp,
 747                                          address  last_java_pc) {
 748   vzeroupper();
 749   // determine last_java_sp register
 750   if (!last_java_sp->is_valid()) {
 751     last_java_sp = rsp;
 752   }
 753 
 754   // last_java_fp is optional
 755   if (last_java_fp->is_valid()) {
 756     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 757            last_java_fp);
 758   }
 759 
 760   // last_java_pc is optional
 761   if (last_java_pc != NULL) {
 762     Address java_pc(r15_thread,
 763                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 764     lea(rscratch1, InternalAddress(last_java_pc));
 765     movptr(java_pc, rscratch1);
 766   }
 767 
 768   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 769 }
 770 
 771 static void pass_arg0(MacroAssembler* masm, Register arg) {
 772   if (c_rarg0 != arg ) {
 773     masm->mov(c_rarg0, arg);
 774   }
 775 }
 776 
 777 static void pass_arg1(MacroAssembler* masm, Register arg) {
 778   if (c_rarg1 != arg ) {
 779     masm->mov(c_rarg1, arg);
 780   }
 781 }
 782 
 783 static void pass_arg2(MacroAssembler* masm, Register arg) {
 784   if (c_rarg2 != arg ) {
 785     masm->mov(c_rarg2, arg);
 786   }
 787 }
 788 
 789 static void pass_arg3(MacroAssembler* masm, Register arg) {
 790   if (c_rarg3 != arg ) {
 791     masm->mov(c_rarg3, arg);
 792   }
 793 }
 794 
 795 void MacroAssembler::stop(const char* msg) {
 796   if (ShowMessageBoxOnError) {
 797     address rip = pc();
 798     pusha(); // get regs on stack
 799     lea(c_rarg1, InternalAddress(rip));
 800     movq(c_rarg2, rsp); // pass pointer to regs array
 801   }
 802   lea(c_rarg0, ExternalAddress((address) msg));
 803   andq(rsp, -16); // align stack as required by ABI
 804   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 805   hlt();
 806 }
 807 
 808 void MacroAssembler::warn(const char* msg) {
 809   push(rbp);
 810   movq(rbp, rsp);
 811   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 812   push_CPU_state();   // keeps alignment at 16 bytes
 813   lea(c_rarg0, ExternalAddress((address) msg));
 814   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 815   call(rax);
 816   pop_CPU_state();
 817   mov(rsp, rbp);
 818   pop(rbp);
 819 }
 820 
 821 void MacroAssembler::print_state() {
 822   address rip = pc();
 823   pusha();            // get regs on stack
 824   push(rbp);
 825   movq(rbp, rsp);
 826   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 827   push_CPU_state();   // keeps alignment at 16 bytes
 828 
 829   lea(c_rarg0, InternalAddress(rip));
 830   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 831   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 832 
 833   pop_CPU_state();
 834   mov(rsp, rbp);
 835   pop(rbp);
 836   popa();
 837 }
 838 
 839 #ifndef PRODUCT
 840 extern "C" void findpc(intptr_t x);
 841 #endif
 842 
 843 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 844   // In order to get locks to work, we need to fake an in_VM state
 845   if (ShowMessageBoxOnError) {
 846     JavaThread* thread = JavaThread::current();
 847     JavaThreadState saved_state = thread->thread_state();
 848     thread->set_thread_state(_thread_in_vm);
 849 #ifndef PRODUCT
 850     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 851       ttyLocker ttyl;
 852       BytecodeCounter::print();
 853     }
 854 #endif
 855     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 856     // XXX correct this offset for amd64
 857     // This is the value of eip which points to where verify_oop will return.
 858     if (os::message_box(msg, "Execution stopped, print registers?")) {
 859       print_state64(pc, regs);
 860       BREAKPOINT;
 861     }
 862   }
 863   fatal("DEBUG MESSAGE: %s", msg);
 864 }
 865 
 866 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 867   ttyLocker ttyl;
 868   FlagSetting fs(Debugging, true);
 869   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 870 #ifndef PRODUCT
 871   tty->cr();
 872   findpc(pc);
 873   tty->cr();
 874 #endif
 875 #define PRINT_REG(rax, value) \
 876   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 877   PRINT_REG(rax, regs[15]);
 878   PRINT_REG(rbx, regs[12]);
 879   PRINT_REG(rcx, regs[14]);
 880   PRINT_REG(rdx, regs[13]);
 881   PRINT_REG(rdi, regs[8]);
 882   PRINT_REG(rsi, regs[9]);
 883   PRINT_REG(rbp, regs[10]);
 884   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 885   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 886   PRINT_REG(r8 , regs[7]);
 887   PRINT_REG(r9 , regs[6]);
 888   PRINT_REG(r10, regs[5]);
 889   PRINT_REG(r11, regs[4]);
 890   PRINT_REG(r12, regs[3]);
 891   PRINT_REG(r13, regs[2]);
 892   PRINT_REG(r14, regs[1]);
 893   PRINT_REG(r15, regs[0]);
 894 #undef PRINT_REG
 895   // Print some words near the top of the stack.
 896   int64_t* rsp = &regs[16];
 897   int64_t* dump_sp = rsp;
 898   for (int col1 = 0; col1 < 8; col1++) {
 899     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 900     os::print_location(tty, *dump_sp++);
 901   }
 902   for (int row = 0; row < 25; row++) {
 903     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 904     for (int col = 0; col < 4; col++) {
 905       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 906     }
 907     tty->cr();
 908   }
 909   // Print some instructions around pc:
 910   Disassembler::decode((address)pc-64, (address)pc);
 911   tty->print_cr("--------");
 912   Disassembler::decode((address)pc, (address)pc+32);
 913 }
 914 
 915 // The java_calling_convention describes stack locations as ideal slots on
 916 // a frame with no abi restrictions. Since we must observe abi restrictions
 917 // (like the placement of the register window) the slots must be biased by
 918 // the following value.
 919 static int reg2offset_in(VMReg r) {
 920   // Account for saved rbp and return address
 921   // This should really be in_preserve_stack_slots
 922   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 923 }
 924 
 925 static int reg2offset_out(VMReg r) {
 926   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 927 }
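     // Worked example (assuming the usual 4-byte VMReg stack slot): an incoming
     // argument in slot 0 maps to offset (0 + 4) * 4 = 16 bytes off rbp, i.e. just
     // above the saved rbp (8 bytes) and the return address (8 bytes).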
 928 
 929 // A long move
 930 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 931 
 932   // The calling convention assures us that each VMRegPair is either
 933   // entirely one physical register or adjacent stack slots.
 934 
 935   if (src.is_single_phys_reg() ) {
 936     if (dst.is_single_phys_reg()) {
 937       if (dst.first() != src.first()) {
 938         mov(dst.first()->as_Register(), src.first()->as_Register());
 939       }
 940     } else {
 941       assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
 942        src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
 943       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 944     }
 945   } else if (dst.is_single_phys_reg()) {
 946     assert(src.is_single_reg(),  "not a stack pair");
 947     movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 948   } else {
 949     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 950     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 951     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 952   }
 953 }
 954 
 955 // A double move
 956 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 957 
 958   // The calling convention assures us that each VMRegPair is either
 959   // entirely one physical register or adjacent stack slots.
 960 
 961   if (src.is_single_phys_reg() ) {
 962     if (dst.is_single_phys_reg()) {
 963       // In theory these overlap but the ordering is such that this is likely a nop
 964       if ( src.first() != dst.first()) {
 965         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 966       }
 967     } else {
 968       assert(dst.is_single_reg(), "not a stack pair");
 969       movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 970     }
 971   } else if (dst.is_single_phys_reg()) {
 972     assert(src.is_single_reg(),  "not a stack pair");
 973     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 974   } else {
 975     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 976     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 977     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 978   }
 979 }
 980 
 981 
 982 // A float arg may have to do a float reg to int reg conversion
 983 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 984   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 985 
 986   // The calling convention assures us that each VMRegPair is either
 987   // entirely one physical register or adjacent stack slots.
 988 
 989   if (src.first()->is_stack()) {
 990     if (dst.first()->is_stack()) {
 991       movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 992       movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 993     } else {
 994       // stack to reg
 995       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 996       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 997     }
 998   } else if (dst.first()->is_stack()) {
 999     // reg to stack
1000     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1001     movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
1002   } else {
1003     // reg to reg
1004     // In theory these overlap but the ordering is such that this is likely a nop
1005     if ( src.first() != dst.first()) {
1006       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
1007     }
1008   }
1009 }
1010 
1011 // On 64 bit we will store integer-like items to the stack as
1012 // 64-bit items (x86_32/64 ABI) even though Java would only store
1013 // 32 bits for a parameter. On 32 bit it will simply be 32 bits.
1014 // So this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
1015 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
1016   if (src.first()->is_stack()) {
1017     if (dst.first()->is_stack()) {
1018       // stack to stack
1019       movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
1020       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
1021     } else {
1022       // stack to reg
1023       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
1024     }
1025   } else if (dst.first()->is_stack()) {
1026     // reg to stack
1027     // Do we really have to sign extend???
1028     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1029     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
1030   } else {
1031     // Do we really have to sign extend???
1032     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1033     if (dst.first() != src.first()) {
1034       movq(dst.first()->as_Register(), src.first()->as_Register());
1035     }
1036   }
1037 }
1038 
1039 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1040   if (src.first()->is_stack()) {
1041     if (dst.first()->is_stack()) {
1042       // stack to stack
1043       movq(rax, Address(rbp, reg2offset_in(src.first())));
1044       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1045     } else {
1046       // stack to reg
1047       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1048     }
1049   } else if (dst.first()->is_stack()) {
1050     // reg to stack
1051     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1052   } else {
1053     if (dst.first() != src.first()) {
1054       movq(dst.first()->as_Register(), src.first()->as_Register());
1055     }
1056   }
1057 }
1058 
1059 // An oop arg. Must pass a handle not the oop itself
1060 void MacroAssembler::object_move(OopMap* map,
1061                         int oop_handle_offset,
1062                         int framesize_in_slots,
1063                         VMRegPair src,
1064                         VMRegPair dst,
1065                         bool is_receiver,
1066                         int* receiver_offset) {
1067 
1068   // must pass a handle. First figure out the location we use as a handle
1069 
1070   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1071 
1072   // See if oop is NULL; if it is we need no handle
1073 
1074   if (src.first()->is_stack()) {
1075 
1076     // Oop is already on the stack as an argument
1077     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1078     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1079     if (is_receiver) {
1080       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1081     }
1082 
1083     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1084     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1085     // conditionally move a NULL
1086     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1087   } else {
1088 
1089     // Oop is in a register; we must store it to the space we reserve
1090     // on the stack for oop_handles and pass a handle if oop is non-NULL
1091 
1092     const Register rOop = src.first()->as_Register();
1093     int oop_slot;
1094     if (rOop == j_rarg0)
1095       oop_slot = 0;
1096     else if (rOop == j_rarg1)
1097       oop_slot = 1;
1098     else if (rOop == j_rarg2)
1099       oop_slot = 2;
1100     else if (rOop == j_rarg3)
1101       oop_slot = 3;
1102     else if (rOop == j_rarg4)
1103       oop_slot = 4;
1104     else {
1105       assert(rOop == j_rarg5, "wrong register");
1106       oop_slot = 5;
1107     }
1108 
1109     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1110     int offset = oop_slot*VMRegImpl::stack_slot_size;
1111 
1112     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1113     // Store oop in handle area, may be NULL
1114     movptr(Address(rsp, offset), rOop);
1115     if (is_receiver) {
1116       *receiver_offset = offset;
1117     }
1118 
1119     cmpptr(rOop, (int32_t)NULL_WORD);
1120     lea(rHandle, Address(rsp, offset));
1121     // conditionally move a NULL from the handle area where it was just stored
1122     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1123   }
1124 
1125   // If arg is on the stack then place it, otherwise it is already in the correct reg.
1126   if (dst.first()->is_stack()) {
1127     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1128   }
1129 }
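     // Explanatory note on the cmpptr/lea/cmovptr pattern above: the handle is the
     // address of the stack slot holding the oop, but a NULL oop must be passed as
     // NULL rather than as a handle to NULL.  The cmpptr sets the flags from the
     // stored oop, lea unconditionally forms the handle, and the conditional move
     // replaces it with the (zero) word read back from the slot when the oop was NULL.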
1130 
1131 #endif // _LP64
1132 
1133 // Now versions that are common to 32/64 bit
1134 
1135 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1136   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1137 }
1138 
1139 void MacroAssembler::addptr(Register dst, Register src) {
1140   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1141 }
1142 
1143 void MacroAssembler::addptr(Address dst, Register src) {
1144   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1145 }
1146 
1147 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1148   if (reachable(src)) {
1149     Assembler::addsd(dst, as_Address(src));
1150   } else {
1151     lea(rscratch1, src);
1152     Assembler::addsd(dst, Address(rscratch1, 0));
1153   }
1154 }
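     // Explanatory note: this reachable()/lea pattern recurs throughout the file.
     // On 64 bit an AddressLiteral can only be encoded directly when it is within
     // rip-relative (+/- 2GB) range of the code being generated; otherwise the
     // address is materialized in a scratch register and an indirect form is used.
     // (On 32 bit every address is reachable, so the first branch is always taken.)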
1155 
1156 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1157   if (reachable(src)) {
1158     addss(dst, as_Address(src));
1159   } else {
1160     lea(rscratch1, src);
1161     addss(dst, Address(rscratch1, 0));
1162   }
1163 }
1164 
1165 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1166   if (reachable(src)) {
1167     Assembler::addpd(dst, as_Address(src));
1168   } else {
1169     lea(rscratch1, src);
1170     Assembler::addpd(dst, Address(rscratch1, 0));
1171   }
1172 }
1173 
1174 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1175 // Stub code is generated once and never copied.
1176 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1177 void MacroAssembler::align64() {
1178   align(64, (unsigned long long) pc());
1179 }
1180 
1181 void MacroAssembler::align32() {
1182   align(32, (unsigned long long) pc());
1183 }
1184 
1185 void MacroAssembler::align(int modulus) {
1186   // 8273459: Ensure alignment is possible with current segment alignment
1187   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1188   align(modulus, offset());
1189 }
1190 
1191 void MacroAssembler::align(int modulus, int target) {
1192   if (target % modulus != 0) {
1193     nop(modulus - (target % modulus));
1194   }
1195 }
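     // Illustrative note: align(modulus, target) pads with nops up to the next
     // multiple of modulus.  For example, with the current offset at 13,
     // align(16) emits 16 - (13 % 16) = 3 bytes of nop so the next instruction
     // starts at offset 16.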
1196 
1197 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1198   // Used in sign-masking with aligned address.
1199   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1200   if (reachable(src)) {
1201     Assembler::andpd(dst, as_Address(src));
1202   } else {
1203     lea(scratch_reg, src);
1204     Assembler::andpd(dst, Address(scratch_reg, 0));
1205   }
1206 }
1207 
1208 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1209   // Used in sign-masking with aligned address.
1210   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1211   if (reachable(src)) {
1212     Assembler::andps(dst, as_Address(src));
1213   } else {
1214     lea(scratch_reg, src);
1215     Assembler::andps(dst, Address(scratch_reg, 0));
1216   }
1217 }
1218 
1219 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1220   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1221 }
1222 
1223 void MacroAssembler::atomic_incl(Address counter_addr) {
1224   lock();
1225   incrementl(counter_addr);
1226 }
1227 
1228 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1229   if (reachable(counter_addr)) {
1230     atomic_incl(as_Address(counter_addr));
1231   } else {
1232     lea(scr, counter_addr);
1233     atomic_incl(Address(scr, 0));
1234   }
1235 }
1236 
1237 #ifdef _LP64
1238 void MacroAssembler::atomic_incq(Address counter_addr) {
1239   lock();
1240   incrementq(counter_addr);
1241 }
1242 
1243 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1244   if (reachable(counter_addr)) {
1245     atomic_incq(as_Address(counter_addr));
1246   } else {
1247     lea(scr, counter_addr);
1248     atomic_incq(Address(scr, 0));
1249   }
1250 }
1251 #endif
1252 
1253 // Writes to stack successive pages until offset reached to check for
1254 // stack overflow + shadow pages.  This clobbers tmp.
1255 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1256   movptr(tmp, rsp);
1257   // Bang stack for total size given plus shadow page size.
1258   // Bang one page at a time because large size can bang beyond yellow and
1259   // red zones.
1260   Label loop;
1261   bind(loop);
1262   movl(Address(tmp, (-os::vm_page_size())), size );
1263   subptr(tmp, os::vm_page_size());
1264   subl(size, os::vm_page_size());
1265   jcc(Assembler::greater, loop);
1266 
1267   // Bang down shadow pages too.
1268   // At this point, (tmp-0) is the last address touched, so don't
1269   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1270   // was post-decremented.)  Skip this address by starting at i=1, and
1271   // touch a few more pages below.  N.B.  It is important to touch all
1272   // the way down including all pages in the shadow zone.
1273   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1274     // this could be any sized move but since it can serve as a debugging crumb,
1275     // the bigger the better.
1276     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1277   }
1278 }
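     // Illustrative note: with a 4K page size and size = 3 pages, the first loop
     // touches rsp-4K, rsp-8K and rsp-12K (one word per page), and the second loop
     // then continues below that through the shadow zone, one page at a time.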
1279 
1280 void MacroAssembler::reserved_stack_check() {
1281     // testing if reserved zone needs to be enabled
1282     Label no_reserved_zone_enabling;
1283     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1284     NOT_LP64(get_thread(rsi);)
1285 
1286     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1287     jcc(Assembler::below, no_reserved_zone_enabling);
1288 
1289     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1290     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1291     should_not_reach_here();
1292 
1293     bind(no_reserved_zone_enabling);
1294 }
1295 
1296 void MacroAssembler::c2bool(Register x) {
1297   // implements x == 0 ? 0 : 1
1298   // note: must only look at least-significant byte of x
1299   //       since C-style booleans are stored in one byte
1300   //       only! (was bug)
1301   andl(x, 0xFF);
1302   setb(Assembler::notZero, x);
1303 }
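     // Illustrative note: only the low byte matters because C-style booleans are
     // one byte; e.g. x = 0x0100 normalizes to 0 (its low byte is zero), while
     // x = 0x0001 normalizes to 1.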
1304 
1305 // Wouldn't need if AddressLiteral version had new name
1306 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1307   Assembler::call(L, rtype);
1308 }
1309 
1310 void MacroAssembler::call(Register entry) {
1311   Assembler::call(entry);
1312 }
1313 
1314 void MacroAssembler::call(AddressLiteral entry) {
1315   if (reachable(entry)) {
1316     Assembler::call_literal(entry.target(), entry.rspec());
1317   } else {
1318     lea(rscratch1, entry);
1319     Assembler::call(rscratch1);
1320   }
1321 }
1322 
1323 void MacroAssembler::ic_call(address entry, jint method_index) {
1324   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1325   movptr(rax, (intptr_t)Universe::non_oop_word());
1326   call(AddressLiteral(entry, rh));
1327 }
1328 
1329 // Implementation of call_VM versions
1330 
1331 void MacroAssembler::call_VM(Register oop_result,
1332                              address entry_point,
1333                              bool check_exceptions) {
1334   Label C, E;
1335   call(C, relocInfo::none);
1336   jmp(E);
1337 
1338   bind(C);
1339   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1340   ret(0);
1341 
1342   bind(E);
1343 }
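     // Explanatory note on the call/jmp trampoline used by these call_VM variants:
     // the near call into the out-of-line block at C pushes a return address (the
     // pc of the jmp), presumably so the runtime sees an ordinary call site with a
     // valid return pc for this frame; after call_VM_helper the ret(0) pops back to
     // that jmp, which simply skips over the out-of-line block.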
1344 
1345 void MacroAssembler::call_VM(Register oop_result,
1346                              address entry_point,
1347                              Register arg_1,
1348                              bool check_exceptions) {
1349   Label C, E;
1350   call(C, relocInfo::none);
1351   jmp(E);
1352 
1353   bind(C);
1354   pass_arg1(this, arg_1);
1355   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1356   ret(0);
1357 
1358   bind(E);
1359 }
1360 
1361 void MacroAssembler::call_VM(Register oop_result,
1362                              address entry_point,
1363                              Register arg_1,
1364                              Register arg_2,
1365                              bool check_exceptions) {
1366   Label C, E;
1367   call(C, relocInfo::none);
1368   jmp(E);
1369 
1370   bind(C);
1371 
1372   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1373 
1374   pass_arg2(this, arg_2);
1375   pass_arg1(this, arg_1);
1376   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1377   ret(0);
1378 
1379   bind(E);
1380 }
1381 
1382 void MacroAssembler::call_VM(Register oop_result,
1383                              address entry_point,
1384                              Register arg_1,
1385                              Register arg_2,
1386                              Register arg_3,
1387                              bool check_exceptions) {
1388   Label C, E;
1389   call(C, relocInfo::none);
1390   jmp(E);
1391 
1392   bind(C);
1393 
1394   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1395   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1396   pass_arg3(this, arg_3);
1397 
1398   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1399   pass_arg2(this, arg_2);
1400 
1401   pass_arg1(this, arg_1);
1402   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1403   ret(0);
1404 
1405   bind(E);
1406 }
1407 
1408 void MacroAssembler::call_VM(Register oop_result,
1409                              Register last_java_sp,
1410                              address entry_point,
1411                              int number_of_arguments,
1412                              bool check_exceptions) {
1413   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1414   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1415 }
1416 
1417 void MacroAssembler::call_VM(Register oop_result,
1418                              Register last_java_sp,
1419                              address entry_point,
1420                              Register arg_1,
1421                              bool check_exceptions) {
1422   pass_arg1(this, arg_1);
1423   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1424 }
1425 
1426 void MacroAssembler::call_VM(Register oop_result,
1427                              Register last_java_sp,
1428                              address entry_point,
1429                              Register arg_1,
1430                              Register arg_2,
1431                              bool check_exceptions) {
1432 
1433   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1434   pass_arg2(this, arg_2);
1435   pass_arg1(this, arg_1);
1436   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1437 }
1438 
1439 void MacroAssembler::call_VM(Register oop_result,
1440                              Register last_java_sp,
1441                              address entry_point,
1442                              Register arg_1,
1443                              Register arg_2,
1444                              Register arg_3,
1445                              bool check_exceptions) {
1446   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1447   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1448   pass_arg3(this, arg_3);
1449   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1450   pass_arg2(this, arg_2);
1451   pass_arg1(this, arg_1);
1452   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1453 }
1454 
1455 void MacroAssembler::super_call_VM(Register oop_result,
1456                                    Register last_java_sp,
1457                                    address entry_point,
1458                                    int number_of_arguments,
1459                                    bool check_exceptions) {
1460   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1461   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1462 }
1463 
1464 void MacroAssembler::super_call_VM(Register oop_result,
1465                                    Register last_java_sp,
1466                                    address entry_point,
1467                                    Register arg_1,
1468                                    bool check_exceptions) {
1469   pass_arg1(this, arg_1);
1470   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1471 }
1472 
1473 void MacroAssembler::super_call_VM(Register oop_result,
1474                                    Register last_java_sp,
1475                                    address entry_point,
1476                                    Register arg_1,
1477                                    Register arg_2,
1478                                    bool check_exceptions) {
1479 
1480   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1481   pass_arg2(this, arg_2);
1482   pass_arg1(this, arg_1);
1483   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1484 }
1485 
1486 void MacroAssembler::super_call_VM(Register oop_result,
1487                                    Register last_java_sp,
1488                                    address entry_point,
1489                                    Register arg_1,
1490                                    Register arg_2,
1491                                    Register arg_3,
1492                                    bool check_exceptions) {
1493   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1494   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1495   pass_arg3(this, arg_3);
1496   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1497   pass_arg2(this, arg_2);
1498   pass_arg1(this, arg_1);
1499   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1500 }
1501 
1502 void MacroAssembler::call_VM_base(Register oop_result,
1503                                   Register java_thread,
1504                                   Register last_java_sp,
1505                                   address  entry_point,
1506                                   int      number_of_arguments,
1507                                   bool     check_exceptions) {
1508   // determine java_thread register
1509   if (!java_thread->is_valid()) {
1510 #ifdef _LP64
1511     java_thread = r15_thread;
1512 #else
1513     java_thread = rdi;
1514     get_thread(java_thread);
1515 #endif // LP64
1516   }
1517   // determine last_java_sp register
1518   if (!last_java_sp->is_valid()) {
1519     last_java_sp = rsp;
1520   }
1521   // debugging support
1522   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1523   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1524 #ifdef ASSERT
1525   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1526   // r12 is the heapbase.
1527   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1528 #endif // ASSERT
1529 
1530   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1531   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1532 
1533   // push java thread (becomes first argument of C function)
1534 
1535   NOT_LP64(push(java_thread); number_of_arguments++);
1536   LP64_ONLY(mov(c_rarg0, r15_thread));
1537 
1538   // set last Java frame before call
1539   assert(last_java_sp != rbp, "can't use ebp/rbp");
1540 
1541   // Only interpreter should have to set fp
1542   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1543 
1544   // do the call, remove parameters
1545   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1546 
1547   // restore the thread (cannot use the pushed argument since arguments
1548   // may be overwritten by C code generated by an optimizing compiler);
1549   // however, we can use the register value directly if it is callee saved.
1550   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1551     // rdi & rsi (also r15) are callee saved -> nothing to do
1552 #ifdef ASSERT
1553     guarantee(java_thread != rax, "change this code");
1554     push(rax);
1555     { Label L;
1556       get_thread(rax);
1557       cmpptr(java_thread, rax);
1558       jcc(Assembler::equal, L);
1559       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1560       bind(L);
1561     }
1562     pop(rax);
1563 #endif
1564   } else {
1565     get_thread(java_thread);
1566   }
1567   // reset last Java frame
1568   // Only interpreter should have to clear fp
1569   reset_last_Java_frame(java_thread, true);
1570 
1571   // C++ interp handles this in the interpreter
1572   check_and_handle_popframe(java_thread);
1573   check_and_handle_earlyret(java_thread);
1574 
1575   if (check_exceptions) {
1576     // check for pending exceptions (java_thread is set upon return)
1577     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1578 #ifndef _LP64
1579     jump_cc(Assembler::notEqual,
1580             RuntimeAddress(StubRoutines::forward_exception_entry()));
1581 #else
1582     // This used to conditionally jump to forward_exception, but if the code
1583     // is relocated that branch might not reach. So instead we branch around
1584     // an unconditional jump which can always reach the target.
1585 
1586     Label ok;
1587     jcc(Assembler::equal, ok);
1588     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1589     bind(ok);
1590 #endif // LP64
1591   }
1592 
1593   // get oop result if there is one and reset the value in the thread
1594   if (oop_result->is_valid()) {
1595     get_vm_result(oop_result, java_thread);
1596   }
1597 }
1598 
1599 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1600 
1601   // Calculating the value for last_Java_sp is somewhat subtle.
1602   // call_VM does an intermediate call which places a return address on the
1603   // stack just under the stack pointer as the user finished with it. This
1604   // allows us to retrieve last_Java_pc from last_Java_sp[-1].
1605   // On 32-bit we then have to push additional args on the stack to
1606   // accomplish the actual requested call. On 64-bit call_VM can only use
1607   // register args, so the only extra space is the return address that
1608   // call_VM created.
1609   // This hopefully explains the calculations here.
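       //
       // For example (a sketch assuming two VM-call arguments): on 32-bit the
       // stack at this point is [arg][arg][return pc][caller frame], so
       // last_Java_sp = rsp + 3*wordSize and last_Java_sp[-1] is the return pc
       // pushed by the intermediate call; on 64-bit both arguments travel in
       // registers, so last_Java_sp is simply rsp + wordSize.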
1610 
1611 #ifdef _LP64
1612   // We've pushed one address, correct last_Java_sp
1613   lea(rax, Address(rsp, wordSize));
1614 #else
1615   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1616 #endif // LP64
1617 
1618   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1619 
1620 }
1621 
1622 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1623 void MacroAssembler::call_VM_leaf0(address entry_point) {
1624   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1625 }
1626 
1627 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1628   call_VM_leaf_base(entry_point, number_of_arguments);
1629 }
1630 
1631 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1632   pass_arg0(this, arg_0);
1633   call_VM_leaf(entry_point, 1);
1634 }
1635 
1636 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1637 
1638   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1639   pass_arg1(this, arg_1);
1640   pass_arg0(this, arg_0);
1641   call_VM_leaf(entry_point, 2);
1642 }
1643 
1644 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1645   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1646   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1647   pass_arg2(this, arg_2);
1648   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1649   pass_arg1(this, arg_1);
1650   pass_arg0(this, arg_0);
1651   call_VM_leaf(entry_point, 3);
1652 }
1653 
1654 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1655   pass_arg0(this, arg_0);
1656   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1657 }
1658 
1659 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1660 
1661   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1662   pass_arg1(this, arg_1);
1663   pass_arg0(this, arg_0);
1664   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1665 }
1666 
1667 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1668   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1669   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1670   pass_arg2(this, arg_2);
1671   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1672   pass_arg1(this, arg_1);
1673   pass_arg0(this, arg_0);
1674   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1675 }
1676 
1677 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1678   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1679   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1680   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1681   pass_arg3(this, arg_3);
1682   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1683   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1684   pass_arg2(this, arg_2);
1685   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1686   pass_arg1(this, arg_1);
1687   pass_arg0(this, arg_0);
1688   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1689 }
1690 
1691 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1692   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1693   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1694   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1695 }
1696 
1697 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1698   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1699   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1700 }
1701 
1702 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1703 }
1704 
1705 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1706 }
1707 
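     // Note on the AddressLiteral overloads below: when the literal is
     // reachable from the current code position (always on 32-bit; on 64-bit,
     // roughly, when it fits in a 32-bit displacement) the operand is used
     // directly; otherwise the address is first materialized in rscratch1 (or
     // a caller-supplied scratch register) and an indirect form is used.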
1708 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1709   if (reachable(src1)) {
1710     cmpl(as_Address(src1), imm);
1711   } else {
1712     lea(rscratch1, src1);
1713     cmpl(Address(rscratch1, 0), imm);
1714   }
1715 }
1716 
1717 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1718   assert(!src2.is_lval(), "use cmpptr");
1719   if (reachable(src2)) {
1720     cmpl(src1, as_Address(src2));
1721   } else {
1722     lea(rscratch1, src2);
1723     cmpl(src1, Address(rscratch1, 0));
1724   }
1725 }
1726 
1727 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1728   Assembler::cmpl(src1, imm);
1729 }
1730 
1731 void MacroAssembler::cmp32(Register src1, Address src2) {
1732   Assembler::cmpl(src1, src2);
1733 }
1734 
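     // Three-way floating-point compare: materializes -1, 0 or +1 in dst,
     // matching the Java fcmp<l|g>/dcmp<l|g> bytecode semantics. An unordered
     // compare (NaN operand, parity flag set) yields -1 when unordered_is_less
     // and +1 otherwise.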
1735 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1736   ucomisd(opr1, opr2);
1737 
1738   Label L;
1739   if (unordered_is_less) {
1740     movl(dst, -1);
1741     jcc(Assembler::parity, L);
1742     jcc(Assembler::below , L);
1743     movl(dst, 0);
1744     jcc(Assembler::equal , L);
1745     increment(dst);
1746   } else { // unordered is greater
1747     movl(dst, 1);
1748     jcc(Assembler::parity, L);
1749     jcc(Assembler::above , L);
1750     movl(dst, 0);
1751     jcc(Assembler::equal , L);
1752     decrementl(dst);
1753   }
1754   bind(L);
1755 }
1756 
1757 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1758   ucomiss(opr1, opr2);
1759 
1760   Label L;
1761   if (unordered_is_less) {
1762     movl(dst, -1);
1763     jcc(Assembler::parity, L);
1764     jcc(Assembler::below , L);
1765     movl(dst, 0);
1766     jcc(Assembler::equal , L);
1767     increment(dst);
1768   } else { // unordered is greater
1769     movl(dst, 1);
1770     jcc(Assembler::parity, L);
1771     jcc(Assembler::above , L);
1772     movl(dst, 0);
1773     jcc(Assembler::equal , L);
1774     decrementl(dst);
1775   }
1776   bind(L);
1777 }
1778 
1779 
1780 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1781   if (reachable(src1)) {
1782     cmpb(as_Address(src1), imm);
1783   } else {
1784     lea(rscratch1, src1);
1785     cmpb(Address(rscratch1, 0), imm);
1786   }
1787 }
1788 
1789 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1790 #ifdef _LP64
1791   if (src2.is_lval()) {
1792     movptr(rscratch1, src2);
1793     Assembler::cmpq(src1, rscratch1);
1794   } else if (reachable(src2)) {
1795     cmpq(src1, as_Address(src2));
1796   } else {
1797     lea(rscratch1, src2);
1798     Assembler::cmpq(src1, Address(rscratch1, 0));
1799   }
1800 #else
1801   if (src2.is_lval()) {
1802     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1803   } else {
1804     cmpl(src1, as_Address(src2));
1805   }
1806 #endif // _LP64
1807 }
1808 
1809 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1810   assert(src2.is_lval(), "not a mem-mem compare");
1811 #ifdef _LP64
1812   // moves src2's literal address
1813   movptr(rscratch1, src2);
1814   Assembler::cmpq(src1, rscratch1);
1815 #else
1816   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1817 #endif // _LP64
1818 }
1819 
1820 void MacroAssembler::cmpoop(Register src1, Register src2) {
1821   cmpptr(src1, src2);
1822 }
1823 
1824 void MacroAssembler::cmpoop(Register src1, Address src2) {
1825   cmpptr(src1, src2);
1826 }
1827 
1828 #ifdef _LP64
1829 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1830   movoop(rscratch1, src2);
1831   cmpptr(src1, rscratch1);
1832 }
1833 #endif
1834 
1835 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1836   if (reachable(adr)) {
1837     lock();
1838     cmpxchgptr(reg, as_Address(adr));
1839   } else {
1840     lea(rscratch1, adr);
1841     lock();
1842     cmpxchgptr(reg, Address(rscratch1, 0));
1843   }
1844 }
1845 
1846 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1847   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1848 }
1849 
1850 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1851   if (reachable(src)) {
1852     Assembler::comisd(dst, as_Address(src));
1853   } else {
1854     lea(rscratch1, src);
1855     Assembler::comisd(dst, Address(rscratch1, 0));
1856   }
1857 }
1858 
1859 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1860   if (reachable(src)) {
1861     Assembler::comiss(dst, as_Address(src));
1862   } else {
1863     lea(rscratch1, src);
1864     Assembler::comiss(dst, Address(rscratch1, 0));
1865   }
1866 }
1867 
1868 
1869 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1870   Condition negated_cond = negate_condition(cond);
1871   Label L;
1872   jcc(negated_cond, L);
1873   pushf(); // Preserve flags
1874   atomic_incl(counter_addr);
1875   popf();
1876   bind(L);
1877 }
1878 
1879 int MacroAssembler::corrected_idivl(Register reg) {
1880   // Full implementation of Java idiv and irem; checks for
1881   // special case as described in JVM spec., p.243 & p.271.
1882   // The function returns the (pc) offset of the idivl
1883   // instruction - may be needed for implicit exceptions.
1884   //
1885   //         normal case                           special case
1886   //
1887   // input : rax,: dividend                         min_int
1888   //         reg: divisor   (may not be rax,/rdx)   -1
1889   //
1890   // output: rax,: quotient  (= rax, idiv reg)       min_int
1891   //         rdx: remainder (= rax, irem reg)       0
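       //
       // The special case exists because the hardware idiv instruction raises
       // #DE when min_int is divided by -1 (the quotient overflows), whereas
       // Java defines the result as min_int with remainder 0.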
1892   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
1893   const int min_int = 0x80000000;
1894   Label normal_case, special_case;
1895 
1896   // check for special case
1897   cmpl(rax, min_int);
1898   jcc(Assembler::notEqual, normal_case);
1899   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1900   cmpl(reg, -1);
1901   jcc(Assembler::equal, special_case);
1902 
1903   // handle normal case
1904   bind(normal_case);
1905   cdql();
1906   int idivl_offset = offset();
1907   idivl(reg);
1908 
1909   // normal and special case exit
1910   bind(special_case);
1911 
1912   return idivl_offset;
1913 }
1914 
1915 
1916 
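     // Note: in incrementl/decrementl below, value == min_jint is checked
     // first because negating it would overflow (-min_jint is not
     // representable), so it is handed to addl/subl directly instead of being
     // forwarded to the complementary routine.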
1917 void MacroAssembler::decrementl(Register reg, int value) {
1918   if (value == min_jint) {subl(reg, value) ; return; }
1919   if (value <  0) { incrementl(reg, -value); return; }
1920   if (value == 0) {                        ; return; }
1921   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1922   /* else */      { subl(reg, value)       ; return; }
1923 }
1924 
1925 void MacroAssembler::decrementl(Address dst, int value) {
1926   if (value == min_jint) {subl(dst, value) ; return; }
1927   if (value <  0) { incrementl(dst, -value); return; }
1928   if (value == 0) {                        ; return; }
1929   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1930   /* else */      { subl(dst, value)       ; return; }
1931 }
1932 
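     // Signed division by 2^shift_value with Java's round-toward-zero
     // semantics: a plain arithmetic shift of a negative dividend would round
     // toward negative infinity, so (2^shift_value - 1) is added first.
     // For example, -7 / 4: -7 + 3 = -4, and -4 >> 2 = -1 (rather than -2).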
1933 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1934   assert (shift_value > 0, "illegal shift value");
1935   Label _is_positive;
1936   testl (reg, reg);
1937   jcc (Assembler::positive, _is_positive);
1938   int offset = (1 << shift_value) - 1 ;
1939 
1940   if (offset == 1) {
1941     incrementl(reg);
1942   } else {
1943     addl(reg, offset);
1944   }
1945 
1946   bind (_is_positive);
1947   sarl(reg, shift_value);
1948 }
1949 
1950 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1951   if (reachable(src)) {
1952     Assembler::divsd(dst, as_Address(src));
1953   } else {
1954     lea(rscratch1, src);
1955     Assembler::divsd(dst, Address(rscratch1, 0));
1956   }
1957 }
1958 
1959 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1960   if (reachable(src)) {
1961     Assembler::divss(dst, as_Address(src));
1962   } else {
1963     lea(rscratch1, src);
1964     Assembler::divss(dst, Address(rscratch1, 0));
1965   }
1966 }
1967 
1968 void MacroAssembler::enter() {
1969   push(rbp);
1970   mov(rbp, rsp);
1971 }
1972 
1973 // A 5 byte nop that is safe for patching (see patch_verified_entry)
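     // Both variants encode as a single 5-byte instruction (addr_nop_5, or
     // four segment-override prefixes followed by a one-byte nop), leaving
     // exactly the room needed for the 5-byte jump that patch_verified_entry
     // may later write over it.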
1974 void MacroAssembler::fat_nop() {
1975   if (UseAddressNop) {
1976     addr_nop_5();
1977   } else {
1978     emit_int8(0x26); // es:
1979     emit_int8(0x2e); // cs:
1980     emit_int8(0x64); // fs:
1981     emit_int8(0x65); // gs:
1982     emit_int8((unsigned char)0x90);
1983   }
1984 }
1985 
1986 #ifndef _LP64
1987 void MacroAssembler::fcmp(Register tmp) {
1988   fcmp(tmp, 1, true, true);
1989 }
1990 
1991 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
1992   assert(!pop_right || pop_left, "usage error");
1993   if (VM_Version::supports_cmov()) {
1994     assert(tmp == noreg, "unneeded temp");
1995     if (pop_left) {
1996       fucomip(index);
1997     } else {
1998       fucomi(index);
1999     }
2000     if (pop_right) {
2001       fpop();
2002     }
2003   } else {
2004     assert(tmp != noreg, "need temp");
2005     if (pop_left) {
2006       if (pop_right) {
2007         fcompp();
2008       } else {
2009         fcomp(index);
2010       }
2011     } else {
2012       fcom(index);
2013     }
2014     // convert FPU condition into eflags condition via rax,
2015     save_rax(tmp);
2016     fwait(); fnstsw_ax();
2017     sahf();
2018     restore_rax(tmp);
2019   }
2020   // condition codes set as follows:
2021   //
2022   // CF (corresponds to C0) if x < y
2023   // PF (corresponds to C2) if unordered
2024   // ZF (corresponds to C3) if x = y
2025 }
2026 
2027 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2028   fcmp2int(dst, unordered_is_less, 1, true, true);
2029 }
2030 
2031 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2032   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2033   Label L;
2034   if (unordered_is_less) {
2035     movl(dst, -1);
2036     jcc(Assembler::parity, L);
2037     jcc(Assembler::below , L);
2038     movl(dst, 0);
2039     jcc(Assembler::equal , L);
2040     increment(dst);
2041   } else { // unordered is greater
2042     movl(dst, 1);
2043     jcc(Assembler::parity, L);
2044     jcc(Assembler::above , L);
2045     movl(dst, 0);
2046     jcc(Assembler::equal , L);
2047     decrementl(dst);
2048   }
2049   bind(L);
2050 }
2051 
2052 void MacroAssembler::fld_d(AddressLiteral src) {
2053   fld_d(as_Address(src));
2054 }
2055 
2056 void MacroAssembler::fld_s(AddressLiteral src) {
2057   fld_s(as_Address(src));
2058 }
2059 
2060 void MacroAssembler::fldcw(AddressLiteral src) {
2061   Assembler::fldcw(as_Address(src));
2062 }
2063 
2064 void MacroAssembler::fpop() {
2065   ffree();
2066   fincstp();
2067 }
2068 
2069 void MacroAssembler::fremr(Register tmp) {
2070   save_rax(tmp);
2071   { Label L;
2072     bind(L);
2073     fprem();
2074     fwait(); fnstsw_ax();
2075     sahf();
2076     jcc(Assembler::parity, L);
2077   }
2078   restore_rax(tmp);
2079   // Result is in ST0.
2080   // Note: fxch & fpop to get rid of ST1
2081   // (otherwise FPU stack could overflow eventually)
2082   fxch(1);
2083   fpop();
2084 }
2085 
2086 void MacroAssembler::empty_FPU_stack() {
2087   if (VM_Version::supports_mmx()) {
2088     emms();
2089   } else {
2090     for (int i = 8; i-- > 0; ) ffree(i);
2091   }
2092 }
2093 #endif // !LP64
2094 
2095 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2096   if (reachable(src)) {
2097     Assembler::mulpd(dst, as_Address(src));
2098   } else {
2099     lea(rscratch1, src);
2100     Assembler::mulpd(dst, Address(rscratch1, 0));
2101   }
2102 }
2103 
2104 void MacroAssembler::load_float(Address src) {
2105 #ifdef _LP64
2106   movflt(xmm0, src);
2107 #else
2108   if (UseSSE >= 1) {
2109     movflt(xmm0, src);
2110   } else {
2111     fld_s(src);
2112   }
2113 #endif // LP64
2114 }
2115 
2116 void MacroAssembler::store_float(Address dst) {
2117 #ifdef _LP64
2118   movflt(dst, xmm0);
2119 #else
2120   if (UseSSE >= 1) {
2121     movflt(dst, xmm0);
2122   } else {
2123     fstp_s(dst);
2124   }
2125 #endif // LP64
2126 }
2127 
2128 void MacroAssembler::load_double(Address src) {
2129 #ifdef _LP64
2130   movdbl(xmm0, src);
2131 #else
2132   if (UseSSE >= 2) {
2133     movdbl(xmm0, src);
2134   } else {
2135     fld_d(src);
2136   }
2137 #endif // LP64
2138 }
2139 
2140 void MacroAssembler::store_double(Address dst) {
2141 #ifdef _LP64
2142   movdbl(dst, xmm0);
2143 #else
2144   if (UseSSE >= 2) {
2145     movdbl(dst, xmm0);
2146   } else {
2147     fstp_d(dst);
2148   }
2149 #endif // LP64
2150 }
2151 
2152 // dst = c = a * b + c
2153 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2154   Assembler::vfmadd231sd(c, a, b);
2155   if (dst != c) {
2156     movdbl(dst, c);
2157   }
2158 }
2159 
2160 // dst = c = a * b + c
2161 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2162   Assembler::vfmadd231ss(c, a, b);
2163   if (dst != c) {
2164     movflt(dst, c);
2165   }
2166 }
2167 
2168 // dst = c = a * b + c
2169 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2170   Assembler::vfmadd231pd(c, a, b, vector_len);
2171   if (dst != c) {
2172     vmovdqu(dst, c);
2173   }
2174 }
2175 
2176 // dst = c = a * b + c
2177 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2178   Assembler::vfmadd231ps(c, a, b, vector_len);
2179   if (dst != c) {
2180     vmovdqu(dst, c);
2181   }
2182 }
2183 
2184 // dst = c = a * b + c
2185 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2186   Assembler::vfmadd231pd(c, a, b, vector_len);
2187   if (dst != c) {
2188     vmovdqu(dst, c);
2189   }
2190 }
2191 
2192 // dst = c = a * b + c
2193 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2194   Assembler::vfmadd231ps(c, a, b, vector_len);
2195   if (dst != c) {
2196     vmovdqu(dst, c);
2197   }
2198 }
2199 
2200 void MacroAssembler::incrementl(AddressLiteral dst) {
2201   if (reachable(dst)) {
2202     incrementl(as_Address(dst));
2203   } else {
2204     lea(rscratch1, dst);
2205     incrementl(Address(rscratch1, 0));
2206   }
2207 }
2208 
2209 void MacroAssembler::incrementl(ArrayAddress dst) {
2210   incrementl(as_Address(dst));
2211 }
2212 
2213 void MacroAssembler::incrementl(Register reg, int value) {
2214   if (value == min_jint) {addl(reg, value) ; return; }
2215   if (value <  0) { decrementl(reg, -value); return; }
2216   if (value == 0) {                        ; return; }
2217   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2218   /* else */      { addl(reg, value)       ; return; }
2219 }
2220 
2221 void MacroAssembler::incrementl(Address dst, int value) {
2222   if (value == min_jint) {addl(dst, value) ; return; }
2223   if (value <  0) { decrementl(dst, -value); return; }
2224   if (value == 0) {                        ; return; }
2225   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2226   /* else */      { addl(dst, value)       ; return; }
2227 }
2228 
2229 void MacroAssembler::jump(AddressLiteral dst) {
2230   if (reachable(dst)) {
2231     jmp_literal(dst.target(), dst.rspec());
2232   } else {
2233     lea(rscratch1, dst);
2234     jmp(rscratch1);
2235   }
2236 }
2237 
2238 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2239   if (reachable(dst)) {
2240     InstructionMark im(this);
2241     relocate(dst.reloc());
2242     const int short_size = 2;
2243     const int long_size = 6;
2244     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
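         // offs is measured from the start of the branch; the encoded jcc
         // displacement is relative to the end of the instruction, hence the
         // short_size/long_size adjustments below.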
2245     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2246       // 0111 tttn #8-bit disp
2247       emit_int8(0x70 | cc);
2248       emit_int8((offs - short_size) & 0xFF);
2249     } else {
2250       // 0000 1111 1000 tttn #32-bit disp
2251       emit_int8(0x0F);
2252       emit_int8((unsigned char)(0x80 | cc));
2253       emit_int32(offs - long_size);
2254     }
2255   } else {
2256 #ifdef ASSERT
2257     warning("reversing conditional branch");
2258 #endif /* ASSERT */
2259     Label skip;
2260     jccb(reverse[cc], skip);
2261     lea(rscratch1, dst);
2262     Assembler::jmp(rscratch1);
2263     bind(skip);
2264   }
2265 }
2266 
2267 void MacroAssembler::fld_x(AddressLiteral src) {
2268   Assembler::fld_x(as_Address(src));
2269 }
2270 
2271 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2272   if (reachable(src)) {
2273     Assembler::ldmxcsr(as_Address(src));
2274   } else {
2275     lea(rscratch1, src);
2276     Assembler::ldmxcsr(Address(rscratch1, 0));
2277   }
2278 }
2279 
2280 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2281   int off;
2282   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2283     off = offset();
2284     movsbl(dst, src); // movsxb
2285   } else {
2286     off = load_unsigned_byte(dst, src);
2287     shll(dst, 24);
2288     sarl(dst, 24);
2289   }
2290   return off;
2291 }
2292 
2293 // Note: load_signed_short used to be called load_signed_word.
2294 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2295 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2296 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2297 int MacroAssembler::load_signed_short(Register dst, Address src) {
2298   int off;
2299   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2300     // This is dubious since it seems safe to do a signed 16 => 64 bit
2301     // version, but this is what 64-bit has always done. It seems to imply
2302     // that callers only use the low 32 bits of the result.
2303     off = offset();
2304     movswl(dst, src); // movsxw
2305   } else {
2306     off = load_unsigned_short(dst, src);
2307     shll(dst, 16);
2308     sarl(dst, 16);
2309   }
2310   return off;
2311 }
2312 
2313 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2314   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
2315   // and "3.9 Partial Register Penalties", p. 22.
2316   int off;
2317   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2318     off = offset();
2319     movzbl(dst, src); // movzxb
2320   } else {
2321     xorl(dst, dst);
2322     off = offset();
2323     movb(dst, src);
2324   }
2325   return off;
2326 }
2327 
2328 // Note: load_unsigned_short used to be called load_unsigned_word.
2329 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2330   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
2331   // and "3.9 Partial Register Penalties", p. 22.
2332   int off;
2333   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2334     off = offset();
2335     movzwl(dst, src); // movzxw
2336   } else {
2337     xorl(dst, dst);
2338     off = offset();
2339     movw(dst, src);
2340   }
2341   return off;
2342 }
2343 
2344 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2345   switch (size_in_bytes) {
2346 #ifndef _LP64
2347   case  8:
2348     assert(dst2 != noreg, "second dest register required");
2349     movl(dst,  src);
2350     movl(dst2, src.plus_disp(BytesPerInt));
2351     break;
2352 #else
2353   case  8:  movq(dst, src); break;
2354 #endif
2355   case  4:  movl(dst, src); break;
2356   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2357   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2358   default:  ShouldNotReachHere();
2359   }
2360 }
2361 
2362 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2363   switch (size_in_bytes) {
2364 #ifndef _LP64
2365   case  8:
2366     assert(src2 != noreg, "second source register required");
2367     movl(dst,                        src);
2368     movl(dst.plus_disp(BytesPerInt), src2);
2369     break;
2370 #else
2371   case  8:  movq(dst, src); break;
2372 #endif
2373   case  4:  movl(dst, src); break;
2374   case  2:  movw(dst, src); break;
2375   case  1:  movb(dst, src); break;
2376   default:  ShouldNotReachHere();
2377   }
2378 }
2379 
2380 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2381   if (reachable(dst)) {
2382     movl(as_Address(dst), src);
2383   } else {
2384     lea(rscratch1, dst);
2385     movl(Address(rscratch1, 0), src);
2386   }
2387 }
2388 
2389 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2390   if (reachable(src)) {
2391     movl(dst, as_Address(src));
2392   } else {
2393     lea(rscratch1, src);
2394     movl(dst, Address(rscratch1, 0));
2395   }
2396 }
2397 
2398 // C++ bool manipulation
2399 
2400 void MacroAssembler::movbool(Register dst, Address src) {
2401   if(sizeof(bool) == 1)
2402     movb(dst, src);
2403   else if(sizeof(bool) == 2)
2404     movw(dst, src);
2405   else if(sizeof(bool) == 4)
2406     movl(dst, src);
2407   else
2408     // unsupported
2409     ShouldNotReachHere();
2410 }
2411 
2412 void MacroAssembler::movbool(Address dst, bool boolconst) {
2413   if(sizeof(bool) == 1)
2414     movb(dst, (int) boolconst);
2415   else if(sizeof(bool) == 2)
2416     movw(dst, (int) boolconst);
2417   else if(sizeof(bool) == 4)
2418     movl(dst, (int) boolconst);
2419   else
2420     // unsupported
2421     ShouldNotReachHere();
2422 }
2423 
2424 void MacroAssembler::movbool(Address dst, Register src) {
2425   if(sizeof(bool) == 1)
2426     movb(dst, src);
2427   else if(sizeof(bool) == 2)
2428     movw(dst, src);
2429   else if(sizeof(bool) == 4)
2430     movl(dst, src);
2431   else
2432     // unsupported
2433     ShouldNotReachHere();
2434 }
2435 
2436 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2437   movb(as_Address(dst), src);
2438 }
2439 
2440 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2441   if (reachable(src)) {
2442     movdl(dst, as_Address(src));
2443   } else {
2444     lea(rscratch1, src);
2445     movdl(dst, Address(rscratch1, 0));
2446   }
2447 }
2448 
2449 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2450   if (reachable(src)) {
2451     movq(dst, as_Address(src));
2452   } else {
2453     lea(rscratch1, src);
2454     movq(dst, Address(rscratch1, 0));
2455   }
2456 }
2457 
2458 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2459   if (reachable(src)) {
2460     if (UseXmmLoadAndClearUpper) {
2461       movsd (dst, as_Address(src));
2462     } else {
2463       movlpd(dst, as_Address(src));
2464     }
2465   } else {
2466     lea(rscratch1, src);
2467     if (UseXmmLoadAndClearUpper) {
2468       movsd (dst, Address(rscratch1, 0));
2469     } else {
2470       movlpd(dst, Address(rscratch1, 0));
2471     }
2472   }
2473 }
2474 
2475 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2476   if (reachable(src)) {
2477     movss(dst, as_Address(src));
2478   } else {
2479     lea(rscratch1, src);
2480     movss(dst, Address(rscratch1, 0));
2481   }
2482 }
2483 
2484 void MacroAssembler::movptr(Register dst, Register src) {
2485   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2486 }
2487 
2488 void MacroAssembler::movptr(Register dst, Address src) {
2489   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2490 }
2491 
2492 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2493 void MacroAssembler::movptr(Register dst, intptr_t src) {
2494   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2495 }
2496 
2497 void MacroAssembler::movptr(Address dst, Register src) {
2498   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2499 }
2500 
2501 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2502   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2503   Assembler::movdqu(dst, src);
2504 }
2505 
2506 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2507   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2508   Assembler::movdqu(dst, src);
2509 }
2510 
2511 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2512   assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2513   Assembler::movdqu(dst, src);
2514 }
2515 
2516 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2517   if (reachable(src)) {
2518     movdqu(dst, as_Address(src));
2519   } else {
2520     lea(scratchReg, src);
2521     movdqu(dst, Address(scratchReg, 0));
2522   }
2523 }
2524 
2525 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2526   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2527   Assembler::vmovdqu(dst, src);
2528 }
2529 
2530 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2531   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2532   Assembler::vmovdqu(dst, src);
2533 }
2534 
2535 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2536   assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2537   Assembler::vmovdqu(dst, src);
2538 }
2539 
2540 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2541   if (reachable(src)) {
2542     vmovdqu(dst, as_Address(src));
2543   }
2544   else {
2545     lea(scratch_reg, src);
2546     vmovdqu(dst, Address(scratch_reg, 0));
2547   }
2548 }
2549 
2550 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) {
2551   assert(vector_len <= AVX_256bit, "AVX2 vector length");
2552   if (vector_len == AVX_256bit) {
2553     vmovdqu(dst, src, scratch_reg);
2554   } else {
2555     movdqu(dst, src, scratch_reg);
2556   }
2557 }
2558 
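     // kmov: copy an opmask value using the widest form the hardware
     // guarantees. With AVX512BW, byte/word masks on 512-bit vectors need up
     // to 64 mask bits and kmovq is available, so kmovql is used; otherwise
     // only 16 mask bits are required and kmovwl suffices.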
2559 void MacroAssembler::kmov(KRegister dst, Address src) {
2560   if (VM_Version::supports_avx512bw()) {
2561     kmovql(dst, src);
2562   } else {
2563     assert(VM_Version::supports_evex(), "");
2564     kmovwl(dst, src);
2565   }
2566 }
2567 
2568 void MacroAssembler::kmov(Address dst, KRegister src) {
2569   if (VM_Version::supports_avx512bw()) {
2570     kmovql(dst, src);
2571   } else {
2572     assert(VM_Version::supports_evex(), "");
2573     kmovwl(dst, src);
2574   }
2575 }
2576 
2577 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2578   if (VM_Version::supports_avx512bw()) {
2579     kmovql(dst, src);
2580   } else {
2581     assert(VM_Version::supports_evex(), "");
2582     kmovwl(dst, src);
2583   }
2584 }
2585 
2586 void MacroAssembler::kmov(Register dst, KRegister src) {
2587   if (VM_Version::supports_avx512bw()) {
2588     kmovql(dst, src);
2589   } else {
2590     assert(VM_Version::supports_evex(), "");
2591     kmovwl(dst, src);
2592   }
2593 }
2594 
2595 void MacroAssembler::kmov(KRegister dst, Register src) {
2596   if (VM_Version::supports_avx512bw()) {
2597     kmovql(dst, src);
2598   } else {
2599     assert(VM_Version::supports_evex(), "");
2600     kmovwl(dst, src);
2601   }
2602 }
2603 
2604 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2605   if (reachable(src)) {
2606     kmovql(dst, as_Address(src));
2607   } else {
2608     lea(scratch_reg, src);
2609     kmovql(dst, Address(scratch_reg, 0));
2610   }
2611 }
2612 
2613 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2614   if (reachable(src)) {
2615     kmovwl(dst, as_Address(src));
2616   } else {
2617     lea(scratch_reg, src);
2618     kmovwl(dst, Address(scratch_reg, 0));
2619   }
2620 }
2621 
2622 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2623                                int vector_len, Register scratch_reg) {
2624   if (reachable(src)) {
2625     if (mask == k0) {
2626       Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2627     } else {
2628       Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2629     }
2630   } else {
2631     lea(scratch_reg, src);
2632     if (mask == k0) {
2633       Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2634     } else {
2635       Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2636     }
2637   }
2638 }
2639 
2640 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2641                                int vector_len, Register scratch_reg) {
2642   if (reachable(src)) {
2643     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2644   } else {
2645     lea(scratch_reg, src);
2646     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2647   }
2648 }
2649 
2650 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2651                                int vector_len, Register scratch_reg) {
2652   if (reachable(src)) {
2653     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2654   } else {
2655     lea(scratch_reg, src);
2656     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2657   }
2658 }
2659 
2660 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2661                                int vector_len, Register scratch_reg) {
2662   if (reachable(src)) {
2663     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2664   } else {
2665     lea(scratch_reg, src);
2666     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2667   }
2668 }
2669 
2670 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2671   if (reachable(src)) {
2672     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2673   } else {
2674     lea(rscratch, src);
2675     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2676   }
2677 }
2678 
2679 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2680   if (reachable(src)) {
2681     Assembler::movdqa(dst, as_Address(src));
2682   } else {
2683     lea(rscratch1, src);
2684     Assembler::movdqa(dst, Address(rscratch1, 0));
2685   }
2686 }
2687 
2688 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2689   if (reachable(src)) {
2690     Assembler::movsd(dst, as_Address(src));
2691   } else {
2692     lea(rscratch1, src);
2693     Assembler::movsd(dst, Address(rscratch1, 0));
2694   }
2695 }
2696 
2697 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2698   if (reachable(src)) {
2699     Assembler::movss(dst, as_Address(src));
2700   } else {
2701     lea(rscratch1, src);
2702     Assembler::movss(dst, Address(rscratch1, 0));
2703   }
2704 }
2705 
2706 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2707   if (reachable(src)) {
2708     Assembler::vmovddup(dst, as_Address(src), vector_len);
2709   } else {
2710     lea(rscratch, src);
2711     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2712   }
2713 }
2714 
2715 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2716   if (reachable(src)) {
2717     Assembler::mulsd(dst, as_Address(src));
2718   } else {
2719     lea(rscratch1, src);
2720     Assembler::mulsd(dst, Address(rscratch1, 0));
2721   }
2722 }
2723 
2724 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2725   if (reachable(src)) {
2726     Assembler::mulss(dst, as_Address(src));
2727   } else {
2728     lea(rscratch1, src);
2729     Assembler::mulss(dst, Address(rscratch1, 0));
2730   }
2731 }
2732 
2733 void MacroAssembler::null_check(Register reg, int offset) {
2734   if (needs_explicit_null_check(offset)) {
2735     // provoke OS NULL exception if reg = NULL by
2736     // accessing M[reg] w/o changing any (non-CC) registers
2737     // NOTE: cmpl is plenty here to provoke a segv
2738     cmpptr(rax, Address(reg, 0));
2739     // Note: should probably use testl(rax, Address(reg, 0));
2740     //       may be shorter code (however, this version of
2741     //       testl needs to be implemented first)
2742   } else {
2743     // nothing to do, (later) access of M[reg + offset]
2744     // will provoke OS NULL exception if reg = NULL
2745   }
2746 }
2747 
2748 void MacroAssembler::os_breakpoint() {
2749   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2750   // (e.g., MSVC can't call ps() otherwise)
2751   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2752 }
2753 
2754 void MacroAssembler::unimplemented(const char* what) {
2755   const char* buf = NULL;
2756   {
2757     ResourceMark rm;
2758     stringStream ss;
2759     ss.print("unimplemented: %s", what);
2760     buf = code_string(ss.as_string());
2761   }
2762   stop(buf);
2763 }
2764 
2765 #ifdef _LP64
2766 #define XSTATE_BV 0x200
2767 #endif
2768 
2769 void MacroAssembler::pop_CPU_state() {
2770   pop_FPU_state();
2771   pop_IU_state();
2772 }
2773 
2774 void MacroAssembler::pop_FPU_state() {
2775 #ifndef _LP64
2776   frstor(Address(rsp, 0));
2777 #else
2778   fxrstor(Address(rsp, 0));
2779 #endif
2780   addptr(rsp, FPUStateSizeInWords * wordSize);
2781 }
2782 
2783 void MacroAssembler::pop_IU_state() {
2784   popa();
2785   LP64_ONLY(addq(rsp, 8));
2786   popf();
2787 }
2788 
2789 // Save Integer and Float state
2790 // Warning: Stack must be 16 byte aligned (64bit)
2791 void MacroAssembler::push_CPU_state() {
2792   push_IU_state();
2793   push_FPU_state();
2794 }
2795 
2796 void MacroAssembler::push_FPU_state() {
2797   subptr(rsp, FPUStateSizeInWords * wordSize);
2798 #ifndef _LP64
2799   fnsave(Address(rsp, 0));
2800   fwait();
2801 #else
2802   fxsave(Address(rsp, 0));
2803 #endif // LP64
2804 }
2805 
2806 void MacroAssembler::push_IU_state() {
2807   // Push flags first because pusha kills them
2808   pushf();
2809   // Make sure rsp stays 16-byte aligned
2810   LP64_ONLY(subq(rsp, 8));
2811   pusha();
2812 }
2813 
2814 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
2815   if (!java_thread->is_valid()) {
2816     java_thread = rdi;
2817     get_thread(java_thread);
2818   }
2819   // we must set sp to zero to clear frame
2820   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2821   // must clear fp, so that compiled frames are not confused; it is
2822   // possible that we need it only for debugging
2823   if (clear_fp) {
2824     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2825   }
2826   // Always clear the pc because it could have been set by make_walkable()
2827   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2828   vzeroupper();
2829 }
2830 
2831 void MacroAssembler::restore_rax(Register tmp) {
2832   if (tmp == noreg) pop(rax);
2833   else if (tmp != rax) mov(rax, tmp);
2834 }
2835 
2836 void MacroAssembler::round_to(Register reg, int modulus) {
2837   addptr(reg, modulus - 1);
2838   andptr(reg, -modulus);
2839 }
2840 
2841 void MacroAssembler::save_rax(Register tmp) {
2842   if (tmp == noreg) push(rax);
2843   else if (tmp != rax) mov(tmp, rax);
2844 }
2845 
2846 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
2847   if (at_return) {
2848     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2849     // we may safely use rsp instead to perform the stack watermark check.
2850     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
2851     jcc(Assembler::above, slow_path);
2852     return;
2853   }
2854   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2855   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2856 }
2857 
2858 // Calls to C land
2859 //
2860 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
2861 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2862 // has to be reset to 0. This is required to allow proper stack traversal.
2863 void MacroAssembler::set_last_Java_frame(Register java_thread,
2864                                          Register last_java_sp,
2865                                          Register last_java_fp,
2866                                          address  last_java_pc) {
2867   vzeroupper();
2868   // determine java_thread register
2869   if (!java_thread->is_valid()) {
2870     java_thread = rdi;
2871     get_thread(java_thread);
2872   }
2873   // determine last_java_sp register
2874   if (!last_java_sp->is_valid()) {
2875     last_java_sp = rsp;
2876   }
2877 
2878   // last_java_fp is optional
2879 
2880   if (last_java_fp->is_valid()) {
2881     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2882   }
2883 
2884   // last_java_pc is optional
2885 
2886   if (last_java_pc != NULL) {
2887     lea(Address(java_thread,
2888                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
2889         InternalAddress(last_java_pc));
2890 
2891   }
2892   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2893 }
2894 
2895 void MacroAssembler::shlptr(Register dst, int imm8) {
2896   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
2897 }
2898 
2899 void MacroAssembler::shrptr(Register dst, int imm8) {
2900   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
2901 }
2902 
2903 void MacroAssembler::sign_extend_byte(Register reg) {
2904   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
2905     movsbl(reg, reg); // movsxb
2906   } else {
2907     shll(reg, 24);
2908     sarl(reg, 24);
2909   }
2910 }
2911 
2912 void MacroAssembler::sign_extend_short(Register reg) {
2913   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2914     movswl(reg, reg); // movsxw
2915   } else {
2916     shll(reg, 16);
2917     sarl(reg, 16);
2918   }
2919 }
2920 
2921 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2922   assert(reachable(src), "Address should be reachable");
2923   testl(dst, as_Address(src));
2924 }
2925 
2926 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2927   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2928   Assembler::pcmpeqb(dst, src);
2929 }
2930 
2931 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2932   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2933   Assembler::pcmpeqw(dst, src);
2934 }
2935 
2936 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2937   assert((dst->encoding() < 16),"XMM register should be 0-15");
2938   Assembler::pcmpestri(dst, src, imm8);
2939 }
2940 
2941 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2942   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2943   Assembler::pcmpestri(dst, src, imm8);
2944 }
2945 
2946 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2947   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2948   Assembler::pmovzxbw(dst, src);
2949 }
2950 
2951 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2952   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2953   Assembler::pmovzxbw(dst, src);
2954 }
2955 
2956 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2957   assert((src->encoding() < 16),"XMM register should be 0-15");
2958   Assembler::pmovmskb(dst, src);
2959 }
2960 
2961 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2962   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2963   Assembler::ptest(dst, src);
2964 }
2965 
2966 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
2967   if (reachable(src)) {
2968     Assembler::sqrtsd(dst, as_Address(src));
2969   } else {
2970     lea(rscratch1, src);
2971     Assembler::sqrtsd(dst, Address(rscratch1, 0));
2972   }
2973 }
2974 
2975 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
2976   if (reachable(src)) {
2977     Assembler::sqrtss(dst, as_Address(src));
2978   } else {
2979     lea(rscratch1, src);
2980     Assembler::sqrtss(dst, Address(rscratch1, 0));
2981   }
2982 }
2983 
2984 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
2985   if (reachable(src)) {
2986     Assembler::subsd(dst, as_Address(src));
2987   } else {
2988     lea(rscratch1, src);
2989     Assembler::subsd(dst, Address(rscratch1, 0));
2990   }
2991 }
2992 
2993 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
2994   if (reachable(src)) {
2995     Assembler::roundsd(dst, as_Address(src), rmode);
2996   } else {
2997     lea(scratch_reg, src);
2998     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
2999   }
3000 }
3001 
3002 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3003   if (reachable(src)) {
3004     Assembler::subss(dst, as_Address(src));
3005   } else {
3006     lea(rscratch1, src);
3007     Assembler::subss(dst, Address(rscratch1, 0));
3008   }
3009 }
3010 
3011 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3012   if (reachable(src)) {
3013     Assembler::ucomisd(dst, as_Address(src));
3014   } else {
3015     lea(rscratch1, src);
3016     Assembler::ucomisd(dst, Address(rscratch1, 0));
3017   }
3018 }
3019 
3020 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3021   if (reachable(src)) {
3022     Assembler::ucomiss(dst, as_Address(src));
3023   } else {
3024     lea(rscratch1, src);
3025     Assembler::ucomiss(dst, Address(rscratch1, 0));
3026   }
3027 }
3028 
3029 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3030   // Used in sign-bit flipping with aligned address.
3031   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3032   if (reachable(src)) {
3033     Assembler::xorpd(dst, as_Address(src));
3034   } else {
3035     lea(scratch_reg, src);
3036     Assembler::xorpd(dst, Address(scratch_reg, 0));
3037   }
3038 }
3039 
3040 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3041   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3042     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3043   }
3044   else {
3045     Assembler::xorpd(dst, src);
3046   }
3047 }
3048 
3049 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3050   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3051     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3052   } else {
3053     Assembler::xorps(dst, src);
3054   }
3055 }
3056 
3057 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3058   // Used in sign-bit flipping with aligned address.
3059   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3060   if (reachable(src)) {
3061     Assembler::xorps(dst, as_Address(src));
3062   } else {
3063     lea(scratch_reg, src);
3064     Assembler::xorps(dst, Address(scratch_reg, 0));
3065   }
3066 }
3067 
3068 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3069   // Used in sign-bit flipping with aligned address.
3070   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3071   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3072   if (reachable(src)) {
3073     Assembler::pshufb(dst, as_Address(src));
3074   } else {
3075     lea(rscratch1, src);
3076     Assembler::pshufb(dst, Address(rscratch1, 0));
3077   }
3078 }
3079 
3080 // AVX 3-operands instructions
3081 
3082 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3083   if (reachable(src)) {
3084     vaddsd(dst, nds, as_Address(src));
3085   } else {
3086     lea(rscratch1, src);
3087     vaddsd(dst, nds, Address(rscratch1, 0));
3088   }
3089 }
3090 
3091 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3092   if (reachable(src)) {
3093     vaddss(dst, nds, as_Address(src));
3094   } else {
3095     lea(rscratch1, src);
3096     vaddss(dst, nds, Address(rscratch1, 0));
3097   }
3098 }
3099 
3100 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3101   assert(UseAVX > 0, "requires some form of AVX");
3102   if (reachable(src)) {
3103     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3104   } else {
3105     lea(rscratch, src);
3106     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3107   }
3108 }
3109 
3110 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3111   assert(UseAVX > 0, "requires some form of AVX");
3112   if (reachable(src)) {
3113     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3114   } else {
3115     lea(rscratch, src);
3116     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3117   }
3118 }
3119 
3120 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3121   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3122   vandps(dst, nds, negate_field, vector_len);
3123 }
3124 
3125 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3126   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3127   vandpd(dst, nds, negate_field, vector_len);
3128 }
3129 
3130 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3131   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3132   Assembler::vpaddb(dst, nds, src, vector_len);
3133 }
3134 
3135 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3136   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3137   Assembler::vpaddb(dst, nds, src, vector_len);
3138 }
3139 
3140 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3141   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3142   Assembler::vpaddw(dst, nds, src, vector_len);
3143 }
3144 
3145 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3146   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3147   Assembler::vpaddw(dst, nds, src, vector_len);
3148 }
3149 
3150 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3151   if (reachable(src)) {
3152     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3153   } else {
3154     lea(scratch_reg, src);
3155     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3156   }
3157 }
3158 
3159 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3160   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3161   Assembler::vpbroadcastw(dst, src, vector_len);
3162 }
3163 
3164 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3165   if (reachable(src)) {
3166     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3167   } else {
3168     lea(rscratch, src);
3169     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3170   }
3171 }
3172 
3173 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3174   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3175   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3176 }
3177 
3178 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3179   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3180   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3181 }
3182 
3183 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3184                                AddressLiteral src, int vector_len, Register scratch_reg) {
3185   if (reachable(src)) {
3186     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3187   } else {
3188     lea(scratch_reg, src);
3189     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3190   }
3191 }
3192 
3193 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3194                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3195   if (reachable(src)) {
3196     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3197   } else {
3198     lea(scratch_reg, src);
3199     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3200   }
3201 }
3202 
3203 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3204                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3205   if (reachable(src)) {
3206     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3207   } else {
3208     lea(scratch_reg, src);
3209     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3210   }
3211 }
3212 
3213 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3214                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3215   if (reachable(src)) {
3216     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3217   } else {
3218     lea(scratch_reg, src);
3219     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3220   }
3221 }
3222 
3223 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3224                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3225   if (reachable(src)) {
3226     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3227   } else {
3228     lea(scratch_reg, src);
3229     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3230   }
3231 }
3232 
3233 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3234   if (width == Assembler::Q) {
3235     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3236   } else {
3237     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3238   }
3239 }
3240 
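// vpcmpCCW synthesizes packed comparisons that have no single instruction: eq and gt map
// directly to the pcmpeq*/pcmpgt* encodings (chosen by element width below), lt swaps the
// operands of gt, and neq/le/nlt invert an eq/gt result by XOR-ing with an all-ones vector
// produced by vallones().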
3241 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3242   int eq_cond_enc = 0x29;
3243   int gt_cond_enc = 0x37;
3244   if (width != Assembler::Q) {
3245     eq_cond_enc = 0x74 + width;
3246     gt_cond_enc = 0x64 + width;
3247   }
3248   switch (cond) {
3249   case eq:
3250     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3251     break;
3252   case neq:
3253     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3254     vallones(xtmp, vector_len);
3255     vpxor(dst, xtmp, dst, vector_len);
3256     break;
3257   case le:
3258     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3259     vallones(xtmp, vector_len);
3260     vpxor(dst, xtmp, dst, vector_len);
3261     break;
3262   case nlt:
3263     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3264     vallones(xtmp, vector_len);
3265     vpxor(dst, xtmp, dst, vector_len);
3266     break;
3267   case lt:
3268     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3269     break;
3270   case nle:
3271     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3272     break;
3273   default:
3274     assert(false, "Should not reach here");
3275   }
3276 }
3277 
3278 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3279   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3280   Assembler::vpmovzxbw(dst, src, vector_len);
3281 }
3282 
3283 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3284   assert((src->encoding() < 16),"XMM register should be 0-15");
3285   Assembler::vpmovmskb(dst, src, vector_len);
3286 }
3287 
3288 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3289   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3290   Assembler::vpmullw(dst, nds, src, vector_len);
3291 }
3292 
3293 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3294   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3295   Assembler::vpmullw(dst, nds, src, vector_len);
3296 }
3297 
3298 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3299   assert((UseAVX > 0), "AVX support is needed");
3300   if (reachable(src)) {
3301     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3302   } else {
3303     lea(scratch_reg, src);
3304     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3305   }
3306 }
3307 
3308 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3309   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3310   Assembler::vpsubb(dst, nds, src, vector_len);
3311 }
3312 
3313 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3314   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3315   Assembler::vpsubb(dst, nds, src, vector_len);
3316 }
3317 
3318 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3319   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3320   Assembler::vpsubw(dst, nds, src, vector_len);
3321 }
3322 
3323 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3324   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3325   Assembler::vpsubw(dst, nds, src, vector_len);
3326 }
3327 
3328 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3329   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3330   Assembler::vpsraw(dst, nds, shift, vector_len);
3331 }
3332 
3333 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3334   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3335   Assembler::vpsraw(dst, nds, shift, vector_len);
3336 }
3337 
3338 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3339   assert(UseAVX > 2, "requires AVX-512");
3340   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3341      vector_len = 2;
3342   }
3343   Assembler::evpsraq(dst, nds, shift, vector_len);
3344 }
3345 
3346 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3347   assert(UseAVX > 2, "requires AVX-512");
3348   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3349      vector_len = 2;
3350   }
3351   Assembler::evpsraq(dst, nds, shift, vector_len);
3352 }
3353 
3354 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3355   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3356   Assembler::vpsrlw(dst, nds, shift, vector_len);
3357 }
3358 
3359 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3360   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3361   Assembler::vpsrlw(dst, nds, shift, vector_len);
3362 }
3363 
3364 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3365   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3366   Assembler::vpsllw(dst, nds, shift, vector_len);
3367 }
3368 
3369 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3370   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3371   Assembler::vpsllw(dst, nds, shift, vector_len);
3372 }
3373 
3374 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3375   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3376   Assembler::vptest(dst, src);
3377 }
3378 
3379 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3380   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3381   Assembler::punpcklbw(dst, src);
3382 }
3383 
3384 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3385   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3386   Assembler::pshufd(dst, src, mode);
3387 }
3388 
3389 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3390   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3391   Assembler::pshuflw(dst, src, mode);
3392 }
3393 
3394 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3395   if (reachable(src)) {
3396     vandpd(dst, nds, as_Address(src), vector_len);
3397   } else {
3398     lea(scratch_reg, src);
3399     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3400   }
3401 }
3402 
3403 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3404   if (reachable(src)) {
3405     vandps(dst, nds, as_Address(src), vector_len);
3406   } else {
3407     lea(scratch_reg, src);
3408     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3409   }
3410 }
3411 
3412 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3413                             bool merge, int vector_len, Register scratch_reg) {
3414   if (reachable(src)) {
3415     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3416   } else {
3417     lea(scratch_reg, src);
3418     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3419   }
3420 }
3421 
3422 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3423   if (reachable(src)) {
3424     vdivsd(dst, nds, as_Address(src));
3425   } else {
3426     lea(rscratch1, src);
3427     vdivsd(dst, nds, Address(rscratch1, 0));
3428   }
3429 }
3430 
3431 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3432   if (reachable(src)) {
3433     vdivss(dst, nds, as_Address(src));
3434   } else {
3435     lea(rscratch1, src);
3436     vdivss(dst, nds, Address(rscratch1, 0));
3437   }
3438 }
3439 
3440 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3441   if (reachable(src)) {
3442     vmulsd(dst, nds, as_Address(src));
3443   } else {
3444     lea(rscratch1, src);
3445     vmulsd(dst, nds, Address(rscratch1, 0));
3446   }
3447 }
3448 
3449 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3450   if (reachable(src)) {
3451     vmulss(dst, nds, as_Address(src));
3452   } else {
3453     lea(rscratch1, src);
3454     vmulss(dst, nds, Address(rscratch1, 0));
3455   }
3456 }
3457 
3458 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3459   if (reachable(src)) {
3460     vsubsd(dst, nds, as_Address(src));
3461   } else {
3462     lea(rscratch1, src);
3463     vsubsd(dst, nds, Address(rscratch1, 0));
3464   }
3465 }
3466 
3467 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3468   if (reachable(src)) {
3469     vsubss(dst, nds, as_Address(src));
3470   } else {
3471     lea(rscratch1, src);
3472     vsubss(dst, nds, Address(rscratch1, 0));
3473   }
3474 }
3475 
3476 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3477   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3478   vxorps(dst, nds, src, Assembler::AVX_128bit);
3479 }
3480 
3481 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3482   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3483   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3484 }
3485 
3486 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3487   if (reachable(src)) {
3488     vxorpd(dst, nds, as_Address(src), vector_len);
3489   } else {
3490     lea(scratch_reg, src);
3491     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3492   }
3493 }
3494 
3495 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3496   if (reachable(src)) {
3497     vxorps(dst, nds, as_Address(src), vector_len);
3498   } else {
3499     lea(scratch_reg, src);
3500     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3501   }
3502 }
3503 
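// AVX1 only provides the 128-bit integer vpxor; the 256-bit integer form requires AVX2.
// For wider vectors on AVX1-only hardware the code below falls back to vxorpd, which AVX1
// does support at 256 bits and which is bitwise-equivalent here.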
3504 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3505   if (UseAVX > 1 || (vector_len < 1)) {
3506     if (reachable(src)) {
3507       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3508     } else {
3509       lea(scratch_reg, src);
3510       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3511     }
3512   }
3513   else {
3514     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3515   }
3516 }
3517 
3518 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3519   if (reachable(src)) {
3520     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3521   } else {
3522     lea(scratch_reg, src);
3523     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3524   }
3525 }
3526 
3527 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3528   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3529   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3530   // The inverted mask is sign-extended
3531   andptr(possibly_jweak, inverted_jweak_mask);
3532 }
3533 
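// A jobject is a JNIHandle; weak global handles are tagged in their low bit
// (JNIHandles::weak_tag_value). Untagged handles are resolved with a plain IN_NATIVE load,
// while weak ones strip the tag and load with ON_PHANTOM_OOP_REF so the GC barrier treats
// the referent like a phantom reference.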
3534 void MacroAssembler::resolve_jobject(Register value,
3535                                      Register thread,
3536                                      Register tmp) {
3537   assert_different_registers(value, thread, tmp);
3538   Label done, not_weak;
3539   testptr(value, value);
3540   jcc(Assembler::zero, done);                // Use NULL as-is.
3541   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3542   jcc(Assembler::zero, not_weak);
3543   // Resolve jweak.
3544   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3545                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3546   verify_oop(value);
3547   jmp(done);
3548   bind(not_weak);
3549   // Resolve (untagged) jobject.
3550   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3551   verify_oop(value);
3552   bind(done);
3553 }
3554 
3555 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3556   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3557 }
3558 
3559 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3560 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3561   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3562 }
3563 
3564 void MacroAssembler::subptr(Register dst, Register src) {
3565   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3566 }
3567 
3568 // C++ bool manipulation
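// testbool sets the condition codes from a C++ bool value held in a register: with 1-byte
// bools only the low byte is significant (testb against 0xff), with 4-byte bools the whole
// register is tested; other sizes are unexpected.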
3569 void MacroAssembler::testbool(Register dst) {
3570   if (sizeof(bool) == 1)
3571     testb(dst, 0xff);
3572   else if (sizeof(bool) == 2) {
3573     // testw implementation needed for two byte bools
3574     ShouldNotReachHere();
3575   } else if (sizeof(bool) == 4)
3576     testl(dst, dst);
3577   else
3578     // unsupported
3579     ShouldNotReachHere();
3580 }
3581 
3582 void MacroAssembler::testptr(Register dst, Register src) {
3583   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3584 }
3585 
3586 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3587 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3588                                    Register var_size_in_bytes,
3589                                    int con_size_in_bytes,
3590                                    Register t1,
3591                                    Register t2,
3592                                    Label& slow_case) {
3593   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3594   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3595 }
3596 
3597 // Defines obj, preserves var_size_in_bytes
3598 void MacroAssembler::eden_allocate(Register thread, Register obj,
3599                                    Register var_size_in_bytes,
3600                                    int con_size_in_bytes,
3601                                    Register t1,
3602                                    Label& slow_case) {
3603   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3604   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3605 }
3606 
3607 // Preserves the contents of address; destroys the contents of length_in_bytes and temp.
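// The byte count is converted into a count of 8-byte chunks and the memory is cleared from
// the high end downward: one word per loop iteration on 64-bit, two per iteration on 32-bit,
// with a leftover 4-byte word (32-bit only) handled before the loop.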
3608 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3609   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3610   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3611   Label done;
3612 
3613   testptr(length_in_bytes, length_in_bytes);
3614   jcc(Assembler::zero, done);
3615 
3616   // convert the byte count into a count of 8-byte chunks; on 32-bit the carry flag records an odd leftover word that is cleared before the main loop
3617   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3618 #ifdef ASSERT
3619   {
3620     Label L;
3621     testptr(length_in_bytes, BytesPerWord - 1);
3622     jcc(Assembler::zero, L);
3623     stop("length must be a multiple of BytesPerWord");
3624     bind(L);
3625   }
3626 #endif
3627   Register index = length_in_bytes;
3628   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3629   if (UseIncDec) {
3630     shrptr(index, 3);  // divide by 8 and set carry flag if bit 2 was set
3631   } else {
3632     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3633     shrptr(index, 1);
3634   }
3635 #ifndef _LP64
3636   // the original length might not have been a multiple of 8 (i.e., bit 2 was set)
3637   {
3638     Label even;
3639     // note: if index was a multiple of 8, then it cannot
3640     //       be 0 now otherwise it must have been 0 before
3641     //       => if it is even, we don't need to check for 0 again
3642     jcc(Assembler::carryClear, even);
3643     // clear topmost word (no jump would be needed if conditional assignment worked here)
3644     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3645     // index could be 0 now, must check again
3646     jcc(Assembler::zero, done);
3647     bind(even);
3648   }
3649 #endif // !_LP64
3650   // initialize remaining object fields: index is a multiple of 2 now
3651   {
3652     Label loop;
3653     bind(loop);
3654     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3655     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3656     decrement(index);
3657     jcc(Assembler::notZero, loop);
3658   }
3659 
3660   bind(done);
3661 }
3662 
3663 // Look up the method for a megamorphic invokeinterface call.
3664 // The target method is determined by <intf_klass, itable_index>.
3665 // The receiver klass is in recv_klass.
3666 // On success, the result will be in method_result, and execution falls through.
3667 // On failure, execution transfers to the given label.
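// The itable sits directly after the vtable: a sequence of itableOffsetEntry records
// (interface, offset), terminated by a null interface, where each offset locates that
// interface's block of itableMethodEntry records within the same klass.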
3668 void MacroAssembler::lookup_interface_method(Register recv_klass,
3669                                              Register intf_klass,
3670                                              RegisterOrConstant itable_index,
3671                                              Register method_result,
3672                                              Register scan_temp,
3673                                              Label& L_no_such_interface,
3674                                              bool return_method) {
3675   assert_different_registers(recv_klass, intf_klass, scan_temp);
3676   assert_different_registers(method_result, intf_klass, scan_temp);
3677   assert(recv_klass != method_result || !return_method,
3678          "recv_klass can be destroyed when method isn't needed");
3679 
3680   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3681          "caller must use same register for non-constant itable index as for method");
3682 
3683   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3684   int vtable_base = in_bytes(Klass::vtable_start_offset());
3685   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3686   int scan_step   = itableOffsetEntry::size() * wordSize;
3687   int vte_size    = vtableEntry::size_in_bytes();
3688   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3689   assert(vte_size == wordSize, "else adjust times_vte_scale");
3690 
3691   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3692 
3693   // %%% Could store the aligned, prescaled offset in the klassoop.
3694   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3695 
3696   if (return_method) {
3697     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3698     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3699     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3700   }
3701 
3702   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3703   //   if (scan->interface() == intf) {
3704   //     result = (klass + scan->offset() + itable_index);
3705   //   }
3706   // }
3707   Label search, found_method;
3708 
3709   for (int peel = 1; peel >= 0; peel--) {
3710     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3711     cmpptr(intf_klass, method_result);
3712 
3713     if (peel) {
3714       jccb(Assembler::equal, found_method);
3715     } else {
3716       jccb(Assembler::notEqual, search);
3717       // (invert the test to fall through to found_method...)
3718     }
3719 
3720     if (!peel)  break;
3721 
3722     bind(search);
3723 
3724     // Check that the previous entry is non-null.  A null entry means that
3725     // the receiver class doesn't implement the interface, and wasn't the
3726     // same as when the caller was compiled.
3727     testptr(method_result, method_result);
3728     jcc(Assembler::zero, L_no_such_interface);
3729     addptr(scan_temp, scan_step);
3730   }
3731 
3732   bind(found_method);
3733 
3734   if (return_method) {
3735     // Got a hit.
3736     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3737     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3738   }
3739 }
3740 
3741 
3742 // virtual method calling
3743 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3744                                            RegisterOrConstant vtable_index,
3745                                            Register method_result) {
3746   const int base = in_bytes(Klass::vtable_start_offset());
3747   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3748   Address vtable_entry_addr(recv_klass,
3749                             vtable_index, Address::times_ptr,
3750                             base + vtableEntry::method_offset_in_bytes());
3751   movptr(method_result, vtable_entry_addr);
3752 }
3753 
3754 
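// Branches to L_success if sub_klass is a subtype of super_klass, trying the fast path
// (super check offset / secondary super cache) first and the slow linear scan second;
// falls through when the check fails.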
3755 void MacroAssembler::check_klass_subtype(Register sub_klass,
3756                            Register super_klass,
3757                            Register temp_reg,
3758                            Label& L_success) {
3759   Label L_failure;
3760   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3761   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3762   bind(L_failure);
3763 }
3764 
3765 
3766 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3767                                                    Register super_klass,
3768                                                    Register temp_reg,
3769                                                    Label* L_success,
3770                                                    Label* L_failure,
3771                                                    Label* L_slow_path,
3772                                         RegisterOrConstant super_check_offset) {
3773   assert_different_registers(sub_klass, super_klass, temp_reg);
3774   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3775   if (super_check_offset.is_register()) {
3776     assert_different_registers(sub_klass, super_klass,
3777                                super_check_offset.as_register());
3778   } else if (must_load_sco) {
3779     assert(temp_reg != noreg, "supply either a temp or a register offset");
3780   }
3781 
3782   Label L_fallthrough;
3783   int label_nulls = 0;
3784   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3785   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3786   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3787   assert(label_nulls <= 1, "at most one NULL in the batch");
3788 
3789   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3790   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3791   Address super_check_offset_addr(super_klass, sco_offset);
3792 
3793   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3794   // range of a jccb.  If this routine grows larger, reconsider at
3795   // least some of these.
3796 #define local_jcc(assembler_cond, label)                                \
3797   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3798   else                             jcc( assembler_cond, label) /*omit semi*/
3799 
3800   // Hacked jmp, which may only be used just before L_fallthrough.
3801 #define final_jmp(label)                                                \
3802   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3803   else                            jmp(label)                /*omit semi*/
3804 
3805   // If the pointers are equal, we are done (e.g., String[] elements).
3806   // This self-check enables sharing of secondary supertype arrays among
3807   // non-primary types such as array-of-interface.  Otherwise, each such
3808   // type would need its own customized SSA.
3809   // We move this check to the front of the fast path because many
3810   // type checks are in fact trivially successful in this manner,
3811   // so we get a nicely predicted branch right at the start of the check.
3812   cmpptr(sub_klass, super_klass);
3813   local_jcc(Assembler::equal, *L_success);
3814 
3815   // Check the supertype display:
3816   if (must_load_sco) {
3817     // Positive movl does the right thing on LP64.
3818     movl(temp_reg, super_check_offset_addr);
3819     super_check_offset = RegisterOrConstant(temp_reg);
3820   }
3821   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3822   cmpptr(super_klass, super_check_addr); // load displayed supertype
3823 
3824   // This check has worked decisively for primary supers.
3825   // Secondary supers are sought in the super_cache ('super_cache_addr').
3826   // (Secondary supers are interfaces and very deeply nested subtypes.)
3827   // This works in the same check above because of a tricky aliasing
3828   // between the super_cache and the primary super display elements.
3829   // (The 'super_check_addr' can address either, as the case requires.)
3830   // Note that the cache is updated below if it does not help us find
3831   // what we need immediately.
3832   // So if it was a primary super, we can just fail immediately.
3833   // Otherwise, it's the slow path for us (no success at this point).
3834 
3835   if (super_check_offset.is_register()) {
3836     local_jcc(Assembler::equal, *L_success);
3837     cmpl(super_check_offset.as_register(), sc_offset);
3838     if (L_failure == &L_fallthrough) {
3839       local_jcc(Assembler::equal, *L_slow_path);
3840     } else {
3841       local_jcc(Assembler::notEqual, *L_failure);
3842       final_jmp(*L_slow_path);
3843     }
3844   } else if (super_check_offset.as_constant() == sc_offset) {
3845     // Need a slow path; fast failure is impossible.
3846     if (L_slow_path == &L_fallthrough) {
3847       local_jcc(Assembler::equal, *L_success);
3848     } else {
3849       local_jcc(Assembler::notEqual, *L_slow_path);
3850       final_jmp(*L_success);
3851     }
3852   } else {
3853     // No slow path; it's a fast decision.
3854     if (L_failure == &L_fallthrough) {
3855       local_jcc(Assembler::equal, *L_success);
3856     } else {
3857       local_jcc(Assembler::notEqual, *L_failure);
3858       final_jmp(*L_success);
3859     }
3860   }
3861 
3862   bind(L_fallthrough);
3863 
3864 #undef local_jcc
3865 #undef final_jmp
3866 }
3867 
3868 
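// Slow path of the subtype check: a linear scan of sub_klass's secondary supers array using
// repne scas with rax = super_klass, rdi = array data and rcx = array length. On a hit the
// super is recorded in the secondary super cache so the next fast-path check succeeds.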
3869 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3870                                                    Register super_klass,
3871                                                    Register temp_reg,
3872                                                    Register temp2_reg,
3873                                                    Label* L_success,
3874                                                    Label* L_failure,
3875                                                    bool set_cond_codes) {
3876   assert_different_registers(sub_klass, super_klass, temp_reg);
3877   if (temp2_reg != noreg)
3878     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
3879 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
3880 
3881   Label L_fallthrough;
3882   int label_nulls = 0;
3883   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3884   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3885   assert(label_nulls <= 1, "at most one NULL in the batch");
3886 
3887   // a couple of useful fields in sub_klass:
3888   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3889   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3890   Address secondary_supers_addr(sub_klass, ss_offset);
3891   Address super_cache_addr(     sub_klass, sc_offset);
3892 
3893   // Do a linear scan of the secondary super-klass chain.
3894   // This code is rarely used, so simplicity is a virtue here.
3895   // The repne_scan instruction uses fixed registers, which we must spill.
3896   // Don't worry too much about pre-existing connections with the input regs.
3897 
3898   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
3899   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
3900 
3901   // Get super_klass value into rax (even if it was in rdi or rcx).
3902   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
3903   if (super_klass != rax || UseCompressedOops) {
3904     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
3905     mov(rax, super_klass);
3906   }
3907   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
3908   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
3909 
3910 #ifndef PRODUCT
3911   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
3912   ExternalAddress pst_counter_addr((address) pst_counter);
3913   NOT_LP64(  incrementl(pst_counter_addr) );
3914   LP64_ONLY( lea(rcx, pst_counter_addr) );
3915   LP64_ONLY( incrementl(Address(rcx, 0)) );
3916 #endif //PRODUCT
3917 
3918   // We will consult the secondary-super array.
3919   movptr(rdi, secondary_supers_addr);
3920   // Load the array length.  (Positive movl does the right thing on LP64.)
3921   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
3922   // Skip to start of data.
3923   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
3924 
3925   // Scan RCX words at [RDI] for an occurrence of RAX.
3926   // Set NZ/Z based on last compare.
3927   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
3928   // not change flags (only scas instruction which is repeated sets flags).
3929   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
3930 
3931   testptr(rax, rax); // Set Z = 0
3932   repne_scan();
3933 
3934   // Unspill the temp. registers:
3935   if (pushed_rdi)  pop(rdi);
3936   if (pushed_rcx)  pop(rcx);
3937   if (pushed_rax)  pop(rax);
3938 
3939   if (set_cond_codes) {
3940     // Special hack for the AD files:  rdi is guaranteed non-zero.
3941     assert(!pushed_rdi, "rdi must be left non-NULL");
3942     // Also, the condition codes are properly set Z/NZ on succeed/failure.
3943   }
3944 
3945   if (L_failure == &L_fallthrough)
3946         jccb(Assembler::notEqual, *L_failure);
3947   else  jcc(Assembler::notEqual, *L_failure);
3948 
3949   // Success.  Cache the super we found and proceed in triumph.
3950   movptr(super_cache_addr, super_klass);
3951 
3952   if (L_success != &L_fallthrough) {
3953     jmp(*L_success);
3954   }
3955 
3956 #undef IS_A_TEMP
3957 
3958   bind(L_fallthrough);
3959 }
3960 
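// Class initialization barrier: the fast path applies when the klass is fully initialized,
// or when it is being initialized by the current thread (so <clinit> can use its own class);
// anything else must take the slow path.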
3961 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
3962   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
3963 
3964   Label L_fallthrough;
3965   if (L_fast_path == NULL) {
3966     L_fast_path = &L_fallthrough;
3967   } else if (L_slow_path == NULL) {
3968     L_slow_path = &L_fallthrough;
3969   }
3970 
3971   // Fast path check: class is fully initialized
3972   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
3973   jcc(Assembler::equal, *L_fast_path);
3974 
3975   // Fast path check: current thread is initializer thread
3976   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
3977   if (L_slow_path == &L_fallthrough) {
3978     jcc(Assembler::equal, *L_fast_path);
3979     bind(*L_slow_path);
3980   } else if (L_fast_path == &L_fallthrough) {
3981     jcc(Assembler::notEqual, *L_slow_path);
3982     bind(*L_fast_path);
3983   } else {
3984     Unimplemented();
3985   }
3986 }
3987 
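// cmov32 emits cmovl where the CPU supports CMOV and otherwise falls back to a short branch
// around a plain movl using the negated condition.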
3988 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
3989   if (VM_Version::supports_cmov()) {
3990     cmovl(cc, dst, src);
3991   } else {
3992     Label L;
3993     jccb(negate_condition(cc), L);
3994     movl(dst, src);
3995     bind(L);
3996   }
3997 }
3998 
3999 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4000   if (VM_Version::supports_cmov()) {
4001     cmovl(cc, dst, src);
4002   } else {
4003     Label L;
4004     jccb(negate_condition(cc), L);
4005     movl(dst, src);
4006     bind(L);
4007   }
4008 }
4009 
4010 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4011   if (!VerifyOops) return;
4012 
4013   // Pass register number to verify_oop_subroutine
4014   const char* b = NULL;
4015   {
4016     ResourceMark rm;
4017     stringStream ss;
4018     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4019     b = code_string(ss.as_string());
4020   }
4021   BLOCK_COMMENT("verify_oop {");
4022 #ifdef _LP64
4023   push(rscratch1);                    // save r10, trashed by movptr()
4024 #endif
4025   push(rax);                          // save rax,
4026   push(reg);                          // pass register argument
4027   ExternalAddress buffer((address) b);
4028   // avoid using pushptr, as it modifies scratch registers
4029   // and our contract is not to modify anything
4030   movptr(rax, buffer.addr());
4031   push(rax);
4032   // call indirectly to solve generation ordering problem
4033   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4034   call(rax);
4035   // Caller pops the arguments (oop, message) and restores rax, r10
4036   BLOCK_COMMENT("} verify_oop");
4037 }
4038 
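// vallones fills dst with all-one bits: vpternlogd with immediate 0xFF when the EVEX form is
// usable (AVX-512, plus AVX512VL for vectors narrower than 512 bits), otherwise
// vpcmpeqb dst,dst,dst, which compares a register against itself.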
4039 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4040   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4041     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4042   } else {
4043     assert(UseAVX > 0, "");
4044     vpcmpeqb(dst, dst, dst, vector_len);
4045   }
4046 }
4047 
4048 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4049                                          int extra_slot_offset) {
4050   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4051   int stackElementSize = Interpreter::stackElementSize;
4052   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4053 #ifdef ASSERT
4054   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4055   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4056 #endif
4057   Register             scale_reg    = noreg;
4058   Address::ScaleFactor scale_factor = Address::no_scale;
4059   if (arg_slot.is_constant()) {
4060     offset += arg_slot.as_constant() * stackElementSize;
4061   } else {
4062     scale_reg    = arg_slot.as_register();
4063     scale_factor = Address::times(stackElementSize);
4064   }
4065   offset += wordSize;           // return PC is on stack
4066   return Address(rsp, scale_reg, scale_factor, offset);
4067 }
4068 
4069 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4070   if (!VerifyOops) return;
4071 
4072   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4073   // Pass register number to verify_oop_subroutine
4074   const char* b = NULL;
4075   {
4076     ResourceMark rm;
4077     stringStream ss;
4078     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4079     b = code_string(ss.as_string());
4080   }
4081 #ifdef _LP64
4082   push(rscratch1);                    // save r10, trashed by movptr()
4083 #endif
4084   push(rax);                          // save rax,
4085   // addr may contain rsp, so we have to adjust it based on the push
4086   // we just did (and on 64-bit we do two pushes).
4087   // NOTE: the 64-bit code seemed to have a bug in that it did movq(addr, rax), which
4088   // stores rax into addr, the reverse of what was intended.
4089   if (addr.uses(rsp)) {
4090     lea(rax, addr);
4091     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4092   } else {
4093     pushptr(addr);
4094   }
4095 
4096   ExternalAddress buffer((address) b);
4097   // pass msg argument
4098   // avoid using pushptr, as it modifies scratch registers
4099   // and our contract is not to modify anything
4100   movptr(rax, buffer.addr());
4101   push(rax);
4102 
4103   // call indirectly to solve generation ordering problem
4104   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4105   call(rax);
4106   // Caller pops the arguments (addr, message) and restores rax, r10.
4107 }
4108 
4109 void MacroAssembler::verify_tlab() {
4110 #ifdef ASSERT
4111   if (UseTLAB && VerifyOops) {
4112     Label next, ok;
4113     Register t1 = rsi;
4114     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4115 
4116     push(t1);
4117     NOT_LP64(push(thread_reg));
4118     NOT_LP64(get_thread(thread_reg));
4119 
4120     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4121     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4122     jcc(Assembler::aboveEqual, next);
4123     STOP("assert(top >= start)");
4124     should_not_reach_here();
4125 
4126     bind(next);
4127     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4128     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4129     jcc(Assembler::aboveEqual, ok);
4130     STOP("assert(top <= end)");
4131     should_not_reach_here();
4132 
4133     bind(ok);
4134     NOT_LP64(pop(thread_reg));
4135     pop(t1);
4136   }
4137 #endif
4138 }
4139 
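// The helper classes below are intended to mirror the state image saved by push_CPU_state()
// (x87 environment followed by the general-purpose registers) so that print_CPU_state() and
// verify_FPU() can decode it from C++.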
4140 class ControlWord {
4141  public:
4142   int32_t _value;
4143 
4144   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4145   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4146   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4147   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4148   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4149   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4150   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4151   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4152 
4153   void print() const {
4154     // rounding control
4155     const char* rc;
4156     switch (rounding_control()) {
4157       case 0: rc = "round near"; break;
4158       case 1: rc = "round down"; break;
4159       case 2: rc = "round up  "; break;
4160       case 3: rc = "chop      "; break;
4161       default:
4162         rc = NULL; // silence compiler warnings
4163         fatal("Unknown rounding control: %d", rounding_control());
4164     };
4165     // precision control
4166     const char* pc;
4167     switch (precision_control()) {
4168       case 0: pc = "24 bits "; break;
4169       case 1: pc = "reserved"; break;
4170       case 2: pc = "53 bits "; break;
4171       case 3: pc = "64 bits "; break;
4172       default:
4173         pc = NULL; // silence compiler warnings
4174         fatal("Unknown precision control: %d", precision_control());
4175     };
4176     // flags
4177     char f[9];
4178     f[0] = ' ';
4179     f[1] = ' ';
4180     f[2] = (precision   ()) ? 'P' : 'p';
4181     f[3] = (underflow   ()) ? 'U' : 'u';
4182     f[4] = (overflow    ()) ? 'O' : 'o';
4183     f[5] = (zero_divide ()) ? 'Z' : 'z';
4184     f[6] = (denormalized()) ? 'D' : 'd';
4185     f[7] = (invalid     ()) ? 'I' : 'i';
4186     f[8] = '\x0';
4187     // output
4188     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4189   }
4190 
4191 };
4192 
4193 class StatusWord {
4194  public:
4195   int32_t _value;
4196 
4197   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4198   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4199   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4200   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4201   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4202   int  top() const                     { return  (_value >> 11) & 7      ; }
4203   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4204   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4205   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4206   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4207   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4208   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4209   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4210   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4211 
4212   void print() const {
4213     // condition codes
4214     char c[5];
4215     c[0] = (C3()) ? '3' : '-';
4216     c[1] = (C2()) ? '2' : '-';
4217     c[2] = (C1()) ? '1' : '-';
4218     c[3] = (C0()) ? '0' : '-';
4219     c[4] = '\x0';
4220     // flags
4221     char f[9];
4222     f[0] = (error_status()) ? 'E' : '-';
4223     f[1] = (stack_fault ()) ? 'S' : '-';
4224     f[2] = (precision   ()) ? 'P' : '-';
4225     f[3] = (underflow   ()) ? 'U' : '-';
4226     f[4] = (overflow    ()) ? 'O' : '-';
4227     f[5] = (zero_divide ()) ? 'Z' : '-';
4228     f[6] = (denormalized()) ? 'D' : '-';
4229     f[7] = (invalid     ()) ? 'I' : '-';
4230     f[8] = '\x0';
4231     // output
4232     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4233   }
4234 
4235 };
4236 
4237 class TagWord {
4238  public:
4239   int32_t _value;
4240 
4241   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4242 
4243   void print() const {
4244     printf("%04x", _value & 0xFFFF);
4245   }
4246 
4247 };
4248 
4249 class FPU_Register {
4250  public:
4251   int32_t _m0;
4252   int32_t _m1;
4253   int16_t _ex;
4254 
4255   bool is_indefinite() const           {
4256     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4257   }
4258 
4259   void print() const {
4260     char  sign = (_ex < 0) ? '-' : '+';
4261     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4262     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4263   };
4264 
4265 };
4266 
4267 class FPU_State {
4268  public:
4269   enum {
4270     register_size       = 10,
4271     number_of_registers =  8,
4272     register_mask       =  7
4273   };
4274 
4275   ControlWord  _control_word;
4276   StatusWord   _status_word;
4277   TagWord      _tag_word;
4278   int32_t      _error_offset;
4279   int32_t      _error_selector;
4280   int32_t      _data_offset;
4281   int32_t      _data_selector;
4282   int8_t       _register[register_size * number_of_registers];
4283 
4284   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4285   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4286 
4287   const char* tag_as_string(int tag) const {
4288     switch (tag) {
4289       case 0: return "valid";
4290       case 1: return "zero";
4291       case 2: return "special";
4292       case 3: return "empty";
4293     }
4294     ShouldNotReachHere();
4295     return NULL;
4296   }
4297 
4298   void print() const {
4299     // print computation registers
4300     { int t = _status_word.top();
4301       for (int i = 0; i < number_of_registers; i++) {
4302         int j = (i - t) & register_mask;
4303         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4304         st(j)->print();
4305         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4306       }
4307     }
4308     printf("\n");
4309     // print control registers
4310     printf("ctrl = "); _control_word.print(); printf("\n");
4311     printf("stat = "); _status_word .print(); printf("\n");
4312     printf("tags = "); _tag_word    .print(); printf("\n");
4313   }
4314 
4315 };
4316 
4317 class Flag_Register {
4318  public:
4319   int32_t _value;
4320 
4321   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4322   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4323   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4324   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4325   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4326   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4327   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4328 
4329   void print() const {
4330     // flags
4331     char f[8];
4332     f[0] = (overflow       ()) ? 'O' : '-';
4333     f[1] = (direction      ()) ? 'D' : '-';
4334     f[2] = (sign           ()) ? 'S' : '-';
4335     f[3] = (zero           ()) ? 'Z' : '-';
4336     f[4] = (auxiliary_carry()) ? 'A' : '-';
4337     f[5] = (parity         ()) ? 'P' : '-';
4338     f[6] = (carry          ()) ? 'C' : '-';
4339     f[7] = '\x0';
4340     // output
4341     printf("%08x  flags = %s", _value, f);
4342   }
4343 
4344 };
4345 
4346 class IU_Register {
4347  public:
4348   int32_t _value;
4349 
4350   void print() const {
4351     printf("%08x  %11d", _value, _value);
4352   }
4353 
4354 };
4355 
4356 class IU_State {
4357  public:
4358   Flag_Register _eflags;
4359   IU_Register   _rdi;
4360   IU_Register   _rsi;
4361   IU_Register   _rbp;
4362   IU_Register   _rsp;
4363   IU_Register   _rbx;
4364   IU_Register   _rdx;
4365   IU_Register   _rcx;
4366   IU_Register   _rax;
4367 
4368   void print() const {
4369     // computation registers
4370     printf("rax  = "); _rax.print(); printf("\n");
4371     printf("rbx  = "); _rbx.print(); printf("\n");
4372     printf("rcx  = "); _rcx.print(); printf("\n");
4373     printf("rdx  = "); _rdx.print(); printf("\n");
4374     printf("rdi  = "); _rdi.print(); printf("\n");
4375     printf("rsi  = "); _rsi.print(); printf("\n");
4376     printf("rbp  = "); _rbp.print(); printf("\n");
4377     printf("rsp  = "); _rsp.print(); printf("\n");
4378     printf("\n");
4379     // control registers
4380     printf("flgs = "); _eflags.print(); printf("\n");
4381   }
4382 };
4383 
4384 
4385 class CPU_State {
4386  public:
4387   FPU_State _fpu_state;
4388   IU_State  _iu_state;
4389 
4390   void print() const {
4391     printf("--------------------------------------------------\n");
4392     _iu_state .print();
4393     printf("\n");
4394     _fpu_state.print();
4395     printf("--------------------------------------------------\n");
4396   }
4397 
4398 };
4399 
4400 
4401 static void _print_CPU_state(CPU_State* state) {
4402   state->print();
4403 };
4404 
4405 
4406 void MacroAssembler::print_CPU_state() {
4407   push_CPU_state();
4408   push(rsp);                // pass CPU state
4409   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4410   addptr(rsp, wordSize);       // discard argument
4411   pop_CPU_state();
4412 }
4413 
4414 
4415 #ifndef _LP64
4416 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4417   static int counter = 0;
4418   FPU_State* fs = &state->_fpu_state;
4419   counter++;
4420   // For leaf calls, only verify that the top few elements remain empty.
4421   // We only need 1 empty at the top for C2 code.
4422   if( stack_depth < 0 ) {
4423     if( fs->tag_for_st(7) != 3 ) {
4424       printf("FPR7 not empty\n");
4425       state->print();
4426       assert(false, "error");
4427       return false;
4428     }
4429     return true;                // All other stack states do not matter
4430   }
4431 
4432   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4433          "bad FPU control word");
4434 
4435   // compute stack depth
4436   int i = 0;
4437   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4438   int d = i;
4439   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4440   // verify findings
4441   if (i != FPU_State::number_of_registers) {
4442     // stack not contiguous
4443     printf("%s: stack not contiguous at ST%d\n", s, i);
4444     state->print();
4445     assert(false, "error");
4446     return false;
4447   }
4448   // check if computed stack depth corresponds to expected stack depth
4449   if (stack_depth < 0) {
4450     // expected stack depth is -stack_depth or less
4451     if (d > -stack_depth) {
4452       // too many elements on the stack
4453       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4454       state->print();
4455       assert(false, "error");
4456       return false;
4457     }
4458   } else {
4459     // expected stack depth is stack_depth
4460     if (d != stack_depth) {
4461       // wrong stack depth
4462       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4463       state->print();
4464       assert(false, "error");
4465       return false;
4466     }
4467   }
4468   // everything is cool
4469   return true;
4470 }
4471 
4472 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4473   if (!VerifyFPU) return;
4474   push_CPU_state();
4475   push(rsp);                // pass CPU state
4476   ExternalAddress msg((address) s);
4477   // pass message string s
4478   pushptr(msg.addr());
4479   push(stack_depth);        // pass stack depth
4480   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4481   addptr(rsp, 3 * wordSize);   // discard arguments
4482   // check for error
4483   { Label L;
4484     testl(rax, rax);
4485     jcc(Assembler::notZero, L);
4486     int3();                  // break if error condition
4487     bind(L);
4488   }
4489   pop_CPU_state();
4490 }
4491 #endif // _LP64
4492 
4493 void MacroAssembler::restore_cpu_control_state_after_jni() {
4494   // Either restore the MXCSR register after returning from the JNI Call
4495   // or verify that it wasn't changed (with -Xcheck:jni flag).
4496   if (VM_Version::supports_sse()) {
4497     if (RestoreMXCSROnJNICalls) {
4498       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4499     } else if (CheckJNICalls) {
4500       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4501     }
4502   }
4503   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4504   vzeroupper();
4505   // Reset k1 to 0xffff.
4506 
4507 #ifdef COMPILER2
4508   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4509     push(rcx);
4510     movl(rcx, 0xffff);
4511     kmovwl(k1, rcx);
4512     pop(rcx);
4513   }
4514 #endif // COMPILER2
4515 
4516 #ifndef _LP64
4517   // Either restore the x87 floating pointer control word after returning
4518   // from the JNI call or verify that it wasn't changed.
4519   if (CheckJNICalls) {
4520     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4521   }
#endif // !_LP64
4523 }
4524 
4525 // ((OopHandle)result).resolve();
4526 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4527   assert_different_registers(result, tmp);
4528 
4529   // Only 64 bit platforms support GCs that require a tmp register
4530   // Only IN_HEAP loads require a thread_tmp register
4531   // OopHandle::resolve is an indirection like jobject.
4532   access_load_at(T_OBJECT, IN_NATIVE,
4533                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4534 }
4535 
4536 // ((WeakHandle)result).resolve();
4537 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4538   assert_different_registers(rresult, rtmp);
4539   Label resolved;
4540 
4541   // A null weak handle resolves to null.
4542   cmpptr(rresult, 0);
4543   jcc(Assembler::equal, resolved);
4544 
4545   // Only 64 bit platforms support GCs that require a tmp register
4546   // Only IN_HEAP loads require a thread_tmp register
4547   // WeakHandle::resolve is an indirection like jweak.
4548   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4549                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4550   bind(resolved);
4551 }
4552 
4553 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4554   // get mirror
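  // The holder Klass stores its mirror as an OopHandle, hence the extra
  // resolve_oop_handle below.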
4555   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4556   load_method_holder(mirror, method);
4557   movptr(mirror, Address(mirror, mirror_offset));
4558   resolve_oop_handle(mirror, tmp);
4559 }
4560 
4561 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4562   load_method_holder(rresult, rmethod);
4563   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4564 }
4565 
4566 void MacroAssembler::load_method_holder(Register holder, Register method) {
4567   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4568   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4569   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4570 }
4571 
4572 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4573   assert_different_registers(src, tmp);
4574   assert_different_registers(dst, tmp);
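  // With compressed class pointers the klass field holds a 32-bit narrowKlass
  // that must be decoded (base + (narrow << shift)); otherwise it holds a full
  // Klass* and a plain pointer-sized load suffices.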
4575 #ifdef _LP64
4576   if (UseCompressedClassPointers) {
4577     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4578     decode_klass_not_null(dst, tmp);
4579   } else
4580 #endif
4581     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4582 }
4583 
4584 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4585   assert_different_registers(src, tmp);
4586   assert_different_registers(dst, tmp);
4587 #ifdef _LP64
4588   if (UseCompressedClassPointers) {
4589     encode_klass_not_null(src, tmp);
4590     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4591   } else
4592 #endif
4593     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4594 }
4595 
4596 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4597                                     Register tmp1, Register thread_tmp) {
4598   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4599   decorators = AccessInternal::decorator_fixup(decorators);
4600   bool as_raw = (decorators & AS_RAW) != 0;
4601   if (as_raw) {
4602     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4603   } else {
4604     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4605   }
4606 }
4607 
4608 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4609                                      Register tmp1, Register tmp2) {
4610   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4611   decorators = AccessInternal::decorator_fixup(decorators);
4612   bool as_raw = (decorators & AS_RAW) != 0;
4613   if (as_raw) {
4614     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4615   } else {
4616     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4617   }
4618 }
4619 
4620 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4621                                    Register thread_tmp, DecoratorSet decorators) {
4622   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4623 }
4624 
// Doesn't do verification, generates fixed size code
4626 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4627                                             Register thread_tmp, DecoratorSet decorators) {
4628   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4629 }
4630 
4631 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4632                                     Register tmp2, DecoratorSet decorators) {
4633   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4634 }
4635 
4636 // Used for storing NULLs.
4637 void MacroAssembler::store_heap_oop_null(Address dst) {
4638   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4639 }
4640 
4641 #ifdef _LP64
4642 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4643   if (UseCompressedClassPointers) {
4644     // Store to klass gap in destination
4645     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4646   }
4647 }
4648 
4649 #ifdef ASSERT
4650 void MacroAssembler::verify_heapbase(const char* msg) {
4651   assert (UseCompressedOops, "should be compressed");
4652   assert (Universe::heap() != NULL, "java heap should be initialized");
4653   if (CheckCompressedOops) {
4654     Label ok;
4655     push(rscratch1); // cmpptr trashes rscratch1
4656     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4657     jcc(Assembler::equal, ok);
4658     STOP(msg);
4659     bind(ok);
4660     pop(rscratch1);
4661   }
4662 }
4663 #endif
4664 
4665 // Algorithm must match oop.inline.hpp encode_heap_oop.
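// Sketch of the encoding: with heap base B (kept in r12) and shift s, a
// non-null oop p becomes (p - B) >> s; null becomes 0 because the cmov below
// substitutes the heap base before the subtraction.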
4666 void MacroAssembler::encode_heap_oop(Register r) {
4667 #ifdef ASSERT
4668   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4669 #endif
4670   verify_oop_msg(r, "broken oop in encode_heap_oop");
4671   if (CompressedOops::base() == NULL) {
4672     if (CompressedOops::shift() != 0) {
4673       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4674       shrq(r, LogMinObjAlignmentInBytes);
4675     }
4676     return;
4677   }
4678   testq(r, r);
4679   cmovq(Assembler::equal, r, r12_heapbase);
4680   subq(r, r12_heapbase);
4681   shrq(r, LogMinObjAlignmentInBytes);
4682 }
4683 
4684 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4685 #ifdef ASSERT
4686   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4687   if (CheckCompressedOops) {
4688     Label ok;
4689     testq(r, r);
4690     jcc(Assembler::notEqual, ok);
4691     STOP("null oop passed to encode_heap_oop_not_null");
4692     bind(ok);
4693   }
4694 #endif
4695   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4696   if (CompressedOops::base() != NULL) {
4697     subq(r, r12_heapbase);
4698   }
4699   if (CompressedOops::shift() != 0) {
4700     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4701     shrq(r, LogMinObjAlignmentInBytes);
4702   }
4703 }
4704 
4705 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4706 #ifdef ASSERT
4707   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4708   if (CheckCompressedOops) {
4709     Label ok;
4710     testq(src, src);
4711     jcc(Assembler::notEqual, ok);
4712     STOP("null oop passed to encode_heap_oop_not_null2");
4713     bind(ok);
4714   }
4715 #endif
4716   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4717   if (dst != src) {
4718     movq(dst, src);
4719   }
4720   if (CompressedOops::base() != NULL) {
4721     subq(dst, r12_heapbase);
4722   }
4723   if (CompressedOops::shift() != 0) {
4724     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4725     shrq(dst, LogMinObjAlignmentInBytes);
4726   }
4727 }
4728 
4729 void  MacroAssembler::decode_heap_oop(Register r) {
4730 #ifdef ASSERT
4731   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4732 #endif
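  // Decoding is the inverse: a narrow oop n != 0 becomes (n << shift) + base,
  // while 0 stays 0 (the jccb below skips the base add when the shifted value
  // is zero).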
4733   if (CompressedOops::base() == NULL) {
4734     if (CompressedOops::shift() != 0) {
4735       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4736       shlq(r, LogMinObjAlignmentInBytes);
4737     }
4738   } else {
4739     Label done;
4740     shlq(r, LogMinObjAlignmentInBytes);
4741     jccb(Assembler::equal, done);
4742     addq(r, r12_heapbase);
4743     bind(done);
4744   }
4745   verify_oop_msg(r, "broken oop in decode_heap_oop");
4746 }
4747 
4748 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4749   // Note: it will change flags
4750   assert (UseCompressedOops, "should only be used for compressed headers");
4751   assert (Universe::heap() != NULL, "java heap should be initialized");
4752   // Cannot assert, unverified entry point counts instructions (see .ad file)
4753   // vtableStubs also counts instructions in pd_code_size_limit.
4754   // Also do not verify_oop as this is called by verify_oop.
4755   if (CompressedOops::shift() != 0) {
4756     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4757     shlq(r, LogMinObjAlignmentInBytes);
4758     if (CompressedOops::base() != NULL) {
4759       addq(r, r12_heapbase);
4760     }
4761   } else {
4762     assert (CompressedOops::base() == NULL, "sanity");
4763   }
4764 }
4765 
4766 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4767   // Note: it will change flags
4768   assert (UseCompressedOops, "should only be used for compressed headers");
4769   assert (Universe::heap() != NULL, "java heap should be initialized");
4770   // Cannot assert, unverified entry point counts instructions (see .ad file)
4771   // vtableStubs also counts instructions in pd_code_size_limit.
4772   // Also do not verify_oop as this is called by verify_oop.
4773   if (CompressedOops::shift() != 0) {
4774     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4775     if (LogMinObjAlignmentInBytes == Address::times_8) {
4776       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4777     } else {
4778       if (dst != src) {
4779         movq(dst, src);
4780       }
4781       shlq(dst, LogMinObjAlignmentInBytes);
4782       if (CompressedOops::base() != NULL) {
4783         addq(dst, r12_heapbase);
4784       }
4785     }
4786   } else {
4787     assert (CompressedOops::base() == NULL, "sanity");
4788     if (dst != src) {
4789       movq(dst, src);
4790     }
4791   }
4792 }
4793 
4794 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4795   assert_different_registers(r, tmp);
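  // A narrow Klass* is (k - base) >> shift; the base is materialized with
  // mov64 because it generally does not fit in a 32-bit immediate.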
4796   if (CompressedKlassPointers::base() != NULL) {
4797     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4798     subq(r, tmp);
4799   }
4800   if (CompressedKlassPointers::shift() != 0) {
4801     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4802     shrq(r, LogKlassAlignmentInBytes);
4803   }
4804 }
4805 
4806 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4807   assert_different_registers(src, dst);
4808   if (CompressedKlassPointers::base() != NULL) {
4809     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4810     addq(dst, src);
4811   } else {
4812     movptr(dst, src);
4813   }
4814   if (CompressedKlassPointers::shift() != 0) {
4815     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4816     shrq(dst, LogKlassAlignmentInBytes);
4817   }
4818 }
4819 
4820 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4821   assert_different_registers(r, tmp);
4822   // Note: it will change flags
4823   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4824   // Cannot assert, unverified entry point counts instructions (see .ad file)
4825   // vtableStubs also counts instructions in pd_code_size_limit.
4826   // Also do not verify_oop as this is called by verify_oop.
4827   if (CompressedKlassPointers::shift() != 0) {
4828     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4829     shlq(r, LogKlassAlignmentInBytes);
4830   }
4831   if (CompressedKlassPointers::base() != NULL) {
4832     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4833     addq(r, tmp);
4834   }
4835 }
4836 
4837 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
4838   assert_different_registers(src, dst);
4839   // Note: it will change flags
4840   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4841   // Cannot assert, unverified entry point counts instructions (see .ad file)
4842   // vtableStubs also counts instructions in pd_code_size_limit.
4843   // Also do not verify_oop as this is called by verify_oop.
4844 
4845   if (CompressedKlassPointers::base() == NULL &&
4846       CompressedKlassPointers::shift() == 0) {
4847     // The best case scenario is that there is no base or shift. Then it is already
4848     // a pointer that needs nothing but a register rename.
4849     movl(dst, src);
4850   } else {
4851     if (CompressedKlassPointers::base() != NULL) {
4852       mov64(dst, (int64_t)CompressedKlassPointers::base());
4853     } else {
4854       xorq(dst, dst);
4855     }
4856     if (CompressedKlassPointers::shift() != 0) {
4857       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4858       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
4859       leaq(dst, Address(dst, src, Address::times_8, 0));
4860     } else {
4861       addq(dst, src);
4862     }
4863   }
4864 }
4865 
4866 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4867   assert (UseCompressedOops, "should only be used for compressed headers");
4868   assert (Universe::heap() != NULL, "java heap should be initialized");
4869   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4870   int oop_index = oop_recorder()->find_index(obj);
4871   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4872   mov_narrow_oop(dst, oop_index, rspec);
4873 }
4874 
4875 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
4876   assert (UseCompressedOops, "should only be used for compressed headers");
4877   assert (Universe::heap() != NULL, "java heap should be initialized");
4878   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4879   int oop_index = oop_recorder()->find_index(obj);
4880   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4881   mov_narrow_oop(dst, oop_index, rspec);
4882 }
4883 
4884 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4885   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4886   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4887   int klass_index = oop_recorder()->find_index(k);
4888   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4889   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4890 }
4891 
4892 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
4893   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4894   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4895   int klass_index = oop_recorder()->find_index(k);
4896   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4897   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4898 }
4899 
4900 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
4901   assert (UseCompressedOops, "should only be used for compressed headers");
4902   assert (Universe::heap() != NULL, "java heap should be initialized");
4903   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4904   int oop_index = oop_recorder()->find_index(obj);
4905   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4906   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4907 }
4908 
4909 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
4910   assert (UseCompressedOops, "should only be used for compressed headers");
4911   assert (Universe::heap() != NULL, "java heap should be initialized");
4912   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4913   int oop_index = oop_recorder()->find_index(obj);
4914   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4915   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4916 }
4917 
4918 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
4919   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4920   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4921   int klass_index = oop_recorder()->find_index(k);
4922   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4923   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4924 }
4925 
4926 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
4927   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4928   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4929   int klass_index = oop_recorder()->find_index(k);
4930   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4931   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4932 }
4933 
4934 void MacroAssembler::reinit_heapbase() {
4935   if (UseCompressedOops) {
4936     if (Universe::heap() != NULL) {
4937       if (CompressedOops::base() == NULL) {
4938         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
4939       } else {
4940         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
4941       }
4942     } else {
4943       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4944     }
4945   }
4946 }
4947 
4948 #endif // _LP64
4949 
4950 // C2 compiled method's prolog code.
4951 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
4952 
4953   // WARNING: Initial instruction MUST be 5 bytes or longer so that
4954   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes, but the
  // frame allocation can be either 3 or 6 bytes. So if we don't do a stack
  // bang then we must use the 6 byte frame allocation even if we have no
  // frame. :-(
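  // (The patch installed by patch_verified_entry is a jmp rel32, which is
  // 5 bytes; that is where the minimum comes from.)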
4959   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
4960 
4961   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
4962   // Remove word for return addr
4963   framesize -= wordSize;
4964   stack_bang_size -= wordSize;
4965 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require their callers to bang for them.  Be careful, though: some VM
  // calls (such as call site linkage) can use several kilobytes of stack, but
  // the stack safety zone should account for that.
4970   // See bugs 4446381, 4468289, 4497237.
4971   if (stack_bang_size > 0) {
4972     generate_stack_overflow_check(stack_bang_size);
4973 
4974     // We always push rbp, so that on return to interpreter rbp, will be
4975     // restored correctly and we can correct the stack.
4976     push(rbp);
4977     // Save caller's stack pointer into RBP if the frame pointer is preserved.
4978     if (PreserveFramePointer) {
4979       mov(rbp, rsp);
4980     }
4981     // Remove word for ebp
4982     framesize -= wordSize;
4983 
4984     // Create frame
4985     if (framesize) {
4986       subptr(rsp, framesize);
4987     }
4988   } else {
4989     // Create frame (force generation of a 4 byte immediate value)
4990     subptr_imm32(rsp, framesize);
4991 
4992     // Save RBP register now.
4993     framesize -= wordSize;
4994     movptr(Address(rsp, framesize), rbp);
4995     // Save caller's stack pointer into RBP if the frame pointer is preserved.
4996     if (PreserveFramePointer) {
4997       movptr(rbp, rsp);
4998       if (framesize > 0) {
4999         addptr(rbp, framesize);
5000       }
5001     }
5002   }
5003 
5004   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5005     framesize -= wordSize;
5006     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5007   }
5008 
5009 #ifndef _LP64
5010   // If method sets FPU control word do it now
5011   if (fp_mode_24b) {
5012     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5013   }
5014   if (UseSSE >= 2 && VerifyFPU) {
5015     verify_FPU(0, "FPU stack must be clean on entry");
5016   }
5017 #endif
5018 
5019 #ifdef ASSERT
5020   if (VerifyStackAtCalls) {
5021     Label L;
5022     push(rax);
5023     mov(rax, rsp);
5024     andptr(rax, StackAlignmentInBytes-1);
5025     cmpptr(rax, StackAlignmentInBytes-wordSize);
5026     pop(rax);
5027     jcc(Assembler::equal, L);
5028     STOP("Stack is not properly aligned!");
5029     bind(L);
5030   }
5031 #endif
5032 
5033   if (!is_stub) {
5034     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5035     bs->nmethod_entry_barrier(this);
5036   }
5037 }
5038 
5039 #if COMPILER2_OR_JVMCI
5040 
5041 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5042 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5043   // cnt - number of qwords (8-byte words).
5044   // base - start address, qword aligned.
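  // Strategy: zero in 64-byte strides, then handle the sub-64-byte tail either
  // with a masked store (AVX-512) or with a 32-byte store followed by 8-byte
  // stores.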
5045   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5046   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5047   if (use64byteVector) {
5048     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5049   } else if (MaxVectorSize >= 32) {
5050     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5051   } else {
5052     pxor(xtmp, xtmp);
5053   }
5054   jmp(L_zero_64_bytes);
5055 
5056   BIND(L_loop);
5057   if (MaxVectorSize >= 32) {
5058     fill64(base, 0, xtmp, use64byteVector);
5059   } else {
5060     movdqu(Address(base,  0), xtmp);
5061     movdqu(Address(base, 16), xtmp);
5062     movdqu(Address(base, 32), xtmp);
5063     movdqu(Address(base, 48), xtmp);
5064   }
5065   addptr(base, 64);
5066 
5067   BIND(L_zero_64_bytes);
5068   subptr(cnt, 8);
5069   jccb(Assembler::greaterEqual, L_loop);
5070 
5071   // Copy trailing 64 bytes
5072   if (use64byteVector) {
5073     addptr(cnt, 8);
5074     jccb(Assembler::equal, L_end);
5075     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5076     jmp(L_end);
5077   } else {
5078     addptr(cnt, 4);
5079     jccb(Assembler::less, L_tail);
5080     if (MaxVectorSize >= 32) {
5081       vmovdqu(Address(base, 0), xtmp);
5082     } else {
5083       movdqu(Address(base,  0), xtmp);
5084       movdqu(Address(base, 16), xtmp);
5085     }
5086   }
5087   addptr(base, 32);
5088   subptr(cnt, 4);
5089 
5090   BIND(L_tail);
5091   addptr(cnt, 4);
5092   jccb(Assembler::lessEqual, L_end);
5093   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5094     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5095   } else {
5096     decrement(cnt);
5097 
5098     BIND(L_sloop);
5099     movq(Address(base, 0), xtmp);
5100     addptr(base, 8);
5101     decrement(cnt);
5102     jccb(Assembler::greaterEqual, L_sloop);
5103   }
5104   BIND(L_end);
5105 }
5106 
5107 // Clearing constant sized memory using YMM/ZMM registers.
5108 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5109   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5110   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5111 
5112   int vector64_count = (cnt & (~0x7)) >> 3;
5113   cnt = cnt & 0x7;
5114 
5115   // 64 byte initialization loop.
5116   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5117   for (int i = 0; i < vector64_count; i++) {
5118     fill64(base, i * 64, xtmp, use64byteVector);
5119   }
5120 
5121   // Clear remaining 64 byte tail.
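  // The k-mask used below has one bit per 64-bit lane: 0x7 covers 3 qwords,
  // 0x1F covers 5, 0x3F covers 6 and 0x7F covers 7.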
5122   int disp = vector64_count * 64;
5123   if (cnt) {
5124     switch (cnt) {
5125       case 1:
5126         movq(Address(base, disp), xtmp);
5127         break;
5128       case 2:
5129         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5130         break;
5131       case 3:
5132         movl(rtmp, 0x7);
5133         kmovwl(mask, rtmp);
5134         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5135         break;
5136       case 4:
5137         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5138         break;
5139       case 5:
5140         if (use64byteVector) {
5141           movl(rtmp, 0x1F);
5142           kmovwl(mask, rtmp);
5143           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5144         } else {
5145           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5146           movq(Address(base, disp + 32), xtmp);
5147         }
5148         break;
5149       case 6:
5150         if (use64byteVector) {
5151           movl(rtmp, 0x3F);
5152           kmovwl(mask, rtmp);
5153           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5154         } else {
5155           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5156           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5157         }
5158         break;
5159       case 7:
5160         if (use64byteVector) {
5161           movl(rtmp, 0x7F);
5162           kmovwl(mask, rtmp);
5163           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5164         } else {
5165           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5166           movl(rtmp, 0x7);
5167           kmovwl(mask, rtmp);
5168           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5169         }
5170         break;
5171       default:
5172         fatal("Unexpected length : %d\n",cnt);
5173         break;
5174     }
5175   }
5176 }
5177 
5178 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5179                                bool is_large, KRegister mask) {
5180   // cnt      - number of qwords (8-byte words).
5181   // base     - start address, qword aligned.
5182   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5183   assert(base==rdi, "base register must be edi for rep stos");
5184   assert(tmp==rax,   "tmp register must be eax for rep stos");
5185   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5186   assert(InitArrayShortSize % BytesPerLong == 0,
5187     "InitArrayShortSize should be the multiple of BytesPerLong");
5188 
5189   Label DONE;
5190   if (!is_large || !UseXMMForObjInit) {
5191     xorptr(tmp, tmp);
5192   }
5193 
5194   if (!is_large) {
5195     Label LOOP, LONG;
5196     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5197     jccb(Assembler::greater, LONG);
5198 
5199     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5200 
5201     decrement(cnt);
5202     jccb(Assembler::negative, DONE); // Zero length
5203 
5204     // Use individual pointer-sized stores for small counts:
5205     BIND(LOOP);
5206     movptr(Address(base, cnt, Address::times_ptr), tmp);
5207     decrement(cnt);
5208     jccb(Assembler::greaterEqual, LOOP);
5209     jmpb(DONE);
5210 
5211     BIND(LONG);
5212   }
5213 
5214   // Use longer rep-prefixed ops for non-small counts:
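  // rep stosb/stosq implicitly use rdi (destination), rcx (count) and rax
  // (value), which is why the asserts above pin the register assignment.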
5215   if (UseFastStosb) {
5216     shlptr(cnt, 3); // convert to number of bytes
5217     rep_stosb();
5218   } else if (UseXMMForObjInit) {
5219     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5220   } else {
5221     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5222     rep_stos();
5223   }
5224 
5225   BIND(DONE);
5226 }
5227 
5228 #endif //COMPILER2_OR_JVMCI
5229 
5230 
5231 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5232                                    Register to, Register value, Register count,
5233                                    Register rtmp, XMMRegister xtmp) {
5234   ShortBranchVerifier sbv(this);
5235   assert_different_registers(to, value, count, rtmp);
5236   Label L_exit;
5237   Label L_fill_2_bytes, L_fill_4_bytes;
5238 
5239 #if defined(COMPILER2) && defined(_LP64)
  if (MaxVectorSize >= 32 &&
5241      VM_Version::supports_avx512vlbw() &&
5242      VM_Version::supports_bmi2()) {
5243     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5244     return;
5245   }
5246 #endif
5247 
5248   int shift = -1;
5249   switch (t) {
5250     case T_BYTE:
5251       shift = 2;
5252       break;
5253     case T_SHORT:
5254       shift = 1;
5255       break;
5256     case T_INT:
5257       shift = 0;
5258       break;
5259     default: ShouldNotReachHere();
5260   }
5261 
5262   if (t == T_BYTE) {
5263     andl(value, 0xff);
5264     movl(rtmp, value);
5265     shll(rtmp, 8);
5266     orl(value, rtmp);
5267   }
5268   if (t == T_SHORT) {
5269     andl(value, 0xffff);
5270   }
5271   if (t == T_BYTE || t == T_SHORT) {
5272     movl(rtmp, value);
5273     shll(rtmp, 16);
5274     orl(value, rtmp);
5275   }
5276 
5277   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5278   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5279   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5280     Label L_skip_align2;
5281     // align source address at 4 bytes address boundary
5282     if (t == T_BYTE) {
5283       Label L_skip_align1;
5284       // One byte misalignment happens only for byte arrays
5285       testptr(to, 1);
5286       jccb(Assembler::zero, L_skip_align1);
5287       movb(Address(to, 0), value);
5288       increment(to);
5289       decrement(count);
5290       BIND(L_skip_align1);
5291     }
5292     // Two bytes misalignment happens only for byte and short (char) arrays
5293     testptr(to, 2);
5294     jccb(Assembler::zero, L_skip_align2);
5295     movw(Address(to, 0), value);
5296     addptr(to, 2);
5297     subl(count, 1<<(shift-1));
5298     BIND(L_skip_align2);
5299   }
5300   if (UseSSE < 2) {
5301     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5302     // Fill 32-byte chunks
5303     subl(count, 8 << shift);
5304     jcc(Assembler::less, L_check_fill_8_bytes);
5305     align(16);
5306 
5307     BIND(L_fill_32_bytes_loop);
5308 
5309     for (int i = 0; i < 32; i += 4) {
5310       movl(Address(to, i), value);
5311     }
5312 
5313     addptr(to, 32);
5314     subl(count, 8 << shift);
5315     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5316     BIND(L_check_fill_8_bytes);
5317     addl(count, 8 << shift);
5318     jccb(Assembler::zero, L_exit);
5319     jmpb(L_fill_8_bytes);
5320 
5321     //
5322     // length is too short, just fill qwords
5323     //
5324     BIND(L_fill_8_bytes_loop);
5325     movl(Address(to, 0), value);
5326     movl(Address(to, 4), value);
5327     addptr(to, 8);
5328     BIND(L_fill_8_bytes);
5329     subl(count, 1 << (shift + 1));
5330     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5331     // fall through to fill 4 bytes
5332   } else {
5333     Label L_fill_32_bytes;
5334     if (!UseUnalignedLoadStores) {
5335       // align to 8 bytes, we know we are 4 byte aligned to start
5336       testptr(to, 4);
5337       jccb(Assembler::zero, L_fill_32_bytes);
5338       movl(Address(to, 0), value);
5339       addptr(to, 4);
5340       subl(count, 1<<shift);
5341     }
5342     BIND(L_fill_32_bytes);
5343     {
5344       assert( UseSSE >= 2, "supported cpu only" );
5345       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5346       movdl(xtmp, value);
5347       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5348         Label L_check_fill_32_bytes;
5349         if (UseAVX > 2) {
5350           // Fill 64-byte chunks
5351           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5352 
5353           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
5354           cmpl(count, VM_Version::avx3_threshold());
5355           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5356 
5357           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5358 
5359           subl(count, 16 << shift);
5360           jccb(Assembler::less, L_check_fill_32_bytes);
5361           align(16);
5362 
5363           BIND(L_fill_64_bytes_loop_avx3);
5364           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5365           addptr(to, 64);
5366           subl(count, 16 << shift);
5367           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5368           jmpb(L_check_fill_32_bytes);
5369 
5370           BIND(L_check_fill_64_bytes_avx2);
5371         }
5372         // Fill 64-byte chunks
5373         Label L_fill_64_bytes_loop;
5374         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5375 
5376         subl(count, 16 << shift);
5377         jcc(Assembler::less, L_check_fill_32_bytes);
5378         align(16);
5379 
5380         BIND(L_fill_64_bytes_loop);
5381         vmovdqu(Address(to, 0), xtmp);
5382         vmovdqu(Address(to, 32), xtmp);
5383         addptr(to, 64);
5384         subl(count, 16 << shift);
5385         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5386 
5387         BIND(L_check_fill_32_bytes);
5388         addl(count, 8 << shift);
5389         jccb(Assembler::less, L_check_fill_8_bytes);
5390         vmovdqu(Address(to, 0), xtmp);
5391         addptr(to, 32);
5392         subl(count, 8 << shift);
5393 
5394         BIND(L_check_fill_8_bytes);
5395         // clean upper bits of YMM registers
5396         movdl(xtmp, value);
5397         pshufd(xtmp, xtmp, 0);
5398       } else {
5399         // Fill 32-byte chunks
5400         pshufd(xtmp, xtmp, 0);
5401 
5402         subl(count, 8 << shift);
5403         jcc(Assembler::less, L_check_fill_8_bytes);
5404         align(16);
5405 
5406         BIND(L_fill_32_bytes_loop);
5407 
5408         if (UseUnalignedLoadStores) {
5409           movdqu(Address(to, 0), xtmp);
5410           movdqu(Address(to, 16), xtmp);
5411         } else {
5412           movq(Address(to, 0), xtmp);
5413           movq(Address(to, 8), xtmp);
5414           movq(Address(to, 16), xtmp);
5415           movq(Address(to, 24), xtmp);
5416         }
5417 
5418         addptr(to, 32);
5419         subl(count, 8 << shift);
5420         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5421 
5422         BIND(L_check_fill_8_bytes);
5423       }
5424       addl(count, 8 << shift);
5425       jccb(Assembler::zero, L_exit);
5426       jmpb(L_fill_8_bytes);
5427 
5428       //
5429       // length is too short, just fill qwords
5430       //
5431       BIND(L_fill_8_bytes_loop);
5432       movq(Address(to, 0), xtmp);
5433       addptr(to, 8);
5434       BIND(L_fill_8_bytes);
5435       subl(count, 1 << (shift + 1));
5436       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5437     }
5438   }
5439   // fill trailing 4 bytes
5440   BIND(L_fill_4_bytes);
5441   testl(count, 1<<shift);
5442   jccb(Assembler::zero, L_fill_2_bytes);
5443   movl(Address(to, 0), value);
5444   if (t == T_BYTE || t == T_SHORT) {
5445     Label L_fill_byte;
5446     addptr(to, 4);
5447     BIND(L_fill_2_bytes);
5448     // fill trailing 2 bytes
5449     testl(count, 1<<(shift-1));
5450     jccb(Assembler::zero, L_fill_byte);
5451     movw(Address(to, 0), value);
5452     if (t == T_BYTE) {
5453       addptr(to, 2);
5454       BIND(L_fill_byte);
5455       // fill trailing byte
5456       testl(count, 1);
5457       jccb(Assembler::zero, L_exit);
5458       movb(Address(to, 0), value);
5459     } else {
5460       BIND(L_fill_byte);
5461     }
5462   } else {
5463     BIND(L_fill_2_bytes);
5464   }
5465   BIND(L_exit);
5466 }
5467 
5468 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
5469   switch(type) {
5470     case T_BYTE:
5471     case T_BOOLEAN:
5472       evpbroadcastb(dst, src, vector_len);
5473       break;
5474     case T_SHORT:
5475     case T_CHAR:
5476       evpbroadcastw(dst, src, vector_len);
5477       break;
5478     case T_INT:
5479     case T_FLOAT:
5480       evpbroadcastd(dst, src, vector_len);
5481       break;
5482     case T_LONG:
5483     case T_DOUBLE:
5484       evpbroadcastq(dst, src, vector_len);
5485       break;
5486     default:
5487       fatal("Unhandled type : %s", type2name(type));
5488       break;
5489   }
5490 }
5491 
5492 // encode char[] to byte[] in ISO_8859_1 or ASCII
//   @IntrinsicCandidate
//   private static int implEncodeISOArray(byte[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = StringUTF16.getChar(sa, sp++);
//       if (c > '\u00FF')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
//
//   @IntrinsicCandidate
//   private static int implEncodeAsciiArray(char[] sa, int sp,
//                                           byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = sa[sp++];
//       if (c >= '\u0080')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
5518 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5519   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5520   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5521   Register tmp5, Register result, bool ascii) {
5522 
5523   // rsi: src
5524   // rdi: dst
5525   // rdx: len
5526   // rcx: tmp5
5527   // rax: result
5528   ShortBranchVerifier sbv(this);
5529   assert_different_registers(src, dst, len, tmp5, result);
5530   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5531 
5532   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5533   int short_mask = ascii ? 0xff80 : 0xff00;
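  // Each 16-bit char is tested against 0xff00 (ISO: c > 0xFF) or 0xff80
  // (ASCII: c >= 0x80); the 32-bit constants pack that per-char mask for two
  // chars at a time.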
5534 
5535   // set result
5536   xorl(result, result);
5537   // check for zero length
5538   testl(len, len);
5539   jcc(Assembler::zero, L_done);
5540 
5541   movl(result, len);
5542 
5543   // Setup pointers
5544   lea(src, Address(src, len, Address::times_2)); // char[]
5545   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5546   negptr(len);
5547 
5548   if (UseSSE42Intrinsics || UseAVX >= 2) {
5549     Label L_copy_8_chars, L_copy_8_chars_exit;
5550     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5551 
5552     if (UseAVX >= 2) {
5553       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5554       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5555       movdl(tmp1Reg, tmp5);
5556       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5557       jmp(L_chars_32_check);
5558 
5559       bind(L_copy_32_chars);
5560       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5561       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5562       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5563       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5564       jccb(Assembler::notZero, L_copy_32_chars_exit);
5565       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5566       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5567       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5568 
5569       bind(L_chars_32_check);
5570       addptr(len, 32);
5571       jcc(Assembler::lessEqual, L_copy_32_chars);
5572 
5573       bind(L_copy_32_chars_exit);
5574       subptr(len, 16);
5575       jccb(Assembler::greater, L_copy_16_chars_exit);
5576 
5577     } else if (UseSSE42Intrinsics) {
5578       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5579       movdl(tmp1Reg, tmp5);
5580       pshufd(tmp1Reg, tmp1Reg, 0);
5581       jmpb(L_chars_16_check);
5582     }
5583 
5584     bind(L_copy_16_chars);
5585     if (UseAVX >= 2) {
5586       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5587       vptest(tmp2Reg, tmp1Reg);
5588       jcc(Assembler::notZero, L_copy_16_chars_exit);
5589       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5590       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5591     } else {
5592       if (UseAVX > 0) {
5593         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5594         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5595         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5596       } else {
5597         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5598         por(tmp2Reg, tmp3Reg);
5599         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5600         por(tmp2Reg, tmp4Reg);
5601       }
5602       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5603       jccb(Assembler::notZero, L_copy_16_chars_exit);
5604       packuswb(tmp3Reg, tmp4Reg);
5605     }
5606     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5607 
5608     bind(L_chars_16_check);
5609     addptr(len, 16);
5610     jcc(Assembler::lessEqual, L_copy_16_chars);
5611 
5612     bind(L_copy_16_chars_exit);
5613     if (UseAVX >= 2) {
5614       // clean upper bits of YMM registers
5615       vpxor(tmp2Reg, tmp2Reg);
5616       vpxor(tmp3Reg, tmp3Reg);
5617       vpxor(tmp4Reg, tmp4Reg);
5618       movdl(tmp1Reg, tmp5);
5619       pshufd(tmp1Reg, tmp1Reg, 0);
5620     }
5621     subptr(len, 8);
5622     jccb(Assembler::greater, L_copy_8_chars_exit);
5623 
5624     bind(L_copy_8_chars);
5625     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5626     ptest(tmp3Reg, tmp1Reg);
5627     jccb(Assembler::notZero, L_copy_8_chars_exit);
5628     packuswb(tmp3Reg, tmp1Reg);
5629     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5630     addptr(len, 8);
5631     jccb(Assembler::lessEqual, L_copy_8_chars);
5632 
5633     bind(L_copy_8_chars_exit);
5634     subptr(len, 8);
5635     jccb(Assembler::zero, L_done);
5636   }
5637 
5638   bind(L_copy_1_char);
5639   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5640   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
5641   jccb(Assembler::notZero, L_copy_1_char_exit);
5642   movb(Address(dst, len, Address::times_1, 0), tmp5);
5643   addptr(len, 1);
5644   jccb(Assembler::less, L_copy_1_char);
5645 
5646   bind(L_copy_1_char_exit);
5647   addptr(result, len); // len is negative count of not processed elements
5648 
5649   bind(L_done);
5650 }
5651 
5652 #ifdef _LP64
5653 /**
5654  * Helper for multiply_to_len().
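 * Adds src1 and src2 to the 128-bit value dest_hi:dest_lo, propagating each
 * carry into dest_hi with adcq.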
5655  */
5656 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5657   addq(dest_lo, src1);
5658   adcq(dest_hi, 0);
5659   addq(dest_lo, src2);
5660   adcq(dest_hi, 0);
5661 }
5662 
5663 /**
5664  * Multiply 64 bit by 64 bit first loop.
5665  */
5666 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5667                                            Register y, Register y_idx, Register z,
5668                                            Register carry, Register product,
5669                                            Register idx, Register kdx) {
5670   //
5671   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5673   //    huge_128 product = y[idx] * x[xstart] + carry;
5674   //    z[kdx] = (jlong)product;
5675   //    carry  = (jlong)(product >>> 64);
5676   //  }
5677   //  z[xstart] = carry;
5678   //
5679 
5680   Label L_first_loop, L_first_loop_exit;
5681   Label L_one_x, L_one_y, L_multiply;
5682 
5683   decrementl(xstart);
5684   jcc(Assembler::negative, L_one_x);
5685 
5686   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5687   rorq(x_xstart, 32); // convert big-endian to little-endian
5688 
5689   bind(L_first_loop);
5690   decrementl(idx);
5691   jcc(Assembler::negative, L_first_loop_exit);
5692   decrementl(idx);
5693   jcc(Assembler::negative, L_one_y);
5694   movq(y_idx, Address(y, idx, Address::times_4,  0));
5695   rorq(y_idx, 32); // convert big-endian to little-endian
5696   bind(L_multiply);
5697   movq(product, x_xstart);
5698   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5699   addq(product, carry);
5700   adcq(rdx, 0);
5701   subl(kdx, 2);
5702   movl(Address(z, kdx, Address::times_4,  4), product);
5703   shrq(product, 32);
5704   movl(Address(z, kdx, Address::times_4,  0), product);
5705   movq(carry, rdx);
5706   jmp(L_first_loop);
5707 
5708   bind(L_one_y);
5709   movl(y_idx, Address(y,  0));
5710   jmp(L_multiply);
5711 
5712   bind(L_one_x);
5713   movl(x_xstart, Address(x,  0));
5714   jmp(L_first_loop);
5715 
5716   bind(L_first_loop_exit);
5717 }
5718 
5719 /**
5720  * Multiply 64 bit by 64 bit and add 128 bit.
5721  */
5722 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5723                                             Register yz_idx, Register idx,
5724                                             Register carry, Register product, int offset) {
5725   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5726   //     z[kdx] = (jlong)product;
5727 
5728   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5729   rorq(yz_idx, 32); // convert big-endian to little-endian
5730   movq(product, x_xstart);
5731   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5732   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5733   rorq(yz_idx, 32); // convert big-endian to little-endian
5734 
5735   add2_with_carry(rdx, product, carry, yz_idx);
5736 
5737   movl(Address(z, idx, Address::times_4,  offset+4), product);
5738   shrq(product, 32);
  movl(Address(z, idx, Address::times_4,  offset), product);
}
5742 
5743 /**
5744  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5745  */
5746 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5747                                              Register yz_idx, Register idx, Register jdx,
5748                                              Register carry, Register product,
5749                                              Register carry2) {
5750   //   jlong carry, x[], y[], z[];
5751   //   int kdx = ystart+1;
5752   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5753   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5754   //     z[kdx+idx+1] = (jlong)product;
5755   //     jlong carry2  = (jlong)(product >>> 64);
5756   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5757   //     z[kdx+idx] = (jlong)product;
5758   //     carry  = (jlong)(product >>> 64);
5759   //   }
5760   //   idx += 2;
5761   //   if (idx > 0) {
5762   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5763   //     z[kdx+idx] = (jlong)product;
5764   //     carry  = (jlong)(product >>> 64);
5765   //   }
5766   //
5767 
5768   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5769 
5770   movl(jdx, idx);
5771   andl(jdx, 0xFFFFFFFC);
5772   shrl(jdx, 2);
5773 
5774   bind(L_third_loop);
5775   subl(jdx, 1);
5776   jcc(Assembler::negative, L_third_loop_exit);
5777   subl(idx, 4);
5778 
5779   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5780   movq(carry2, rdx);
5781 
5782   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5783   movq(carry, rdx);
5784   jmp(L_third_loop);
5785 
5786   bind (L_third_loop_exit);
5787 
5788   andl (idx, 0x3);
5789   jcc(Assembler::zero, L_post_third_loop_done);
5790 
5791   Label L_check_1;
5792   subl(idx, 2);
5793   jcc(Assembler::negative, L_check_1);
5794 
5795   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5796   movq(carry, rdx);
5797 
5798   bind (L_check_1);
5799   addl (idx, 0x2);
5800   andl (idx, 0x1);
5801   subl(idx, 1);
5802   jcc(Assembler::negative, L_post_third_loop_done);
5803 
5804   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5805   movq(product, x_xstart);
5806   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5807   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5808 
5809   add2_with_carry(rdx, product, yz_idx, carry);
5810 
5811   movl(Address(z, idx, Address::times_4,  0), product);
5812   shrq(product, 32);
5813 
5814   shlq(rdx, 32);
5815   orq(product, rdx);
5816   movq(carry, product);
5817 
5818   bind(L_post_third_loop_done);
5819 }
5820 
5821 /**
5822  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5823  *
5824  */
5825 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
5826                                                   Register carry, Register carry2,
5827                                                   Register idx, Register jdx,
5828                                                   Register yz_idx1, Register yz_idx2,
5829                                                   Register tmp, Register tmp3, Register tmp4) {
5830   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
5831 
5832   //   jlong carry, x[], y[], z[];
5833   //   int kdx = ystart+1;
5834   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5835   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
5836   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5837   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
5838   //     carry  = (jlong)(tmp4 >>> 64);
5839   //     z[kdx+idx+1] = (jlong)tmp3;
5840   //     z[kdx+idx] = (jlong)tmp4;
5841   //   }
5842   //   idx += 2;
5843   //   if (idx > 0) {
5844   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
5845   //     z[kdx+idx] = (jlong)yz_idx1;
5846   //     carry  = (jlong)(yz_idx1 >>> 64);
5847   //   }
5848   //
5849 
5850   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5851 
5852   movl(jdx, idx);
5853   andl(jdx, 0xFFFFFFFC);
5854   shrl(jdx, 2);
5855 
5856   bind(L_third_loop);
5857   subl(jdx, 1);
5858   jcc(Assembler::negative, L_third_loop_exit);
5859   subl(idx, 4);
5860 
5861   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
5862   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5863   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
5864   rorxq(yz_idx2, yz_idx2, 32);
5865 
5866   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
5867   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
5868 
5869   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
5870   rorxq(yz_idx1, yz_idx1, 32);
5871   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5872   rorxq(yz_idx2, yz_idx2, 32);
5873 
5874   if (VM_Version::supports_adx()) {
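    // adcx propagates carries through CF only and adox through OF only, so the
    // two accumulation chains below can be interleaved without clobbering each
    // other's carry.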
5875     adcxq(tmp3, carry);
5876     adoxq(tmp3, yz_idx1);
5877 
5878     adcxq(tmp4, tmp);
5879     adoxq(tmp4, yz_idx2);
5880 
5881     movl(carry, 0); // does not affect flags
5882     adcxq(carry2, carry);
5883     adoxq(carry2, carry);
5884   } else {
5885     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
5886     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
5887   }
5888   movq(carry, carry2);
5889 
5890   movl(Address(z, idx, Address::times_4, 12), tmp3);
5891   shrq(tmp3, 32);
5892   movl(Address(z, idx, Address::times_4,  8), tmp3);
5893 
5894   movl(Address(z, idx, Address::times_4,  4), tmp4);
5895   shrq(tmp4, 32);
5896   movl(Address(z, idx, Address::times_4,  0), tmp4);
5897 
5898   jmp(L_third_loop);
5899 
5900   bind (L_third_loop_exit);
5901 
5902   andl (idx, 0x3);
5903   jcc(Assembler::zero, L_post_third_loop_done);
5904 
5905   Label L_check_1;
5906   subl(idx, 2);
5907   jcc(Assembler::negative, L_check_1);
5908 
5909   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
5910   rorxq(yz_idx1, yz_idx1, 32);
5911   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
5912   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5913   rorxq(yz_idx2, yz_idx2, 32);
5914 
5915   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
5916 
5917   movl(Address(z, idx, Address::times_4,  4), tmp3);
5918   shrq(tmp3, 32);
5919   movl(Address(z, idx, Address::times_4,  0), tmp3);
5920   movq(carry, tmp4);
5921 
5922   bind (L_check_1);
5923   addl (idx, 0x2);
5924   andl (idx, 0x1);
5925   subl(idx, 1);
5926   jcc(Assembler::negative, L_post_third_loop_done);
5927   movl(tmp4, Address(y, idx, Address::times_4,  0));
5928   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
5929   movl(tmp4, Address(z, idx, Address::times_4,  0));
5930 
5931   add2_with_carry(carry2, tmp3, tmp4, carry);
5932 
5933   movl(Address(z, idx, Address::times_4,  0), tmp3);
5934   shrq(tmp3, 32);
5935 
5936   shlq(carry2, 32);
5937   orq(tmp3, carry2);
5938   movq(carry, tmp3);
5939 
5940   bind(L_post_third_loop_done);
5941 }
5942 
5943 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
5945  *
5946  * rdi: x
5947  * rax: xlen
5948  * rsi: y
5949  * rcx: ylen
5950  * r8:  z
5951  * r11: zlen
5952  * r12: tmp1
5953  * r13: tmp2
5954  * r14: tmp3
5955  * r15: tmp4
5956  * rbx: tmp5
5957  *
5958  */
5959 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
5960                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
5961   ShortBranchVerifier sbv(this);
5962   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
5963 
5964   push(tmp1);
5965   push(tmp2);
5966   push(tmp3);
5967   push(tmp4);
5968   push(tmp5);
5969 
5970   push(xlen);
5971   push(zlen);
5972 
5973   const Register idx = tmp1;
5974   const Register kdx = tmp2;
5975   const Register xstart = tmp3;
5976 
5977   const Register y_idx = tmp4;
5978   const Register carry = tmp5;
5979   const Register product  = xlen;
5980   const Register x_xstart = zlen;  // reuse register
5981 
5982   // First Loop.
5983   //
5984   //  final static long LONG_MASK = 0xffffffffL;
5985   //  int xstart = xlen - 1;
5986   //  int ystart = ylen - 1;
5987   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5989   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5990   //    z[kdx] = (int)product;
5991   //    carry = product >>> 32;
5992   //  }
5993   //  z[xstart] = (int)carry;
5994   //
5995 
5996   movl(idx, ylen);      // idx = ylen;
5997   movl(kdx, zlen);      // kdx = xlen+ylen;
5998   xorq(carry, carry);   // carry = 0;
5999 
6000   Label L_done;
6001 
6002   movl(xstart, xlen);
6003   decrementl(xstart);
6004   jcc(Assembler::negative, L_done);
6005 
6006   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6007 
6008   Label L_second_loop;
6009   testl(kdx, kdx);
6010   jcc(Assembler::zero, L_second_loop);
6011 
6012   Label L_carry;
6013   subl(kdx, 1);
6014   jcc(Assembler::zero, L_carry);
6015 
6016   movl(Address(z, kdx, Address::times_4,  0), carry);
6017   shrq(carry, 32);
6018   subl(kdx, 1);
6019 
6020   bind(L_carry);
6021   movl(Address(z, kdx, Address::times_4,  0), carry);
6022 
6023   // Second and third (nested) loops.
6024   //
6025   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6026   //   carry = 0;
6027   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6028   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6029   //                    (z[k] & LONG_MASK) + carry;
6030   //     z[k] = (int)product;
6031   //     carry = product >>> 32;
6032   //   }
6033   //   z[i] = (int)carry;
6034   // }
6035   //
6036   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6037 
6038   const Register jdx = tmp1;
6039 
6040   bind(L_second_loop);
6041   xorl(carry, carry);    // carry = 0;
6042   movl(jdx, ylen);       // j = ystart+1
6043 
6044   subl(xstart, 1);       // i = xstart-1;
6045   jcc(Assembler::negative, L_done);
6046 
6047   push (z);
6048 
6049   Label L_last_x;
6050   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6051   subl(xstart, 1);       // i = xstart-1;
6052   jcc(Assembler::negative, L_last_x);
6053 
6054   if (UseBMI2Instructions) {
6055     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6056     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6057   } else {
6058     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6059     rorq(x_xstart, 32);  // convert big-endian to little-endian
6060   }
6061 
6062   Label L_third_loop_prologue;
6063   bind(L_third_loop_prologue);
6064 
6065   push (x);
6066   push (xstart);
6067   push (ylen);
6068 
6069 
6070   if (UseBMI2Instructions) {
6071     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6072   } else { // !UseBMI2Instructions
6073     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6074   }
6075 
6076   pop(ylen);
6077   pop(xlen);
6078   pop(x);
6079   pop(z);
6080 
6081   movl(tmp3, xlen);
6082   addl(tmp3, 1);
6083   movl(Address(z, tmp3, Address::times_4,  0), carry);
6084   subl(tmp3, 1);
6085   jccb(Assembler::negative, L_done);
6086 
6087   shrq(carry, 32);
6088   movl(Address(z, tmp3, Address::times_4,  0), carry);
6089   jmp(L_second_loop);
6090 
6091   // Next infrequent code is moved outside loops.
6092   bind(L_last_x);
6093   if (UseBMI2Instructions) {
6094     movl(rdx, Address(x,  0));
6095   } else {
6096     movl(x_xstart, Address(x,  0));
6097   }
6098   jmp(L_third_loop_prologue);
6099 
6100   bind(L_done);
6101 
6102   pop(zlen);
6103   pop(xlen);
6104 
6105   pop(tmp5);
6106   pop(tmp4);
6107   pop(tmp3);
6108   pop(tmp2);
6109   pop(tmp1);
6110 }
6111 
6112 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6113   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6114   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6115   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6116   Label VECTOR8_TAIL, VECTOR4_TAIL;
6117   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6118   Label SAME_TILL_END, DONE;
6119   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6120 
6121   //scale is in rcx in both Win64 and Unix
6122   ShortBranchVerifier sbv(this);
6123 
6124   shlq(length);
6125   xorq(result, result);
6126 
6127   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6128       VM_Version::supports_avx512vlbw()) {
6129     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6130 
6131     cmpq(length, 64);
6132     jcc(Assembler::less, VECTOR32_TAIL);
6133 
6134     movq(tmp1, length);
6135     andq(tmp1, 0x3F);      // tail count
6136     andq(length, ~(0x3F)); //vector count
6137 
6138     bind(VECTOR64_LOOP);
6139     // AVX512 code to compare 64 byte vectors.
6140     evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
6141     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6142     kortestql(k7, k7);
6143     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6144     addq(result, 64);
6145     subq(length, 64);
6146     jccb(Assembler::notZero, VECTOR64_LOOP);
6147 
6149     testq(tmp1, tmp1);
6150     jcc(Assembler::zero, SAME_TILL_END);
6151 
6152     //bind(VECTOR64_TAIL);
6153     // AVX512 code to compare up to 63 byte vectors.
6154     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6155     shlxq(tmp2, tmp2, tmp1);
6156     notq(tmp2);
6157     kmovql(k3, tmp2);
6158 
6159     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6160     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6161 
6162     ktestql(k7, k3);
6163     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6164 
6165     bind(VECTOR64_NOT_EQUAL);
6166     kmovql(tmp1, k7);
6167     notq(tmp1);
6168     tzcntq(tmp1, tmp1);
6169     addq(result, tmp1);
6170     shrq(result);
6171     jmp(DONE);
6172     bind(VECTOR32_TAIL);
6173   }
6174 
6175   cmpq(length, 8);
6176   jcc(Assembler::equal, VECTOR8_LOOP);
6177   jcc(Assembler::less, VECTOR4_TAIL);
6178 
6179   if (UseAVX >= 2) {
6180     Label VECTOR16_TAIL, VECTOR32_LOOP;
6181 
6182     cmpq(length, 16);
6183     jcc(Assembler::equal, VECTOR16_LOOP);
6184     jcc(Assembler::less, VECTOR8_LOOP);
6185 
6186     cmpq(length, 32);
6187     jccb(Assembler::less, VECTOR16_TAIL);
6188 
6189     subq(length, 32);
6190     bind(VECTOR32_LOOP);
6191     vmovdqu(rymm0, Address(obja, result));
6192     vmovdqu(rymm1, Address(objb, result));
6193     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6194     vptest(rymm2, rymm2);
6195     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6196     addq(result, 32);
6197     subq(length, 32);
6198     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6199     addq(length, 32);
6200     jcc(Assembler::equal, SAME_TILL_END);
6201     //falling through if less than 32 bytes left //close the branch here.
6202 
6203     bind(VECTOR16_TAIL);
6204     cmpq(length, 16);
6205     jccb(Assembler::less, VECTOR8_TAIL);
6206     bind(VECTOR16_LOOP);
6207     movdqu(rymm0, Address(obja, result));
6208     movdqu(rymm1, Address(objb, result));
6209     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6210     ptest(rymm2, rymm2);
6211     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6212     addq(result, 16);
6213     subq(length, 16);
6214     jcc(Assembler::equal, SAME_TILL_END);
6215     //falling through if less than 16 bytes left
6216   } else {//regular intrinsics
6217 
6218     cmpq(length, 16);
6219     jccb(Assembler::less, VECTOR8_TAIL);
6220 
6221     subq(length, 16);
6222     bind(VECTOR16_LOOP);
6223     movdqu(rymm0, Address(obja, result));
6224     movdqu(rymm1, Address(objb, result));
6225     pxor(rymm0, rymm1);
6226     ptest(rymm0, rymm0);
6227     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6228     addq(result, 16);
6229     subq(length, 16);
6230     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6231     addq(length, 16);
6232     jcc(Assembler::equal, SAME_TILL_END);
6233     //falling through if less than 16 bytes left
6234   }
6235 
6236   bind(VECTOR8_TAIL);
6237   cmpq(length, 8);
6238   jccb(Assembler::less, VECTOR4_TAIL);
6239   bind(VECTOR8_LOOP);
6240   movq(tmp1, Address(obja, result));
6241   movq(tmp2, Address(objb, result));
6242   xorq(tmp1, tmp2);
6243   testq(tmp1, tmp1);
6244   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6245   addq(result, 8);
6246   subq(length, 8);
6247   jcc(Assembler::equal, SAME_TILL_END);
6248   //falling through if less than 8 bytes left
6249 
6250   bind(VECTOR4_TAIL);
6251   cmpq(length, 4);
6252   jccb(Assembler::less, BYTES_TAIL);
6253   bind(VECTOR4_LOOP);
6254   movl(tmp1, Address(obja, result));
6255   xorl(tmp1, Address(objb, result));
6256   testl(tmp1, tmp1);
6257   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6258   addq(result, 4);
6259   subq(length, 4);
6260   jcc(Assembler::equal, SAME_TILL_END);
6261   //falling through if less than 4 bytes left
6262 
6263   bind(BYTES_TAIL);
6264   bind(BYTES_LOOP);
6265   load_unsigned_byte(tmp1, Address(obja, result));
6266   load_unsigned_byte(tmp2, Address(objb, result));
6267   xorl(tmp1, tmp2);
6268   testl(tmp1, tmp1);
6269   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6270   decq(length);
6271   jcc(Assembler::zero, SAME_TILL_END);
6272   incq(result);
6273   load_unsigned_byte(tmp1, Address(obja, result));
6274   load_unsigned_byte(tmp2, Address(objb, result));
6275   xorl(tmp1, tmp2);
6276   testl(tmp1, tmp1);
6277   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6278   decq(length);
6279   jcc(Assembler::zero, SAME_TILL_END);
6280   incq(result);
6281   load_unsigned_byte(tmp1, Address(obja, result));
6282   load_unsigned_byte(tmp2, Address(objb, result));
6283   xorl(tmp1, tmp2);
6284   testl(tmp1, tmp1);
6285   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6286   jmp(SAME_TILL_END);
6287 
6288   if (UseAVX >= 2) {
6289     bind(VECTOR32_NOT_EQUAL);
6290     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6291     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6292     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6293     vpmovmskb(tmp1, rymm0);
6294     bsfq(tmp1, tmp1);
6295     addq(result, tmp1);
6296     shrq(result);
6297     jmp(DONE);
6298   }
6299 
6300   bind(VECTOR16_NOT_EQUAL);
6301   if (UseAVX >= 2) {
6302     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6303     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6304     pxor(rymm0, rymm2);
6305   } else {
6306     pcmpeqb(rymm2, rymm2);
6307     pxor(rymm0, rymm1);
6308     pcmpeqb(rymm0, rymm1);
6309     pxor(rymm0, rymm2);
6310   }
6311   pmovmskb(tmp1, rymm0);
6312   bsfq(tmp1, tmp1);
6313   addq(result, tmp1);
6314   shrq(result);
6315   jmpb(DONE);
6316 
6317   bind(VECTOR8_NOT_EQUAL);
6318   bind(VECTOR4_NOT_EQUAL);
6319   bsfq(tmp1, tmp1);
6320   shrq(tmp1, 3);
6321   addq(result, tmp1);
6322   bind(BYTES_NOT_EQUAL);
6323   shrq(result);
6324   jmpb(DONE);
6325 
6326   bind(SAME_TILL_END);
6327   mov64(result, -1);
6328 
6329   bind(DONE);
6330 }
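// A rough sketch of the contract implemented above (not the generated code): with
// scale = log2(element size) held in rcx, the routine behaves approximately like
//
//   for (long i = 0; i < length; i++)        // length is in elements on entry
//     if (the elements of obja and objb at index i differ)
//       return i;                            // index of the first mismatching element
//   return -1;                               // ranges are equal
//
// The mismatch position is located with wide vector compares, narrowed byte by byte,
// and converted back to an element index by the final shrq(result) by the scale.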
6331 
6332 //Helper functions for square_to_len()
6333 
6334 /**
6335  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6336  * Preserves x and z and modifies rest of the registers.
6337  */
6338 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6339   // Perform square and right shift by 1
6340   // Handle odd xlen case first, then for even xlen do the following
6341   // jlong carry = 0;
6342   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6343   //     huge_128 product = x[j:j+1] * x[j:j+1];
6344   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6345   //     z[i+2:i+3] = (jlong)(product >>> 1);
6346   //     carry = (jlong)product;
6347   // }
6348 
6349   xorq(tmp5, tmp5);     // carry
6350   xorq(rdxReg, rdxReg);
6351   xorl(tmp1, tmp1);     // index for x
6352   xorl(tmp4, tmp4);     // index for z
6353 
6354   Label L_first_loop, L_first_loop_exit;
6355 
6356   testl(xlen, 1);
6357   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6358 
6359   // Square and right shift by 1 the odd element using 32 bit multiply
6360   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6361   imulq(raxReg, raxReg);
6362   shrq(raxReg, 1);
6363   adcq(tmp5, 0);
6364   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6365   incrementl(tmp1);
6366   addl(tmp4, 2);
6367 
6368   // Square and  right shift by 1 the rest using 64 bit multiply
6369   bind(L_first_loop);
6370   cmpptr(tmp1, xlen);
6371   jccb(Assembler::equal, L_first_loop_exit);
6372 
6373   // Square
6374   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6375   rorq(raxReg, 32);    // convert big-endian to little-endian
6376   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6377 
6378   // Right shift by 1 and save carry
6379   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6380   rcrq(rdxReg, 1);
6381   rcrq(raxReg, 1);
6382   adcq(tmp5, 0);
6383 
6384   // Store result in z
6385   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6386   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6387 
6388   // Update indices for x and z
6389   addl(tmp1, 2);
6390   addl(tmp4, 4);
6391   jmp(L_first_loop);
6392 
6393   bind(L_first_loop_exit);
6394 }
6395 
6396 
6397 /**
6398  * Perform the following multiply add operation using BMI2 instructions
6399  * carry:sum = sum + op1*op2 + carry
6400  * op2 should be in rdx
6401  * op2 is preserved, all other registers are modified
6402  */
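// A rough reference for the computation below (a sketch assuming 128-bit arithmetic,
// e.g. unsigned __int128; not part of the generated code):
//
//   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
//   sum   = (uint64_t)t;          // low 64 bits
//   carry = (uint64_t)(t >> 64);  // high 64 bits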
6403 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6404   // assert op2 is rdx
6405   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6406   addq(sum, carry);
6407   adcq(tmp2, 0);
6408   addq(sum, op1);
6409   adcq(tmp2, 0);
6410   movq(carry, tmp2);
6411 }
6412 
6413 /**
6414  * Perform the following multiply add operation:
6415  * carry:sum = sum + op1*op2 + carry
6416  * Preserves op1, op2 and modifies rest of registers
6417  */
6418 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6419   // rdx:rax = op1 * op2
6420   movq(raxReg, op2);
6421   mulq(op1);
6422 
6423   //  rdx:rax = sum + carry + rdx:rax
6424   addq(sum, carry);
6425   adcq(rdxReg, 0);
6426   addq(sum, raxReg);
6427   adcq(rdxReg, 0);
6428 
6429   // carry:sum = rdx:sum
6430   movq(carry, rdxReg);
6431 }
6432 
6433 /**
6434  * Add 64 bit long carry into z[] with carry propagation.
6435  * Preserves z and carry register values and modifies rest of registers.
6436  *
6437  */
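// Rough effect of the code below (z[] is big-endian, indexed in 32-bit words; a sketch):
//
//   z[zlen-2 .. zlen-1] += carry;               // 64-bit add into the lowest quadword
//   while (the add carried out && higher quadwords remain)
//     z[next higher quadword pair] += 1;        // propagate the carry upward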
6438 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6439   Label L_fourth_loop, L_fourth_loop_exit;
6440 
6441   movl(tmp1, 1);
6442   subl(zlen, 2);
6443   addq(Address(z, zlen, Address::times_4, 0), carry);
6444 
6445   bind(L_fourth_loop);
6446   jccb(Assembler::carryClear, L_fourth_loop_exit);
6447   subl(zlen, 2);
6448   jccb(Assembler::negative, L_fourth_loop_exit);
6449   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6450   jmp(L_fourth_loop);
6451   bind(L_fourth_loop_exit);
6452 }
6453 
6454 /**
6455  * Shift z[] left by 1 bit.
6456  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6457  *
6458  */
6459 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6460 
6461   Label L_fifth_loop, L_fifth_loop_exit;
6462 
6463   // Fifth loop
6464   // Perform primitiveLeftShift(z, zlen, 1)
6465 
6466   const Register prev_carry = tmp1;
6467   const Register new_carry = tmp4;
6468   const Register value = tmp2;
6469   const Register zidx = tmp3;
6470 
6471   // int zidx, carry;
6472   // long value;
6473   // carry = 0;
6474   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6475   //    (carry:value)  = (z[i] << 1) | carry ;
6476   //    z[i] = value;
6477   // }
6478 
6479   movl(zidx, zlen);
6480   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6481 
6482   bind(L_fifth_loop);
6483   decl(zidx);  // Use decl to preserve carry flag
6484   decl(zidx);
6485   jccb(Assembler::negative, L_fifth_loop_exit);
6486 
6487   if (UseBMI2Instructions) {
6488      movq(value, Address(z, zidx, Address::times_4, 0));
6489      rclq(value, 1);
6490      rorxq(value, value, 32);
6491      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6492   }
6493   else {
6494     // clear new_carry
6495     xorl(new_carry, new_carry);
6496 
6497     // Shift z[i] by 1, or in previous carry and save new carry
6498     movq(value, Address(z, zidx, Address::times_4, 0));
6499     shlq(value, 1);
6500     adcl(new_carry, 0);
6501 
6502     orq(value, prev_carry);
6503     rorq(value, 0x20);
6504     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6505 
6506     // Set previous carry = new carry
6507     movl(prev_carry, new_carry);
6508   }
6509   jmp(L_fifth_loop);
6510 
6511   bind(L_fifth_loop_exit);
6512 }
6513 
6514 
6515 /**
6516  * Code for BigInteger::squareToLen() intrinsic
6517  *
6518  * rdi: x
6519  * rsi: len
6520  * r8:  z
6521  * rcx: zlen
6522  * r12: tmp1
6523  * r13: tmp2
6524  * r14: tmp3
6525  * r15: tmp4
6526  * rbx: tmp5
6527  *
6528  */
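// Overall structure (a sketch of the classic squaring decomposition this intrinsic follows):
//
//   x*x = (sum of diagonal terms x[i]*x[i]) + 2 * (sum of off-diagonal terms x[i]*x[j], i < j)
//
// The first loop stores the diagonal squares shifted right by one bit, the nested loops add
// the off-diagonal products, and lshift_by_1 plus the final "z[zlen-1] |= x[len-1] & 1"
// restore the doubling and the dropped low bit.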
6529 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6530 
6531   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6532   push(tmp1);
6533   push(tmp2);
6534   push(tmp3);
6535   push(tmp4);
6536   push(tmp5);
6537 
6538   // First loop
6539   // Store the squares, right shifted one bit (i.e., divided by 2).
6540   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6541 
6542   // Add in off-diagonal sums.
6543   //
6544   // Second, third (nested) and fourth loops.
6545   // zlen +=2;
6546   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6547   //    carry = 0;
6548   //    long op2 = x[xidx:xidx+1];
6549   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6550   //       k -= 2;
6551   //       long op1 = x[j:j+1];
6552   //       long sum = z[k:k+1];
6553   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6554   //       z[k:k+1] = sum;
6555   //    }
6556   //    add_one_64(z, k, carry, tmp_regs);
6557   // }
6558 
6559   const Register carry = tmp5;
6560   const Register sum = tmp3;
6561   const Register op1 = tmp4;
6562   Register op2 = tmp2;
6563 
6564   push(zlen);
6565   push(len);
6566   addl(zlen,2);
6567   bind(L_second_loop);
6568   xorq(carry, carry);
6569   subl(zlen, 4);
6570   subl(len, 2);
6571   push(zlen);
6572   push(len);
6573   cmpl(len, 0);
6574   jccb(Assembler::lessEqual, L_second_loop_exit);
6575 
6576   // Multiply an array by one 64 bit long.
6577   if (UseBMI2Instructions) {
6578     op2 = rdxReg;
6579     movq(op2, Address(x, len, Address::times_4,  0));
6580     rorxq(op2, op2, 32);
6581   }
6582   else {
6583     movq(op2, Address(x, len, Address::times_4,  0));
6584     rorq(op2, 32);
6585   }
6586 
6587   bind(L_third_loop);
6588   decrementl(len);
6589   jccb(Assembler::negative, L_third_loop_exit);
6590   decrementl(len);
6591   jccb(Assembler::negative, L_last_x);
6592 
6593   movq(op1, Address(x, len, Address::times_4,  0));
6594   rorq(op1, 32);
6595 
6596   bind(L_multiply);
6597   subl(zlen, 2);
6598   movq(sum, Address(z, zlen, Address::times_4,  0));
6599 
6600   // Multiply 64 bit by 64 bit; add the lower 64 bits of the product to sum and keep the upper 64 bits as carry.
6601   if (UseBMI2Instructions) {
6602     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6603   }
6604   else {
6605     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6606   }
6607 
6608   movq(Address(z, zlen, Address::times_4, 0), sum);
6609 
6610   jmp(L_third_loop);
6611   bind(L_third_loop_exit);
6612 
6613   // Fourth loop
6614   // Add 64 bit long carry into z with carry propagation.
6615   // Uses offsetted zlen.
6616   add_one_64(z, zlen, carry, tmp1);
6617 
6618   pop(len);
6619   pop(zlen);
6620   jmp(L_second_loop);
6621 
6622   // Next infrequent code is moved outside loops.
6623   bind(L_last_x);
6624   movl(op1, Address(x, 0));
6625   jmp(L_multiply);
6626 
6627   bind(L_second_loop_exit);
6628   pop(len);
6629   pop(zlen);
6630   pop(len);
6631   pop(zlen);
6632 
6633   // Fifth loop
6634   // Shift z left 1 bit.
6635   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6636 
6637   // z[zlen-1] |= x[len-1] & 1;
6638   movl(tmp3, Address(x, len, Address::times_4, -4));
6639   andl(tmp3, 1);
6640   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6641 
6642   pop(tmp5);
6643   pop(tmp4);
6644   pop(tmp3);
6645   pop(tmp2);
6646   pop(tmp1);
6647 }
6648 
6649 /**
6650  * Helper function for mul_add()
6651  * Multiply the in[] by int k and add to out[] starting at offset offs using
6652  * 128 bit by 32 bit multiply and return the carry in tmp5.
6653  * Only the quad-int-aligned (multiple of 4) portion of in[] is processed by this function.
6654  * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
6655  * This function preserves the out, in and k registers.
6656  * len and offset point to the appropriate index in "in" and "out" respectively.
6657  * tmp5 holds the carry.
6658  * other registers are temporary and are modified.
6659  *
6660  */
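// Rough shape of one unrolled iteration below (4 ints of in[] handled as two 64 x 32 bit
// multiply-adds, least significant quadword first so the carry moves toward the more
// significant words; a sketch, not the generated code):
//
//   t = in_lo64 * k + out_lo64 + carry;   out_lo64 = (uint64_t)t;   carry = t >> 64;
//   t = in_hi64 * k + out_hi64 + carry;   out_hi64 = (uint64_t)t;   carry = t >> 64;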
6661 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6662   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6663   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6664 
6665   Label L_first_loop, L_first_loop_exit;
6666 
6667   movl(tmp1, len);
6668   shrl(tmp1, 2);
6669 
6670   bind(L_first_loop);
6671   subl(tmp1, 1);
6672   jccb(Assembler::negative, L_first_loop_exit);
6673 
6674   subl(len, 4);
6675   subl(offset, 4);
6676 
6677   Register op2 = tmp2;
6678   const Register sum = tmp3;
6679   const Register op1 = tmp4;
6680   const Register carry = tmp5;
6681 
6682   if (UseBMI2Instructions) {
6683     op2 = rdxReg;
6684   }
6685 
6686   movq(op1, Address(in, len, Address::times_4,  8));
6687   rorq(op1, 32);
6688   movq(sum, Address(out, offset, Address::times_4,  8));
6689   rorq(sum, 32);
6690   if (UseBMI2Instructions) {
6691     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6692   }
6693   else {
6694     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6695   }
6696   // Store back in big endian from little endian
6697   rorq(sum, 0x20);
6698   movq(Address(out, offset, Address::times_4,  8), sum);
6699 
6700   movq(op1, Address(in, len, Address::times_4,  0));
6701   rorq(op1, 32);
6702   movq(sum, Address(out, offset, Address::times_4,  0));
6703   rorq(sum, 32);
6704   if (UseBMI2Instructions) {
6705     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6706   }
6707   else {
6708     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6709   }
6710   // Store back in big endian from little endian
6711   rorq(sum, 0x20);
6712   movq(Address(out, offset, Address::times_4,  0), sum);
6713 
6714   jmp(L_first_loop);
6715   bind(L_first_loop_exit);
6716 }
6717 
6718 /**
6719  * Code for BigInteger::mulAdd() intrinsic
6720  *
6721  * rdi: out
6722  * rsi: in
6723  * r11: offs (out.length - offset)
6724  * rcx: len
6725  * r8:  k
6726  * r12: tmp1
6727  * r13: tmp2
6728  * r14: tmp3
6729  * r15: tmp4
6730  * rbx: tmp5
6731  * Multiply in[] by the word k, add to out[], and return the carry in rax
6732  */
6733 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6734    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6735    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6736 
6737   Label L_carry, L_last_in, L_done;
6738 
6739 // carry = 0;
6740 // for (int j=len-1; j >= 0; j--) {
6741 //    long product = (in[j] & LONG_MASK) * kLong +
6742 //                   (out[offs] & LONG_MASK) + carry;
6743 //    out[offs--] = (int)product;
6744 //    carry = product >>> 32;
6745 // }
6746 //
6747   push(tmp1);
6748   push(tmp2);
6749   push(tmp3);
6750   push(tmp4);
6751   push(tmp5);
6752 
6753   Register op2 = tmp2;
6754   const Register sum = tmp3;
6755   const Register op1 = tmp4;
6756   const Register carry =  tmp5;
6757 
6758   if (UseBMI2Instructions) {
6759     op2 = rdxReg;
6760     movl(op2, k);
6761   }
6762   else {
6763     movl(op2, k);
6764   }
6765 
6766   xorq(carry, carry);
6767 
6768   //First loop
6769 
6770   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6771   //The carry is in tmp5
6772   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6773 
6774   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6775   decrementl(len);
6776   jccb(Assembler::negative, L_carry);
6777   decrementl(len);
6778   jccb(Assembler::negative, L_last_in);
6779 
6780   movq(op1, Address(in, len, Address::times_4,  0));
6781   rorq(op1, 32);
6782 
6783   subl(offs, 2);
6784   movq(sum, Address(out, offs, Address::times_4,  0));
6785   rorq(sum, 32);
6786 
6787   if (UseBMI2Instructions) {
6788     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6789   }
6790   else {
6791     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6792   }
6793 
6794   // Store back in big endian from little endian
6795   rorq(sum, 0x20);
6796   movq(Address(out, offs, Address::times_4,  0), sum);
6797 
6798   testl(len, len);
6799   jccb(Assembler::zero, L_carry);
6800 
6801   //Multiply the last in[] entry, if any
6802   bind(L_last_in);
6803   movl(op1, Address(in, 0));
6804   movl(sum, Address(out, offs, Address::times_4,  -4));
6805 
6806   movl(raxReg, k);
6807   mull(op1); //tmp4 * eax -> edx:eax
6808   addl(sum, carry);
6809   adcl(rdxReg, 0);
6810   addl(sum, raxReg);
6811   adcl(rdxReg, 0);
6812   movl(carry, rdxReg);
6813 
6814   movl(Address(out, offs, Address::times_4,  -4), sum);
6815 
6816   bind(L_carry);
6817   //return tmp5/carry as carry in rax
6818   movl(rax, carry);
6819 
6820   bind(L_done);
6821   pop(tmp5);
6822   pop(tmp4);
6823   pop(tmp3);
6824   pop(tmp2);
6825   pop(tmp1);
6826 }
6827 #endif
6828 
6829 /**
6830  * Emits code to update CRC-32 with a byte value according to constants in table
6831  *
6832  * @param [in,out]crc   Register containing the crc.
6833  * @param [in]val       Register containing the byte to fold into the CRC.
6834  * @param [in]table     Register containing the table of crc constants.
6835  *
6836  * uint32_t crc;
6837  * val = crc_table[(val ^ crc) & 0xFF];
6838  * crc = val ^ (crc >> 8);
6839  *
6840  */
6841 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6842   xorl(val, crc);
6843   andl(val, 0xFF);
6844   shrl(crc, 8); // unsigned shift
6845   xorl(crc, Address(table, val, Address::times_4, 0));
6846 }
6847 
6848 /**
6849  * Fold 128-bit data chunk
6850  */
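// A sketch of the fold step: xK packs two folding constants (powers of x reduced mod the
// CRC polynomial). In carry-less (GF(2)) arithmetic each call computes roughly
//
//   xcrc' = clmul(xcrc.hi64, K.hi64) ^ clmul(xcrc.lo64, K.lo64) ^ next_128_bits_of_input
//
// which keeps the accumulator congruent to the CRC remainder while consuming 128 bits
// of input per call.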
6851 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
6852   if (UseAVX > 0) {
6853     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
6854     vpclmulldq(xcrc, xK, xcrc); // [63:0]
6855     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
6856     pxor(xcrc, xtmp);
6857   } else {
6858     movdqa(xtmp, xcrc);
6859     pclmulhdq(xtmp, xK);   // [123:64]
6860     pclmulldq(xcrc, xK);   // [63:0]
6861     pxor(xcrc, xtmp);
6862     movdqu(xtmp, Address(buf, offset));
6863     pxor(xcrc, xtmp);
6864   }
6865 }
6866 
6867 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
6868   if (UseAVX > 0) {
6869     vpclmulhdq(xtmp, xK, xcrc);
6870     vpclmulldq(xcrc, xK, xcrc);
6871     pxor(xcrc, xbuf);
6872     pxor(xcrc, xtmp);
6873   } else {
6874     movdqa(xtmp, xcrc);
6875     pclmulhdq(xtmp, xK);
6876     pclmulldq(xcrc, xK);
6877     pxor(xcrc, xbuf);
6878     pxor(xcrc, xtmp);
6879   }
6880 }
6881 
6882 /**
6883  * 8-bit folds to compute 32-bit CRC
6884  *
6885  * uint64_t xcrc;
6886  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
6887  */
6888 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
6889   movdl(tmp, xcrc);
6890   andl(tmp, 0xFF);
6891   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
6892   psrldq(xcrc, 1); // unsigned shift one byte
6893   pxor(xcrc, xtmp);
6894 }
6895 
6896 /**
6897  * uint32_t crc;
6898  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
6899  */
6900 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
6901   movl(tmp, crc);
6902   andl(tmp, 0xFF);
6903   shrl(crc, 8);
6904   xorl(crc, Address(table, tmp, Address::times_4, 0));
6905 }
6906 
6907 /**
6908  * @param crc   register containing existing CRC (32-bit)
6909  * @param buf   register pointing to input byte buffer (byte*)
6910  * @param len   register containing number of bytes
6911  * @param table register that will contain address of CRC table
6912  * @param tmp   scratch register
6913  */
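// Rough flow of the code below: consume bytes one at a time until buf is 16-byte aligned,
// fold 512 bits per iteration across four 128-bit lanes, fold the four lanes down to one,
// reduce the remaining 128 bits to 32 bits with eight 8-bit table folds, and finish any
// trailing bytes with the byte loop.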
6914 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
6915   assert_different_registers(crc, buf, len, table, tmp, rax);
6916 
6917   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
6918   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
6919 
6920   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
6921   // context for the registers used, where all instructions below are using 128-bit mode
6922   // On EVEX without VL and BW, these instructions will all be AVX.
6923   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
6924   notl(crc); // ~crc
6925   cmpl(len, 16);
6926   jcc(Assembler::less, L_tail);
6927 
6928   // Align buffer to 16 bytes
6929   movl(tmp, buf);
6930   andl(tmp, 0xF);
6931   jccb(Assembler::zero, L_aligned);
6932   subl(tmp,  16);
6933   addl(len, tmp);
6934 
6935   align(4);
6936   BIND(L_align_loop);
6937   movsbl(rax, Address(buf, 0)); // load byte with sign extension
6938   update_byte_crc32(crc, rax, table);
6939   increment(buf);
6940   incrementl(tmp);
6941   jccb(Assembler::less, L_align_loop);
6942 
6943   BIND(L_aligned);
6944   movl(tmp, len); // save
6945   shrl(len, 4);
6946   jcc(Assembler::zero, L_tail_restore);
6947 
6948   // Fold crc into first bytes of vector
6949   movdqa(xmm1, Address(buf, 0));
6950   movdl(rax, xmm1);
6951   xorl(crc, rax);
6952   if (VM_Version::supports_sse4_1()) {
6953     pinsrd(xmm1, crc, 0);
6954   } else {
6955     pinsrw(xmm1, crc, 0);
6956     shrl(crc, 16);
6957     pinsrw(xmm1, crc, 1);
6958   }
6959   addptr(buf, 16);
6960   subl(len, 4); // len > 0
6961   jcc(Assembler::less, L_fold_tail);
6962 
6963   movdqa(xmm2, Address(buf,  0));
6964   movdqa(xmm3, Address(buf, 16));
6965   movdqa(xmm4, Address(buf, 32));
6966   addptr(buf, 48);
6967   subl(len, 3);
6968   jcc(Assembler::lessEqual, L_fold_512b);
6969 
6970   // Fold total 512 bits of polynomial on each iteration,
6971   // 128 bits per each of 4 parallel streams.
6972   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
6973 
6974   align32();
6975   BIND(L_fold_512b_loop);
6976   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
6977   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
6978   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
6979   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
6980   addptr(buf, 64);
6981   subl(len, 4);
6982   jcc(Assembler::greater, L_fold_512b_loop);
6983 
6984   // Fold 512 bits to 128 bits.
6985   BIND(L_fold_512b);
6986   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6987   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
6988   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
6989   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
6990 
6991   // Fold the rest of 128 bits data chunks
6992   BIND(L_fold_tail);
6993   addl(len, 3);
6994   jccb(Assembler::lessEqual, L_fold_128b);
6995   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6996 
6997   BIND(L_fold_tail_loop);
6998   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
6999   addptr(buf, 16);
7000   decrementl(len);
7001   jccb(Assembler::greater, L_fold_tail_loop);
7002 
7003   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7004   BIND(L_fold_128b);
7005   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7006   if (UseAVX > 0) {
7007     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7008     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7009     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7010   } else {
7011     movdqa(xmm2, xmm0);
7012     pclmulqdq(xmm2, xmm1, 0x1);
7013     movdqa(xmm3, xmm0);
7014     pand(xmm3, xmm2);
7015     pclmulqdq(xmm0, xmm3, 0x1);
7016   }
7017   psrldq(xmm1, 8);
7018   psrldq(xmm2, 4);
7019   pxor(xmm0, xmm1);
7020   pxor(xmm0, xmm2);
7021 
7022   // 8 8-bit folds to compute 32-bit CRC.
7023   for (int j = 0; j < 4; j++) {
7024     fold_8bit_crc32(xmm0, table, xmm1, rax);
7025   }
7026   movdl(crc, xmm0); // mov 32 bits to general register
7027   for (int j = 0; j < 4; j++) {
7028     fold_8bit_crc32(crc, table, rax);
7029   }
7030 
7031   BIND(L_tail_restore);
7032   movl(len, tmp); // restore
7033   BIND(L_tail);
7034   andl(len, 0xf);
7035   jccb(Assembler::zero, L_exit);
7036 
7037   // Fold the rest of bytes
7038   align(4);
7039   BIND(L_tail_loop);
7040   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7041   update_byte_crc32(crc, rax, table);
7042   increment(buf);
7043   decrementl(len);
7044   jccb(Assembler::greater, L_tail_loop);
7045 
7046   BIND(L_exit);
7047   notl(crc); // ~crc
7048 }
7049 
7050 #ifdef _LP64
7051 // Helper function for AVX 512 CRC32
7052 // Fold 512-bit data chunks
7053 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7054                                              Register pos, int offset) {
7055   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7056   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7057   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7058   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7059   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7060 }
7061 
7062 // Helper function for AVX 512 CRC32
7063 // Compute CRC32 for < 256B buffers
7064 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7065                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7066                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7067 
7068   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7069   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7070   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7071 
7072   // check if there is enough buffer to be able to fold 16B at a time
7073   cmpl(len, 32);
7074   jcc(Assembler::less, L_less_than_32);
7075 
7076   // if there is, load the constants
7077   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7078   movdl(xmm0, crc);                        // get the initial crc value
7079   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7080   pxor(xmm7, xmm0);
7081 
7082   // update the buffer pointer
7083   addl(pos, 16);
7084   // update the counter; subtract 32 instead of 16 to save one instruction from the loop
7085   subl(len, 32);
7086   jmp(L_16B_reduction_loop);
7087 
7088   bind(L_less_than_32);
7089   // mov initial crc to the return value. This is necessary for zero-length buffers.
7090   movl(rax, crc);
7091   testl(len, len);
7092   jcc(Assembler::equal, L_cleanup);
7093 
7094   movdl(xmm0, crc);                        //get the initial crc value
7095 
7096   cmpl(len, 16);
7097   jcc(Assembler::equal, L_exact_16_left);
7098   jcc(Assembler::less, L_less_than_16_left);
7099 
7100   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7101   pxor(xmm7, xmm0);                       //xor the initial crc value
7102   addl(pos, 16);
7103   subl(len, 16);
7104   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7105   jmp(L_get_last_two_xmms);
7106 
7107   bind(L_less_than_16_left);
7108   // use stack space to load data of less than 16 bytes; zero out the 16B in memory first.
7109   pxor(xmm1, xmm1);
7110   movptr(tmp1, rsp);
7111   movdqu(Address(tmp1, 0 * 16), xmm1);
7112 
7113   cmpl(len, 4);
7114   jcc(Assembler::less, L_only_less_than_4);
7115 
7116   //backup the counter value
7117   movl(tmp2, len);
7118   cmpl(len, 8);
7119   jcc(Assembler::less, L_less_than_8_left);
7120 
7121   //load 8 Bytes
7122   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7123   movq(Address(tmp1, 0 * 16), rax);
7124   addptr(tmp1, 8);
7125   subl(len, 8);
7126   addl(pos, 8);
7127 
7128   bind(L_less_than_8_left);
7129   cmpl(len, 4);
7130   jcc(Assembler::less, L_less_than_4_left);
7131 
7132   //load 4 Bytes
7133   movl(rax, Address(buf, pos, Address::times_1, 0));
7134   movl(Address(tmp1, 0 * 16), rax);
7135   addptr(tmp1, 4);
7136   subl(len, 4);
7137   addl(pos, 4);
7138 
7139   bind(L_less_than_4_left);
7140   cmpl(len, 2);
7141   jcc(Assembler::less, L_less_than_2_left);
7142 
7143   // load 2 Bytes
7144   movw(rax, Address(buf, pos, Address::times_1, 0));
7145   movl(Address(tmp1, 0 * 16), rax);
7146   addptr(tmp1, 2);
7147   subl(len, 2);
7148   addl(pos, 2);
7149 
7150   bind(L_less_than_2_left);
7151   cmpl(len, 1);
7152   jcc(Assembler::less, L_zero_left);
7153 
7154   // load 1 Byte
7155   movb(rax, Address(buf, pos, Address::times_1, 0));
7156   movb(Address(tmp1, 0 * 16), rax);
7157 
7158   bind(L_zero_left);
7159   movdqu(xmm7, Address(rsp, 0));
7160   pxor(xmm7, xmm0);                       //xor the initial crc value
7161 
7162   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7163   movdqu(xmm0, Address(rax, tmp2));
7164   pshufb(xmm7, xmm0);
7165   jmp(L_128_done);
7166 
7167   bind(L_exact_16_left);
7168   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7169   pxor(xmm7, xmm0);                       //xor the initial crc value
7170   jmp(L_128_done);
7171 
7172   bind(L_only_less_than_4);
7173   cmpl(len, 3);
7174   jcc(Assembler::less, L_only_less_than_3);
7175 
7176   // load 3 Bytes
7177   movb(rax, Address(buf, pos, Address::times_1, 0));
7178   movb(Address(tmp1, 0), rax);
7179 
7180   movb(rax, Address(buf, pos, Address::times_1, 1));
7181   movb(Address(tmp1, 1), rax);
7182 
7183   movb(rax, Address(buf, pos, Address::times_1, 2));
7184   movb(Address(tmp1, 2), rax);
7185 
7186   movdqu(xmm7, Address(rsp, 0));
7187   pxor(xmm7, xmm0);                     //xor the initial crc value
7188 
7189   pslldq(xmm7, 0x5);
7190   jmp(L_barrett);
7191   bind(L_only_less_than_3);
7192   cmpl(len, 2);
7193   jcc(Assembler::less, L_only_less_than_2);
7194 
7195   // load 2 Bytes
7196   movb(rax, Address(buf, pos, Address::times_1, 0));
7197   movb(Address(tmp1, 0), rax);
7198 
7199   movb(rax, Address(buf, pos, Address::times_1, 1));
7200   movb(Address(tmp1, 1), rax);
7201 
7202   movdqu(xmm7, Address(rsp, 0));
7203   pxor(xmm7, xmm0);                     //xor the initial crc value
7204 
7205   pslldq(xmm7, 0x6);
7206   jmp(L_barrett);
7207 
7208   bind(L_only_less_than_2);
7209   //load 1 Byte
7210   movb(rax, Address(buf, pos, Address::times_1, 0));
7211   movb(Address(tmp1, 0), rax);
7212 
7213   movdqu(xmm7, Address(rsp, 0));
7214   pxor(xmm7, xmm0);                     //xor the initial crc value
7215 
7216   pslldq(xmm7, 0x7);
7217 }
7218 
7219 /**
7220 * Compute CRC32 using AVX512 instructions
7221 * param crc   register containing existing CRC (32-bit)
7222 * param buf   register pointing to input byte buffer (byte*)
7223 * param len   register containing number of bytes
7224 * param table address of crc or crc32c table
7225 * param tmp1  scratch register
7226 * param tmp2  scratch register
7227 * return rax  result register
7228 *
7229 * This routine is identical for crc32c with the exception of the precomputed constant
7230 * table which will be passed as the table argument.  The calculation steps are
7231 * the same for both variants.
7232 */
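// Rough flow: buffers of 256 bytes or more are folded in zmm registers (256B, then 128B,
// at a time), reduced to a single 128-bit value and finished with a Barrett reduction;
// buffers under 256 bytes branch to kernel_crc32_avx512_256B, which shares the 16B
// reduction, last-two-xmms, 128-done and Barrett labels with this routine.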
7233 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7234   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7235 
7236   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7237   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7238   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7239   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7240   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7241 
7242   const Register pos = r12;
7243   push(r12);
7244   subptr(rsp, 16 * 2 + 8);
7245 
7246   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7247   // context for the registers used, where all instructions below are using 128-bit mode
7248   // On EVEX without VL and BW, these instructions will all be AVX.
7249   movl(pos, 0);
7250 
7251   // check if smaller than 256B
7252   cmpl(len, 256);
7253   jcc(Assembler::less, L_less_than_256);
7254 
7255   // load the initial crc value
7256   movdl(xmm10, crc);
7257 
7258   // receive the initial 64B data, xor the initial crc value
7259   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7260   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7261   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7262   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7263 
7264   subl(len, 256);
7265   cmpl(len, 256);
7266   jcc(Assembler::less, L_fold_128_B_loop);
7267 
7268   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7269   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7270   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7271   subl(len, 256);
7272 
7273   bind(L_fold_256_B_loop);
7274   addl(pos, 256);
7275   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7276   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7277   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7278   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7279 
7280   subl(len, 256);
7281   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7282 
7283   // Fold 256 into 128
7284   addl(pos, 256);
7285   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7286   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7287   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7288 
7289   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7290   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7291   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7292 
7293   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7294   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7295 
7296   addl(len, 128);
7297   jmp(L_fold_128_B_register);
7298 
7299   // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The
7300   // fold_128_B_loop below will fold 128B at a time until 128 + y bytes of buffer remain.
7301 
7302   // Fold 128B at a time. This section of the code folds 8 xmm registers in parallel.
7303   bind(L_fold_128_B_loop);
7304   addl(pos, 128);
7305   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7306   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7307 
7308   subl(len, 128);
7309   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7310 
7311   addl(pos, 128);
7312 
7313   // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
7314   // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
7315   bind(L_fold_128_B_register);
7316   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7317   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7318   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7319   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7320   // save last that has no multiplicand
7321   vextracti64x2(xmm7, xmm4, 3);
7322 
7323   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7324   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7325   // Needed later in reduction loop
7326   movdqu(xmm10, Address(table, 1 * 16));
7327   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7328   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7329 
7330   // Swap 1,0,3,2 - 01 00 11 10
7331   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7332   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7333   vextracti128(xmm5, xmm8, 1);
7334   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7335 
7336   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7337   // instead of a cmp instruction, we use the negative flag with the jl instruction
7338   addl(len, 128 - 16);
7339   jcc(Assembler::less, L_final_reduction_for_128);
7340 
7341   bind(L_16B_reduction_loop);
7342   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7343   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7344   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7345   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7346   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7347   addl(pos, 16);
7348   subl(len, 16);
7349   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7350 
7351   bind(L_final_reduction_for_128);
7352   addl(len, 16);
7353   jcc(Assembler::equal, L_128_done);
7354 
7355   bind(L_get_last_two_xmms);
7356   movdqu(xmm2, xmm7);
7357   addl(pos, len);
7358   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7359   subl(pos, len);
7360 
7361   // get rid of the extra data that was loaded before
7362   // load the shift constant
7363   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7364   movdqu(xmm0, Address(rax, len));
7365   addl(rax, len);
7366 
7367   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7368   //Change mask to 512
7369   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7370   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7371 
7372   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7373   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7374   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7375   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7376   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7377 
7378   bind(L_128_done);
7379   // compute crc of a 128-bit value
7380   movdqu(xmm10, Address(table, 3 * 16));
7381   movdqu(xmm0, xmm7);
7382 
7383   // 64b fold
7384   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7385   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7386   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7387 
7388   // 32b fold
7389   movdqu(xmm0, xmm7);
7390   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7391   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7392   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7393   jmp(L_barrett);
7394 
7395   bind(L_less_than_256);
7396   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7397 
7398   //barrett reduction
7399   bind(L_barrett);
7400   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7401   movdqu(xmm1, xmm7);
7402   movdqu(xmm2, xmm7);
7403   movdqu(xmm10, Address(table, 4 * 16));
7404 
7405   pclmulqdq(xmm7, xmm10, 0x0);
7406   pxor(xmm7, xmm2);
7407   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7408   movdqu(xmm2, xmm7);
7409   pclmulqdq(xmm7, xmm10, 0x10);
7410   pxor(xmm7, xmm2);
7411   pxor(xmm7, xmm1);
7412   pextrd(crc, xmm7, 2);
7413 
7414   bind(L_cleanup);
7415   addptr(rsp, 16 * 2 + 8);
7416   pop(r12);
7417 }
7418 
7419 // S. Gueron / Information Processing Letters 112 (2012) 184
7420 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7421 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7422 // Output: the 64-bit carry-less product of B * CONST
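// A sketch of the table lookups performed below (T is the precomputed table, n selects
// the row, B is the 32-bit input):
//
//   Q1 = T[n][ B        & 0xFF];
//   Q2 = T[n][(B >>  8) & 0xFF];
//   Q3 = T[n][(B >> 16) & 0xFF];
//   Q4 = T[n][(B >> 24) & 0xFF];
//   return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);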
7423 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7424                                      Register tmp1, Register tmp2, Register tmp3) {
7425   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7426   if (n > 0) {
7427     addq(tmp3, n * 256 * 8);
7428   }
7429   //    Q1 = TABLEExt[n][B & 0xFF];
7430   movl(tmp1, in);
7431   andl(tmp1, 0x000000FF);
7432   shll(tmp1, 3);
7433   addq(tmp1, tmp3);
7434   movq(tmp1, Address(tmp1, 0));
7435 
7436   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7437   movl(tmp2, in);
7438   shrl(tmp2, 8);
7439   andl(tmp2, 0x000000FF);
7440   shll(tmp2, 3);
7441   addq(tmp2, tmp3);
7442   movq(tmp2, Address(tmp2, 0));
7443 
7444   shlq(tmp2, 8);
7445   xorq(tmp1, tmp2);
7446 
7447   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7448   movl(tmp2, in);
7449   shrl(tmp2, 16);
7450   andl(tmp2, 0x000000FF);
7451   shll(tmp2, 3);
7452   addq(tmp2, tmp3);
7453   movq(tmp2, Address(tmp2, 0));
7454 
7455   shlq(tmp2, 16);
7456   xorq(tmp1, tmp2);
7457 
7458   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7459   shrl(in, 24);
7460   andl(in, 0x000000FF);
7461   shll(in, 3);
7462   addq(in, tmp3);
7463   movq(in, Address(in, 0));
7464 
7465   shlq(in, 24);
7466   xorq(in, tmp1);
7467   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7468 }
7469 
7470 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7471                                       Register in_out,
7472                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7473                                       XMMRegister w_xtmp2,
7474                                       Register tmp1,
7475                                       Register n_tmp2, Register n_tmp3) {
7476   if (is_pclmulqdq_supported) {
7477     movdl(w_xtmp1, in_out); // modified blindly
7478 
7479     movl(tmp1, const_or_pre_comp_const_index);
7480     movdl(w_xtmp2, tmp1);
7481     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7482 
7483     movdq(in_out, w_xtmp1);
7484   } else {
7485     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7486   }
7487 }
7488 
7489 // Recombination Alternative 2: No bit-reflections
7490 // T1 = (CRC_A * U1) << 1
7491 // T2 = (CRC_B * U2) << 1
7492 // C1 = T1 >> 32
7493 // C2 = T2 >> 32
7494 // T1 = T1 & 0xFFFFFFFF
7495 // T2 = T2 & 0xFFFFFFFF
7496 // T1 = CRC32(0, T1)
7497 // T2 = CRC32(0, T2)
7498 // C1 = C1 ^ T1
7499 // C2 = C2 ^ T2
7500 // CRC = C1 ^ C2 ^ CRC_C
7501 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7502                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7503                                      Register tmp1, Register tmp2,
7504                                      Register n_tmp3) {
7505   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7506   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7507   shlq(in_out, 1);
7508   movl(tmp1, in_out);
7509   shrq(in_out, 32);
7510   xorl(tmp2, tmp2);
7511   crc32(tmp2, tmp1, 4);
7512   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7513   shlq(in1, 1);
7514   movl(tmp1, in1);
7515   shrq(in1, 32);
7516   xorl(tmp2, tmp2);
7517   crc32(tmp2, tmp1, 4);
7518   xorl(in1, tmp2);
7519   xorl(in_out, in1);
7520   xorl(in_out, in2);
7521 }
7522 
7523 // Set N to predefined value
7524 // Subtract it from the length of the buffer
7525 // execute in a loop:
7526 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7527 // for i = 1 to N do
7528 //  CRC_A = CRC32(CRC_A, A[i])
7529 //  CRC_B = CRC32(CRC_B, B[i])
7530 //  CRC_C = CRC32(CRC_C, C[i])
7531 // end for
7532 // Recombine
7533 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7534                                        Register in_out1, Register in_out2, Register in_out3,
7535                                        Register tmp1, Register tmp2, Register tmp3,
7536                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7537                                        Register tmp4, Register tmp5,
7538                                        Register n_tmp6) {
7539   Label L_processPartitions;
7540   Label L_processPartition;
7541   Label L_exit;
7542 
7543   bind(L_processPartitions);
7544   cmpl(in_out1, 3 * size);
7545   jcc(Assembler::less, L_exit);
7546     xorl(tmp1, tmp1);
7547     xorl(tmp2, tmp2);
7548     movq(tmp3, in_out2);
7549     addq(tmp3, size);
7550 
7551     bind(L_processPartition);
7552       crc32(in_out3, Address(in_out2, 0), 8);
7553       crc32(tmp1, Address(in_out2, size), 8);
7554       crc32(tmp2, Address(in_out2, size * 2), 8);
7555       addq(in_out2, 8);
7556       cmpq(in_out2, tmp3);
7557       jcc(Assembler::less, L_processPartition);
7558     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7559             w_xtmp1, w_xtmp2, w_xtmp3,
7560             tmp4, tmp5,
7561             n_tmp6);
7562     addq(in_out2, 2 * size);
7563     subl(in_out1, 3 * size);
7564     jmp(L_processPartitions);
7565 
7566   bind(L_exit);
7567 }
7568 #else
7569 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7570                                      Register tmp1, Register tmp2, Register tmp3,
7571                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7572   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7573   if (n > 0) {
7574     addl(tmp3, n * 256 * 8);
7575   }
7576   //    Q1 = TABLEExt[n][B & 0xFF];
7577   movl(tmp1, in_out);
7578   andl(tmp1, 0x000000FF);
7579   shll(tmp1, 3);
7580   addl(tmp1, tmp3);
7581   movq(xtmp1, Address(tmp1, 0));
7582 
7583   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7584   movl(tmp2, in_out);
7585   shrl(tmp2, 8);
7586   andl(tmp2, 0x000000FF);
7587   shll(tmp2, 3);
7588   addl(tmp2, tmp3);
7589   movq(xtmp2, Address(tmp2, 0));
7590 
7591   psllq(xtmp2, 8);
7592   pxor(xtmp1, xtmp2);
7593 
7594   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7595   movl(tmp2, in_out);
7596   shrl(tmp2, 16);
7597   andl(tmp2, 0x000000FF);
7598   shll(tmp2, 3);
7599   addl(tmp2, tmp3);
7600   movq(xtmp2, Address(tmp2, 0));
7601 
7602   psllq(xtmp2, 16);
7603   pxor(xtmp1, xtmp2);
7604 
7605   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7606   shrl(in_out, 24);
7607   andl(in_out, 0x000000FF);
7608   shll(in_out, 3);
7609   addl(in_out, tmp3);
7610   movq(xtmp2, Address(in_out, 0));
7611 
7612   psllq(xtmp2, 24);
7613   pxor(xtmp1, xtmp2); // Result in CXMM
7614   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7615 }
7616 
7617 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7618                                       Register in_out,
7619                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7620                                       XMMRegister w_xtmp2,
7621                                       Register tmp1,
7622                                       Register n_tmp2, Register n_tmp3) {
7623   if (is_pclmulqdq_supported) {
7624     movdl(w_xtmp1, in_out);
7625 
7626     movl(tmp1, const_or_pre_comp_const_index);
7627     movdl(w_xtmp2, tmp1);
7628     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7629     // Keep result in XMM since GPR is 32 bit in length
7630   } else {
7631     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7632   }
7633 }
7634 
7635 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7636                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7637                                      Register tmp1, Register tmp2,
7638                                      Register n_tmp3) {
7639   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7640   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7641 
7642   psllq(w_xtmp1, 1);
7643   movdl(tmp1, w_xtmp1);
7644   psrlq(w_xtmp1, 32);
7645   movdl(in_out, w_xtmp1);
7646 
7647   xorl(tmp2, tmp2);
7648   crc32(tmp2, tmp1, 4);
7649   xorl(in_out, tmp2);
7650 
7651   psllq(w_xtmp2, 1);
7652   movdl(tmp1, w_xtmp2);
7653   psrlq(w_xtmp2, 32);
7654   movdl(in1, w_xtmp2);
7655 
7656   xorl(tmp2, tmp2);
7657   crc32(tmp2, tmp1, 4);
7658   xorl(in1, tmp2);
7659   xorl(in_out, in1);
7660   xorl(in_out, in2);
7661 }
7662 
7663 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7664                                        Register in_out1, Register in_out2, Register in_out3,
7665                                        Register tmp1, Register tmp2, Register tmp3,
7666                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7667                                        Register tmp4, Register tmp5,
7668                                        Register n_tmp6) {
7669   Label L_processPartitions;
7670   Label L_processPartition;
7671   Label L_exit;
7672 
7673   bind(L_processPartitions);
7674   cmpl(in_out1, 3 * size);
7675   jcc(Assembler::less, L_exit);
7676     xorl(tmp1, tmp1);
7677     xorl(tmp2, tmp2);
7678     movl(tmp3, in_out2);
7679     addl(tmp3, size);
7680 
7681     bind(L_processPartition);
7682       crc32(in_out3, Address(in_out2, 0), 4);
7683       crc32(tmp1, Address(in_out2, size), 4);
7684       crc32(tmp2, Address(in_out2, size*2), 4);
7685       crc32(in_out3, Address(in_out2, 0+4), 4);
7686       crc32(tmp1, Address(in_out2, size+4), 4);
7687       crc32(tmp2, Address(in_out2, size*2+4), 4);
7688       addl(in_out2, 8);
7689       cmpl(in_out2, tmp3);
7690       jcc(Assembler::less, L_processPartition);
7691 
7692         push(tmp3);
7693         push(in_out1);
7694         push(in_out2);
7695         tmp4 = tmp3;
7696         tmp5 = in_out1;
7697         n_tmp6 = in_out2;
7698 
7699       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7700             w_xtmp1, w_xtmp2, w_xtmp3,
7701             tmp4, tmp5,
7702             n_tmp6);
7703 
7704         pop(in_out2);
7705         pop(in_out1);
7706         pop(tmp3);
7707 
7708     addl(in_out2, 2 * size);
7709     subl(in_out1, 3 * size);
7710     jmp(L_processPartitions);
7711 
7712   bind(L_exit);
7713 }
7714 #endif //LP64
7715 
7716 #ifdef _LP64
7717 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7718 // Input: A buffer I of L bytes.
7719 // Output: the CRC32C value of the buffer.
7720 // Notations:
7721 // Write L = 24N + r, with N = floor (L/24).
7722 // r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation A|B|C|R, where A, B and C each
// consist of N quadwords, and R consists of r bytes.
7725 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7726 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7727 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7728 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
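//
// A sketch of the high-level flow (illustrative pseudo-code only; the actual
// stream length comes from CRC32C_HIGH/MIDDLE/LOW and the combine step is
// crc32c_rec_alt2):
//
//   crcA = current crc; crcB = 0; crcC = 0;
//   for each 8-byte step j {           // the three streams are interleaved so
//     crcA = CRC32C(crcA, A[j]);       // that the three CRC32 instructions can
//     crcB = CRC32C(crcB, B[j]);       // execute in parallel
//     crcC = CRC32C(crcC, C[j]);
//   }
//   crc = combine(crcA, crcB, crcC);   // PCLMULQDQ-based or table-based folding
//   process the remaining r bytes word-by-word, then byte-by-byte.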
7729 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7730                                           Register tmp1, Register tmp2, Register tmp3,
7731                                           Register tmp4, Register tmp5, Register tmp6,
7732                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7733                                           bool is_pclmulqdq_supported) {
7734   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7735   Label L_wordByWord;
7736   Label L_byteByByteProlog;
7737   Label L_byteByByte;
7738   Label L_exit;
7739 
  if (is_pclmulqdq_supported) {
7741     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7742     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7743 
7744     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7745     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7746 
7747     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7748     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7749     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7750   } else {
7751     const_or_pre_comp_const_index[0] = 1;
7752     const_or_pre_comp_const_index[1] = 0;
7753 
7754     const_or_pre_comp_const_index[2] = 3;
7755     const_or_pre_comp_const_index[3] = 2;
7756 
7757     const_or_pre_comp_const_index[4] = 5;
7758     const_or_pre_comp_const_index[5] = 4;
7759    }
7760   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7761                     in2, in1, in_out,
7762                     tmp1, tmp2, tmp3,
7763                     w_xtmp1, w_xtmp2, w_xtmp3,
7764                     tmp4, tmp5,
7765                     tmp6);
7766   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7767                     in2, in1, in_out,
7768                     tmp1, tmp2, tmp3,
7769                     w_xtmp1, w_xtmp2, w_xtmp3,
7770                     tmp4, tmp5,
7771                     tmp6);
7772   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7773                     in2, in1, in_out,
7774                     tmp1, tmp2, tmp3,
7775                     w_xtmp1, w_xtmp2, w_xtmp3,
7776                     tmp4, tmp5,
7777                     tmp6);
7778   movl(tmp1, in2);
7779   andl(tmp1, 0x00000007);
7780   negl(tmp1);
7781   addl(tmp1, in2);
7782   addq(tmp1, in1);
7783 
7784   BIND(L_wordByWord);
7785   cmpq(in1, tmp1);
7786   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7787     crc32(in_out, Address(in1, 0), 4);
7788     addq(in1, 4);
7789     jmp(L_wordByWord);
7790 
7791   BIND(L_byteByByteProlog);
7792   andl(in2, 0x00000007);
7793   movl(tmp2, 1);
7794 
7795   BIND(L_byteByByte);
7796   cmpl(tmp2, in2);
7797   jccb(Assembler::greater, L_exit);
7798     crc32(in_out, Address(in1, 0), 1);
7799     incq(in1);
7800     incl(tmp2);
7801     jmp(L_byteByByte);
7802 
7803   BIND(L_exit);
7804 }
7805 #else
7806 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7807                                           Register tmp1, Register  tmp2, Register tmp3,
7808                                           Register tmp4, Register  tmp5, Register tmp6,
7809                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7810                                           bool is_pclmulqdq_supported) {
7811   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7812   Label L_wordByWord;
7813   Label L_byteByByteProlog;
7814   Label L_byteByByte;
7815   Label L_exit;
7816 
7817   if (is_pclmulqdq_supported) {
7818     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7819     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7820 
7821     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7822     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7823 
7824     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7825     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7826   } else {
7827     const_or_pre_comp_const_index[0] = 1;
7828     const_or_pre_comp_const_index[1] = 0;
7829 
7830     const_or_pre_comp_const_index[2] = 3;
7831     const_or_pre_comp_const_index[3] = 2;
7832 
7833     const_or_pre_comp_const_index[4] = 5;
7834     const_or_pre_comp_const_index[5] = 4;
7835   }
7836   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7837                     in2, in1, in_out,
7838                     tmp1, tmp2, tmp3,
7839                     w_xtmp1, w_xtmp2, w_xtmp3,
7840                     tmp4, tmp5,
7841                     tmp6);
7842   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7843                     in2, in1, in_out,
7844                     tmp1, tmp2, tmp3,
7845                     w_xtmp1, w_xtmp2, w_xtmp3,
7846                     tmp4, tmp5,
7847                     tmp6);
7848   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7849                     in2, in1, in_out,
7850                     tmp1, tmp2, tmp3,
7851                     w_xtmp1, w_xtmp2, w_xtmp3,
7852                     tmp4, tmp5,
7853                     tmp6);
7854   movl(tmp1, in2);
7855   andl(tmp1, 0x00000007);
7856   negl(tmp1);
7857   addl(tmp1, in2);
7858   addl(tmp1, in1);
7859 
7860   BIND(L_wordByWord);
7861   cmpl(in1, tmp1);
7862   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7863     crc32(in_out, Address(in1,0), 4);
7864     addl(in1, 4);
7865     jmp(L_wordByWord);
7866 
7867   BIND(L_byteByByteProlog);
7868   andl(in2, 0x00000007);
7869   movl(tmp2, 1);
7870 
7871   BIND(L_byteByByte);
7872   cmpl(tmp2, in2);
7873   jccb(Assembler::greater, L_exit);
7874     movb(tmp1, Address(in1, 0));
7875     crc32(in_out, tmp1, 1);
7876     incl(in1);
7877     incl(tmp2);
7878     jmp(L_byteByByte);
7879 
7880   BIND(L_exit);
7881 }
7882 #endif // LP64
7883 #undef BIND
7884 #undef BLOCK_COMMENT
7885 
7886 // Compress char[] array to byte[].
7887 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
7888 //   @IntrinsicCandidate
7889 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
7890 //     for (int i = 0; i < len; i++) {
7891 //       int c = src[srcOff++];
7892 //       if (c >>> 8 != 0) {
7893 //         return 0;
7894 //       }
7895 //       dst[dstOff++] = (byte)c;
7896 //     }
7897 //     return len;
7898 //   }
7899 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
7900   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7901   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7902   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
7903   Label copy_chars_loop, return_length, return_zero, done;
7904 
7905   // rsi: src
7906   // rdi: dst
7907   // rdx: len
7908   // rcx: tmp5
7909   // rax: result
7910 
7911   // rsi holds start addr of source char[] to be compressed
7912   // rdi holds start addr of destination byte[]
7913   // rdx holds length
7914 
  assert(len != result, "len and result registers must differ");
7916 
7917   // save length for return
7918   push(len);
7919 
7920   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
7921     VM_Version::supports_avx512vlbw() &&
7922     VM_Version::supports_bmi2()) {
7923 
7924     Label copy_32_loop, copy_loop_tail, below_threshold;
7925 
7926     // alignment
7927     Label post_alignment;
7928 
    // If the length of the string is less than 32, handle it the old-fashioned way
7930     testl(len, -32);
7931     jcc(Assembler::zero, below_threshold);
7932 
    // First check whether a character is compressible (<= 0xFF).
    // Create a mask to test for Unicode chars inside the zmm vector.
7935     movl(result, 0x00FF);
7936     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
7937 
7938     testl(len, -64);
7939     jcc(Assembler::zero, post_alignment);
7940 
7941     movl(tmp5, dst);
7942     andl(tmp5, (32 - 1));
7943     negl(tmp5);
7944     andl(tmp5, (32 - 1));
7945 
7946     // bail out when there is nothing to be done
7947     testl(tmp5, 0xFFFFFFFF);
7948     jcc(Assembler::zero, post_alignment);
7949 
7950     // ~(~0 << len), where len is the # of remaining elements to process
7951     movl(result, 0xFFFFFFFF);
7952     shlxl(result, result, tmp5);
7953     notl(result);
7954     kmovdl(mask2, result);
7955 
7956     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
7957     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
7958     ktestd(mask1, mask2);
7959     jcc(Assembler::carryClear, return_zero);
7960 
7961     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
7962 
7963     addptr(src, tmp5);
7964     addptr(src, tmp5);
7965     addptr(dst, tmp5);
7966     subl(len, tmp5);
7967 
7968     bind(post_alignment);
7969     // end of alignment
7970 
7971     movl(tmp5, len);
7972     andl(tmp5, (32 - 1));    // tail count (in chars)
7973     andl(len, ~(32 - 1));    // vector count (in chars)
7974     jcc(Assembler::zero, copy_loop_tail);
7975 
7976     lea(src, Address(src, len, Address::times_2));
7977     lea(dst, Address(dst, len, Address::times_1));
7978     negptr(len);
7979 
7980     bind(copy_32_loop);
7981     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
7982     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
7983     kortestdl(mask1, mask1);
7984     jcc(Assembler::carryClear, return_zero);
7985 
    // All elements in the currently processed chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
7988     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
7989     addptr(len, 32);
7990     jcc(Assembler::notZero, copy_32_loop);
7991 
7992     bind(copy_loop_tail);
7993     // bail out when there is nothing to be done
7994     testl(tmp5, 0xFFFFFFFF);
7995     jcc(Assembler::zero, return_length);
7996 
7997     movl(len, tmp5);
7998 
7999     // ~(~0 << len), where len is the # of remaining elements to process
8000     movl(result, 0xFFFFFFFF);
8001     shlxl(result, result, len);
8002     notl(result);
8003 
8004     kmovdl(mask2, result);
8005 
8006     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8007     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8008     ktestd(mask1, mask2);
8009     jcc(Assembler::carryClear, return_zero);
8010 
8011     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8012     jmp(return_length);
8013 
8014     bind(below_threshold);
8015   }
8016 
8017   if (UseSSE42Intrinsics) {
8018     Label copy_32_loop, copy_16, copy_tail;
8019 
8020     movl(result, len);
8021 
8022     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8023 
8024     // vectored compression
8025     andl(len, 0xfffffff0);    // vector count (in chars)
8026     andl(result, 0x0000000f);    // tail count (in chars)
8027     testl(len, len);
8028     jcc(Assembler::zero, copy_16);
8029 
8030     // compress 16 chars per iter
8031     movdl(tmp1Reg, tmp5);
8032     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8033     pxor(tmp4Reg, tmp4Reg);
8034 
8035     lea(src, Address(src, len, Address::times_2));
8036     lea(dst, Address(dst, len, Address::times_1));
8037     negptr(len);
8038 
8039     bind(copy_32_loop);
8040     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8041     por(tmp4Reg, tmp2Reg);
8042     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8043     por(tmp4Reg, tmp3Reg);
8044     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8045     jcc(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only Latin-1 chars; compress each to 1 byte
8047     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8048     addptr(len, 16);
8049     jcc(Assembler::notZero, copy_32_loop);
8050 
8051     // compress next vector of 8 chars (if any)
8052     bind(copy_16);
8053     movl(len, result);
8054     andl(len, 0xfffffff8);    // vector count (in chars)
8055     andl(result, 0x00000007);    // tail count (in chars)
8056     testl(len, len);
8057     jccb(Assembler::zero, copy_tail);
8058 
8059     movdl(tmp1Reg, tmp5);
8060     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8061     pxor(tmp3Reg, tmp3Reg);
8062 
8063     movdqu(tmp2Reg, Address(src, 0));
8064     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8065     jccb(Assembler::notZero, return_zero);
8066     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8067     movq(Address(dst, 0), tmp2Reg);
8068     addptr(src, 16);
8069     addptr(dst, 8);
8070 
8071     bind(copy_tail);
8072     movl(len, result);
8073   }
8074   // compress 1 char per iter
8075   testl(len, len);
8076   jccb(Assembler::zero, return_length);
8077   lea(src, Address(src, len, Address::times_2));
8078   lea(dst, Address(dst, len, Address::times_1));
8079   negptr(len);
8080 
8081   bind(copy_chars_loop);
8082   load_unsigned_short(result, Address(src, len, Address::times_2));
8083   testl(result, 0xff00);      // check if Unicode char
8084   jccb(Assembler::notZero, return_zero);
  movb(Address(dst, len, Address::times_1), result);  // Latin-1 char; compress to 1 byte
8086   increment(len);
8087   jcc(Assembler::notZero, copy_chars_loop);
8088 
8089   // if compression succeeded, return length
8090   bind(return_length);
8091   pop(result);
8092   jmpb(done);
8093 
8094   // if compression failed, return 0
8095   bind(return_zero);
8096   xorl(result, result);
8097   addptr(rsp, wordSize);
8098 
8099   bind(done);
8100 }
8101 
8102 // Inflate byte[] array to char[].
8103 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8104 //   @IntrinsicCandidate
8105 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8106 //     for (int i = 0; i < len; i++) {
8107 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8108 //     }
8109 //   }
8110 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8111   XMMRegister tmp1, Register tmp2, KRegister mask) {
8112   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8113   // rsi: src
8114   // rdi: dst
8115   // rdx: len
8116   // rcx: tmp2
8117 
8118   // rsi holds start addr of source byte[] to be inflated
8119   // rdi holds start addr of destination char[]
8120   // rdx holds length
8121   assert_different_registers(src, dst, len, tmp2);
8122   movl(tmp2, len);
8123   if ((UseAVX > 2) && // AVX512
8124     VM_Version::supports_avx512vlbw() &&
8125     VM_Version::supports_bmi2()) {
8126 
8127     Label copy_32_loop, copy_tail;
8128     Register tmp3_aliased = len;
8129 
    // If the length of the string is less than 16, handle it the old-fashioned way
8131     testl(len, -16);
8132     jcc(Assembler::zero, below_threshold);
8133 
8134     testl(len, -1 * AVX3Threshold);
8135     jcc(Assembler::zero, avx3_threshold);
8136 
    // Pre-calculate here so that the main loop below needs only a single
    // arithmetic operation (the induction-variable update) per iteration
8139     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8140     andl(len, -32);     // vector count
8141     jccb(Assembler::zero, copy_tail);
8142 
8143     lea(src, Address(src, len, Address::times_1));
8144     lea(dst, Address(dst, len, Address::times_2));
8145     negptr(len);
8146 
8147 
8148     // inflate 32 chars per iter
8149     bind(copy_32_loop);
8150     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8151     evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
8152     addptr(len, 32);
8153     jcc(Assembler::notZero, copy_32_loop);
8154 
8155     bind(copy_tail);
8156     // bail out when there is nothing to be done
8157     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8158     jcc(Assembler::zero, done);
8159 
8160     // ~(~0 << length), where length is the # of remaining elements to process
8161     movl(tmp3_aliased, -1);
8162     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8163     notl(tmp3_aliased);
8164     kmovdl(mask, tmp3_aliased);
8165     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8166     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8167 
8168     jmp(done);
8169     bind(avx3_threshold);
8170   }
8171   if (UseSSE42Intrinsics) {
8172     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8173 
8174     if (UseAVX > 1) {
8175       andl(tmp2, (16 - 1));
8176       andl(len, -16);
8177       jccb(Assembler::zero, copy_new_tail);
8178     } else {
8179       andl(tmp2, 0x00000007);   // tail count (in chars)
8180       andl(len, 0xfffffff8);    // vector count (in chars)
8181       jccb(Assembler::zero, copy_tail);
8182     }
8183 
8184     // vectored inflation
8185     lea(src, Address(src, len, Address::times_1));
8186     lea(dst, Address(dst, len, Address::times_2));
8187     negptr(len);
8188 
8189     if (UseAVX > 1) {
8190       bind(copy_16_loop);
8191       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8192       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8193       addptr(len, 16);
8194       jcc(Assembler::notZero, copy_16_loop);
8195 
8196       bind(below_threshold);
8197       bind(copy_new_tail);
8198       movl(len, tmp2);
8199       andl(tmp2, 0x00000007);
8200       andl(len, 0xFFFFFFF8);
8201       jccb(Assembler::zero, copy_tail);
8202 
8203       pmovzxbw(tmp1, Address(src, 0));
8204       movdqu(Address(dst, 0), tmp1);
8205       addptr(src, 8);
8206       addptr(dst, 2 * 8);
8207 
8208       jmp(copy_tail, true);
8209     }
8210 
8211     // inflate 8 chars per iter
8212     bind(copy_8_loop);
8213     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8214     movdqu(Address(dst, len, Address::times_2), tmp1);
8215     addptr(len, 8);
8216     jcc(Assembler::notZero, copy_8_loop);
8217 
8218     bind(copy_tail);
8219     movl(len, tmp2);
8220 
8221     cmpl(len, 4);
8222     jccb(Assembler::less, copy_bytes);
8223 
8224     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8225     pmovzxbw(tmp1, tmp1);
8226     movq(Address(dst, 0), tmp1);
8227     subptr(len, 4);
8228     addptr(src, 4);
8229     addptr(dst, 8);
8230 
8231     bind(copy_bytes);
8232   } else {
8233     bind(below_threshold);
8234   }
8235 
8236   testl(len, len);
8237   jccb(Assembler::zero, done);
8238   lea(src, Address(src, len, Address::times_1));
8239   lea(dst, Address(dst, len, Address::times_2));
8240   negptr(len);
8241 
8242   // inflate 1 char per iter
8243   bind(copy_chars_loop);
8244   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8245   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8246   increment(len);
8247   jcc(Assembler::notZero, copy_chars_loop);
8248 
8249   bind(done);
8250 }
8251 
8252 
8253 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8254   switch(type) {
8255     case T_BYTE:
8256     case T_BOOLEAN:
8257       evmovdqub(dst, kmask, src, false, vector_len);
8258       break;
8259     case T_CHAR:
8260     case T_SHORT:
8261       evmovdquw(dst, kmask, src, false, vector_len);
8262       break;
8263     case T_INT:
8264     case T_FLOAT:
8265       evmovdqul(dst, kmask, src, false, vector_len);
8266       break;
8267     case T_LONG:
8268     case T_DOUBLE:
8269       evmovdquq(dst, kmask, src, false, vector_len);
8270       break;
8271     default:
8272       fatal("Unexpected type argument %s", type2name(type));
8273       break;
8274   }
8275 }
8276 
8277 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8278   switch(type) {
8279     case T_BYTE:
8280     case T_BOOLEAN:
8281       evmovdqub(dst, kmask, src, true, vector_len);
8282       break;
8283     case T_CHAR:
8284     case T_SHORT:
8285       evmovdquw(dst, kmask, src, true, vector_len);
8286       break;
8287     case T_INT:
8288     case T_FLOAT:
8289       evmovdqul(dst, kmask, src, true, vector_len);
8290       break;
8291     case T_LONG:
8292     case T_DOUBLE:
8293       evmovdquq(dst, kmask, src, true, vector_len);
8294       break;
8295     default:
8296       fatal("Unexpected type argument %s", type2name(type));
8297       break;
8298   }
8299 }
8300 
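// Bitwise NOT of a vector mask. For mask lengths below 8 the knot instruction
// still flips all bits of the k-register, so the result is ANDed with
// (1 << masklen) - 1 (i.e. 3 or 15) to clear the lanes beyond masklen.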
8301 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8302   switch(masklen) {
8303     case 2:
8304        knotbl(dst, src);
8305        movl(rtmp, 3);
8306        kmovbl(ktmp, rtmp);
8307        kandbl(dst, ktmp, dst);
8308        break;
8309     case 4:
8310        knotbl(dst, src);
8311        movl(rtmp, 15);
8312        kmovbl(ktmp, rtmp);
8313        kandbl(dst, ktmp, dst);
8314        break;
8315     case 8:
8316        knotbl(dst, src);
8317        break;
8318     case 16:
8319        knotwl(dst, src);
8320        break;
8321     case 32:
8322        knotdl(dst, src);
8323        break;
8324     case 64:
8325        knotql(dst, src);
8326        break;
8327     default:
8328       fatal("Unexpected vector length %d", masklen);
8329       break;
8330   }
8331 }
8332 
8333 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8334   switch(type) {
8335     case T_BOOLEAN:
8336     case T_BYTE:
8337        kandbl(dst, src1, src2);
8338        break;
8339     case T_CHAR:
8340     case T_SHORT:
8341        kandwl(dst, src1, src2);
8342        break;
8343     case T_INT:
8344     case T_FLOAT:
8345        kanddl(dst, src1, src2);
8346        break;
8347     case T_LONG:
8348     case T_DOUBLE:
8349        kandql(dst, src1, src2);
8350        break;
8351     default:
8352       fatal("Unexpected type argument %s", type2name(type));
8353       break;
8354   }
8355 }
8356 
8357 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8358   switch(type) {
8359     case T_BOOLEAN:
8360     case T_BYTE:
8361        korbl(dst, src1, src2);
8362        break;
8363     case T_CHAR:
8364     case T_SHORT:
8365        korwl(dst, src1, src2);
8366        break;
8367     case T_INT:
8368     case T_FLOAT:
8369        kordl(dst, src1, src2);
8370        break;
8371     case T_LONG:
8372     case T_DOUBLE:
8373        korql(dst, src1, src2);
8374        break;
8375     default:
8376       fatal("Unexpected type argument %s", type2name(type));
8377       break;
8378   }
8379 }
8380 
8381 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8382   switch(type) {
8383     case T_BOOLEAN:
8384     case T_BYTE:
8385        kxorbl(dst, src1, src2);
8386        break;
8387     case T_CHAR:
8388     case T_SHORT:
8389        kxorwl(dst, src1, src2);
8390        break;
8391     case T_INT:
8392     case T_FLOAT:
8393        kxordl(dst, src1, src2);
8394        break;
8395     case T_LONG:
8396     case T_DOUBLE:
8397        kxorql(dst, src1, src2);
8398        break;
8399     default:
8400       fatal("Unexpected type argument %s", type2name(type));
8401       break;
8402   }
8403 }
8404 
8405 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8406   switch(type) {
8407     case T_BOOLEAN:
8408     case T_BYTE:
8409       evpermb(dst, mask, nds, src, merge, vector_len); break;
8410     case T_CHAR:
8411     case T_SHORT:
8412       evpermw(dst, mask, nds, src, merge, vector_len); break;
8413     case T_INT:
8414     case T_FLOAT:
8415       evpermd(dst, mask, nds, src, merge, vector_len); break;
8416     case T_LONG:
8417     case T_DOUBLE:
8418       evpermq(dst, mask, nds, src, merge, vector_len); break;
8419     default:
8420       fatal("Unexpected type argument %s", type2name(type)); break;
8421   }
8422 }
8423 
8424 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8425   switch(type) {
8426     case T_BOOLEAN:
8427     case T_BYTE:
8428       evpermb(dst, mask, nds, src, merge, vector_len); break;
8429     case T_CHAR:
8430     case T_SHORT:
8431       evpermw(dst, mask, nds, src, merge, vector_len); break;
8432     case T_INT:
8433     case T_FLOAT:
8434       evpermd(dst, mask, nds, src, merge, vector_len); break;
8435     case T_LONG:
8436     case T_DOUBLE:
8437       evpermq(dst, mask, nds, src, merge, vector_len); break;
8438     default:
8439       fatal("Unexpected type argument %s", type2name(type)); break;
8440   }
8441 }
8442 
8443 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8444   switch(type) {
8445     case T_BYTE:
8446       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8447     case T_SHORT:
8448       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8449     case T_INT:
8450       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8451     case T_LONG:
8452       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8453     default:
8454       fatal("Unexpected type argument %s", type2name(type)); break;
8455   }
8456 }
8457 
8458 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8459   switch(type) {
8460     case T_BYTE:
8461       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8462     case T_SHORT:
8463       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8464     case T_INT:
8465       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8466     case T_LONG:
8467       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8468     default:
8469       fatal("Unexpected type argument %s", type2name(type)); break;
8470   }
8471 }
8472 
8473 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8474   switch(type) {
8475     case T_BYTE:
8476       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8477     case T_SHORT:
8478       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8479     case T_INT:
8480       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8481     case T_LONG:
8482       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8483     default:
8484       fatal("Unexpected type argument %s", type2name(type)); break;
8485   }
8486 }
8487 
8488 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8489   switch(type) {
8490     case T_BYTE:
8491       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8492     case T_SHORT:
8493       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8494     case T_INT:
8495       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8496     case T_LONG:
8497       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8498     default:
8499       fatal("Unexpected type argument %s", type2name(type)); break;
8500   }
8501 }
8502 
8503 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8504   switch(type) {
8505     case T_INT:
8506       evpxord(dst, mask, nds, src, merge, vector_len); break;
8507     case T_LONG:
8508       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8509     default:
8510       fatal("Unexpected type argument %s", type2name(type)); break;
8511   }
8512 }
8513 
8514 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8515   switch(type) {
8516     case T_INT:
8517       evpxord(dst, mask, nds, src, merge, vector_len); break;
8518     case T_LONG:
8519       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8520     default:
8521       fatal("Unexpected type argument %s", type2name(type)); break;
8522   }
8523 }
8524 
8525 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8526   switch(type) {
8527     case T_INT:
8528       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8529     case T_LONG:
8530       evporq(dst, mask, nds, src, merge, vector_len); break;
8531     default:
8532       fatal("Unexpected type argument %s", type2name(type)); break;
8533   }
8534 }
8535 
8536 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8537   switch(type) {
8538     case T_INT:
8539       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8540     case T_LONG:
8541       evporq(dst, mask, nds, src, merge, vector_len); break;
8542     default:
8543       fatal("Unexpected type argument %s", type2name(type)); break;
8544   }
8545 }
8546 
8547 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8548   switch(type) {
8549     case T_INT:
8550       evpandd(dst, mask, nds, src, merge, vector_len); break;
8551     case T_LONG:
8552       evpandq(dst, mask, nds, src, merge, vector_len); break;
8553     default:
8554       fatal("Unexpected type argument %s", type2name(type)); break;
8555   }
8556 }
8557 
8558 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8559   switch(type) {
8560     case T_INT:
8561       evpandd(dst, mask, nds, src, merge, vector_len); break;
8562     case T_LONG:
8563       evpandq(dst, mask, nds, src, merge, vector_len); break;
8564     default:
8565       fatal("Unexpected type argument %s", type2name(type)); break;
8566   }
8567 }
8568 
8569 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) {
8570    masklen = masklen < 8 ? 8 : masklen;
8571    ktest(masklen, src1, src2);
8572    setb(Assembler::notZero, dst);
8573    movzbl(dst, dst);
8574 }
8575 
8576 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) {
8577   if (masklen < 8) {
8578     knotbl(kscratch, src2);
8579     kortestbl(src1, kscratch);
8580     setb(Assembler::carrySet, dst);
8581     movzbl(dst, dst);
8582   } else {
8583     ktest(masklen, src1, src2);
8584     setb(Assembler::carrySet, dst);
8585     movzbl(dst, dst);
8586   }
8587 }
8588 
8589 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
8590   switch(masklen) {
8591     case 8:
8592        kortestbl(src1, src2);
8593        break;
8594     case 16:
8595        kortestwl(src1, src2);
8596        break;
8597     case 32:
8598        kortestdl(src1, src2);
8599        break;
8600     case 64:
8601        kortestql(src1, src2);
8602        break;
8603     default:
8604       fatal("Unexpected mask length %d", masklen);
8605       break;
8606   }
8607 }
8608 
8609 
8610 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
8611   switch(masklen)  {
8612     case 8:
8613        ktestbl(src1, src2);
8614        break;
8615     case 16:
8616        ktestwl(src1, src2);
8617        break;
8618     case 32:
8619        ktestdl(src1, src2);
8620        break;
8621     case 64:
8622        ktestql(src1, src2);
8623        break;
8624     default:
8625       fatal("Unexpected mask length %d", masklen);
8626       break;
8627   }
8628 }
8629 
8630 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8631   switch(type) {
8632     case T_INT:
8633       evprold(dst, mask, src, shift, merge, vlen_enc); break;
8634     case T_LONG:
8635       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
8636     default:
8637       fatal("Unexpected type argument %s", type2name(type)); break;
8639   }
8640 }
8641 
8642 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8643   switch(type) {
8644     case T_INT:
8645       evprord(dst, mask, src, shift, merge, vlen_enc); break;
8646     case T_LONG:
8647       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
8648     default:
8649       fatal("Unexpected type argument %s", type2name(type)); break;
8650   }
8651 }
8652 
8653 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8654   switch(type) {
8655     case T_INT:
8656       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
8657     case T_LONG:
8658       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
8659     default:
8660       fatal("Unexpected type argument %s", type2name(type)); break;
8661   }
8662 }
8663 
8664 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8665   switch(type) {
8666     case T_INT:
8667       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
8668     case T_LONG:
8669       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
8670     default:
8671       fatal("Unexpected type argument %s", type2name(type)); break;
8672   }
8673 }
8674 #if COMPILER2_OR_JVMCI
8675 
8676 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
8677                                  Register length, Register temp, int vec_enc) {
8678   // Computing mask for predicated vector store.
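  // For example (hypothetical values): with length = 5, bzhi below leaves the
  // low five bits of temp set (0b11111), so the k-mask selects exactly the
  // first five elements of type bt for the masked store.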
8679   movptr(temp, -1);
8680   bzhiq(temp, temp, length);
8681   kmov(mask, temp);
8682   evmovdqu(bt, mask, dst, xmm, vec_enc);
8683 }
8684 
// Memory fill (memset-style) operation for a length of less than 64 bytes.
8686 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
8687                                        XMMRegister xmm, KRegister mask, Register length,
8688                                        Register temp, bool use64byteVector) {
8689   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8690   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8691   if (!use64byteVector) {
8692     fill32(dst, disp, xmm);
8693     subptr(length, 32 >> shift);
8694     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
8695   } else {
8696     assert(MaxVectorSize == 64, "vector length != 64");
8697     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
8698   }
8699 }
8700 
8701 
8702 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
8703                                        XMMRegister xmm, KRegister mask, Register length,
8704                                        Register temp) {
8705   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8706   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8707   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
8708 }
8709 
8710 
8711 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
8712   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8713   vmovdqu(Address(dst, disp), xmm);
8714 }
8715 
8716 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8717   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8718   BasicType type[] = {T_BYTE,  T_SHORT,  T_INT,   T_LONG};
8719   if (!use64byteVector) {
8720     fill32(dst, disp, xmm);
8721     fill32(dst, disp + 32, xmm);
8722   } else {
8723     evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
8724   }
8725 }
8726 
8727 #ifdef _LP64
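// Fill (memset-style) helper using AVX2/AVX-512 masked stores. The fill value
// is first broadcast across a vector register according to `type`. Counts of
// up to 128 bytes (192 bytes in the ZMM sequence) are handled by a chain of
// plain and masked 32/64-byte stores; larger counts first align the
// destination to a 32-byte (64-byte for ZMM) boundary with a masked byte
// store and then run an unrolled 128-byte (192-byte) loop, finishing the
// remainder by jumping back to the small-count dispatch.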
8728 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
8729                                         Register count, Register rtmp, XMMRegister xtmp) {
8730   Label L_exit;
8731   Label L_fill_start;
8732   Label L_fill_64_bytes;
8733   Label L_fill_96_bytes;
8734   Label L_fill_128_bytes;
8735   Label L_fill_128_bytes_loop;
8736   Label L_fill_128_loop_header;
8737   Label L_fill_128_bytes_loop_header;
8738   Label L_fill_128_bytes_loop_pre_header;
8739   Label L_fill_zmm_sequence;
8740 
8741   int shift = -1;
8742   int avx3threshold = VM_Version::avx3_threshold();
8743   switch(type) {
8744     case T_BYTE:  shift = 0;
8745       break;
8746     case T_SHORT: shift = 1;
8747       break;
8748     case T_INT:   shift = 2;
8749       break;
8750     /* Uncomment when LONG fill stubs are supported.
8751     case T_LONG:  shift = 3;
8752       break;
8753     */
8754     default:
8755       fatal("Unhandled type: %s\n", type2name(type));
8756   }
8757 
8758   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
8759 
8760     if (MaxVectorSize == 64) {
8761       cmpq(count, avx3threshold >> shift);
8762       jcc(Assembler::greater, L_fill_zmm_sequence);
8763     }
8764 
8765     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
8766 
8767     bind(L_fill_start);
8768 
8769     cmpq(count, 32 >> shift);
8770     jccb(Assembler::greater, L_fill_64_bytes);
8771     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
8772     jmp(L_exit);
8773 
8774     bind(L_fill_64_bytes);
8775     cmpq(count, 64 >> shift);
8776     jccb(Assembler::greater, L_fill_96_bytes);
8777     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
8778     jmp(L_exit);
8779 
8780     bind(L_fill_96_bytes);
8781     cmpq(count, 96 >> shift);
8782     jccb(Assembler::greater, L_fill_128_bytes);
8783     fill64(to, 0, xtmp);
8784     subq(count, 64 >> shift);
8785     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
8786     jmp(L_exit);
8787 
8788     bind(L_fill_128_bytes);
8789     cmpq(count, 128 >> shift);
8790     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
8791     fill64(to, 0, xtmp);
8792     fill32(to, 64, xtmp);
8793     subq(count, 96 >> shift);
8794     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
8795     jmp(L_exit);
8796 
8797     bind(L_fill_128_bytes_loop_pre_header);
8798     {
8799       mov(rtmp, to);
8800       andq(rtmp, 31);
8801       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
8802       negq(rtmp);
8803       addq(rtmp, 32);
8804       mov64(r8, -1L);
8805       bzhiq(r8, r8, rtmp);
8806       kmovql(k2, r8);
8807       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit);
8808       addq(to, rtmp);
8809       shrq(rtmp, shift);
8810       subq(count, rtmp);
8811     }
8812 
8813     cmpq(count, 128 >> shift);
8814     jcc(Assembler::less, L_fill_start);
8815 
8816     bind(L_fill_128_bytes_loop_header);
8817     subq(count, 128 >> shift);
8818 
8819     align32();
8820     bind(L_fill_128_bytes_loop);
8821       fill64(to, 0, xtmp);
8822       fill64(to, 64, xtmp);
8823       addq(to, 128);
8824       subq(count, 128 >> shift);
8825       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
8826 
8827     addq(count, 128 >> shift);
8828     jcc(Assembler::zero, L_exit);
8829     jmp(L_fill_start);
8830   }
8831 
8832   if (MaxVectorSize == 64) {
8833     // Sequence using 64 byte ZMM register.
8834     Label L_fill_128_bytes_zmm;
8835     Label L_fill_192_bytes_zmm;
8836     Label L_fill_192_bytes_loop_zmm;
8837     Label L_fill_192_bytes_loop_header_zmm;
8838     Label L_fill_192_bytes_loop_pre_header_zmm;
8839     Label L_fill_start_zmm_sequence;
8840 
8841     bind(L_fill_zmm_sequence);
8842     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
8843 
8844     bind(L_fill_start_zmm_sequence);
8845     cmpq(count, 64 >> shift);
8846     jccb(Assembler::greater, L_fill_128_bytes_zmm);
8847     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
8848     jmp(L_exit);
8849 
8850     bind(L_fill_128_bytes_zmm);
8851     cmpq(count, 128 >> shift);
8852     jccb(Assembler::greater, L_fill_192_bytes_zmm);
8853     fill64(to, 0, xtmp, true);
8854     subq(count, 64 >> shift);
8855     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
8856     jmp(L_exit);
8857 
8858     bind(L_fill_192_bytes_zmm);
8859     cmpq(count, 192 >> shift);
8860     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
8861     fill64(to, 0, xtmp, true);
8862     fill64(to, 64, xtmp, true);
8863     subq(count, 128 >> shift);
8864     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
8865     jmp(L_exit);
8866 
8867     bind(L_fill_192_bytes_loop_pre_header_zmm);
8868     {
8869       movq(rtmp, to);
8870       andq(rtmp, 63);
8871       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
8872       negq(rtmp);
8873       addq(rtmp, 64);
8874       mov64(r8, -1L);
8875       bzhiq(r8, r8, rtmp);
8876       kmovql(k2, r8);
8877       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit);
8878       addq(to, rtmp);
8879       shrq(rtmp, shift);
8880       subq(count, rtmp);
8881     }
8882 
8883     cmpq(count, 192 >> shift);
8884     jcc(Assembler::less, L_fill_start_zmm_sequence);
8885 
8886     bind(L_fill_192_bytes_loop_header_zmm);
8887     subq(count, 192 >> shift);
8888 
8889     align32();
8890     bind(L_fill_192_bytes_loop_zmm);
8891       fill64(to, 0, xtmp, true);
8892       fill64(to, 64, xtmp, true);
8893       fill64(to, 128, xtmp, true);
8894       addq(to, 192);
8895       subq(count, 192 >> shift);
8896       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
8897 
8898     addq(count, 192 >> shift);
8899     jcc(Assembler::zero, L_exit);
8900     jmp(L_fill_start_zmm_sequence);
8901   }
8902   bind(L_exit);
8903 }
8904 #endif
8905 #endif //COMPILER2_OR_JVMCI
8906 
8907 
8908 #ifdef _LP64
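// The scalar f2i/f2l/d2i/d2l conversions below rely on CVTTSS2SI/CVTTSD2SI
// returning the "integer indefinite" value (0x80000000 or
// 0x8000000000000000) when the source is NaN or out of range; only that one
// value is re-checked and, if present, corrected via a fixup stub so that the
// result matches the Java Language Specification.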
8909 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
8910   Label done;
8911   cvttss2sil(dst, src);
8912   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
8913   cmpl(dst, 0x80000000); // float_sign_flip
8914   jccb(Assembler::notEqual, done);
8915   subptr(rsp, 8);
8916   movflt(Address(rsp, 0), src);
8917   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
8918   pop(dst);
8919   bind(done);
8920 }
8921 
8922 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
8923   Label done;
8924   cvttsd2sil(dst, src);
8925   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
8926   cmpl(dst, 0x80000000); // float_sign_flip
8927   jccb(Assembler::notEqual, done);
8928   subptr(rsp, 8);
8929   movdbl(Address(rsp, 0), src);
8930   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
8931   pop(dst);
8932   bind(done);
8933 }
8934 
8935 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
8936   Label done;
8937   cvttss2siq(dst, src);
8938   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8939   jccb(Assembler::notEqual, done);
8940   subptr(rsp, 8);
8941   movflt(Address(rsp, 0), src);
8942   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
8943   pop(dst);
8944   bind(done);
8945 }
8946 
8947 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
8948   Label done;
8949   cvttsd2siq(dst, src);
8950   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8951   jccb(Assembler::notEqual, done);
8952   subptr(rsp, 8);
8953   movdbl(Address(rsp, 0), src);
8954   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
8955   pop(dst);
8956   bind(done);
8957 }
8958 
8959 void MacroAssembler::cache_wb(Address line)
8960 {
  // 64-bit CPUs always support clflush
8962   assert(VM_Version::supports_clflush(), "clflush should be available");
8963   bool optimized = VM_Version::supports_clflushopt();
8964   bool no_evict = VM_Version::supports_clwb();
8965 
8966   // prefer clwb (writeback without evict) otherwise
8967   // prefer clflushopt (potentially parallel writeback with evict)
8968   // otherwise fallback on clflush (serial writeback with evict)
8969 
8970   if (optimized) {
8971     if (no_evict) {
8972       clwb(line);
8973     } else {
8974       clflushopt(line);
8975     }
8976   } else {
8977     // no need for fence when using CLFLUSH
8978     clflush(line);
8979   }
8980 }
8981 
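// Emits the ordering required around a sequence of cache_wb() calls: nothing
// is needed before the flushes (is_pre == true), and an sfence is emitted
// afterwards only when the weakly-ordered clflushopt/clwb instructions may
// have been used.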
8982 void MacroAssembler::cache_wbsync(bool is_pre)
8983 {
8984   assert(VM_Version::supports_clflush(), "clflush should be available");
8985   bool optimized = VM_Version::supports_clflushopt();
8986   bool no_evict = VM_Version::supports_clwb();
8987 
8988   // pick the correct implementation
8989 
8990   if (!is_pre && (optimized || no_evict)) {
    // need an sfence for the post flush when using clflushopt or clwb;
    // otherwise no synchronization is needed
8993 
8994     sfence();
8995   }
8996 }
8997 
8998 #endif // _LP64
8999 
9000 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9001   switch (cond) {
9002     // Note some conditions are synonyms for others
9003     case Assembler::zero:         return Assembler::notZero;
9004     case Assembler::notZero:      return Assembler::zero;
9005     case Assembler::less:         return Assembler::greaterEqual;
9006     case Assembler::lessEqual:    return Assembler::greater;
9007     case Assembler::greater:      return Assembler::lessEqual;
9008     case Assembler::greaterEqual: return Assembler::less;
9009     case Assembler::below:        return Assembler::aboveEqual;
9010     case Assembler::belowEqual:   return Assembler::above;
9011     case Assembler::above:        return Assembler::belowEqual;
9012     case Assembler::aboveEqual:   return Assembler::below;
9013     case Assembler::overflow:     return Assembler::noOverflow;
9014     case Assembler::noOverflow:   return Assembler::overflow;
9015     case Assembler::negative:     return Assembler::positive;
9016     case Assembler::positive:     return Assembler::negative;
9017     case Assembler::parity:       return Assembler::noParity;
9018     case Assembler::noParity:     return Assembler::parity;
9019   }
9020   ShouldNotReachHere(); return Assembler::overflow;
9021 }
9022 
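// RAII helper: the code emitted between the constructor and the destructor is
// skipped at runtime when the byte at flag_addr equals the given value (the
// constructor emits the compare and branch, the destructor binds the branch
// target).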
9023 SkipIfEqual::SkipIfEqual(
9024     MacroAssembler* masm, const bool* flag_addr, bool value) {
9025   _masm = masm;
9026   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9027   _masm->jcc(Assembler::equal, _label);
9028 }
9029 
9030 SkipIfEqual::~SkipIfEqual() {
9031   _masm->bind(_label);
9032 }
9033 
9034 // 32-bit Windows has its own fast-path implementation
9035 // of get_thread
9036 #if !defined(WIN32) || defined(_LP64)
9037 
9038 // This is simply a call to Thread::current()
9039 void MacroAssembler::get_thread(Register thread) {
9040   if (thread != rax) {
9041     push(rax);
9042   }
9043   LP64_ONLY(push(rdi);)
9044   LP64_ONLY(push(rsi);)
9045   push(rdx);
9046   push(rcx);
9047 #ifdef _LP64
9048   push(r8);
9049   push(r9);
9050   push(r10);
9051   push(r11);
9052 #endif
9053 
9054   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9055 
9056 #ifdef _LP64
9057   pop(r11);
9058   pop(r10);
9059   pop(r9);
9060   pop(r8);
9061 #endif
9062   pop(rcx);
9063   pop(rdx);
9064   LP64_ONLY(pop(rsi);)
9065   LP64_ONLY(pop(rdi);)
9066   if (thread != rax) {
9067     mov(thread, rax);
9068     pop(rax);
9069   }
9070 }
9071 
9072 
9073 #endif // !WIN32 || _LP64
--- EOF ---