1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/flags/flagSetting.hpp"
  44 #include "runtime/interfaceSupport.inline.hpp"
  45 #include "runtime/jniHandles.hpp"
  46 #include "runtime/objectMonitor.hpp"
  47 #include "runtime/os.hpp"
  48 #include "runtime/safepoint.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/thread.hpp"
  53 #include "utilities/macros.hpp"
  54 #include "crc32c.h"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
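     // The table above maps each condition to its logical negation, e.g.
     // reverse[Assembler::zero] == Assembler::notZero and
     // reverse[Assembler::less] == Assembler::greaterEqual.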
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that differ between 32-bit and 64-bit,
  94 // unless the difference is trivial (a line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 
 119 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 120   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 121 }
 122 
 123 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 124   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 125 }
 126 
 127 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 128   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 129 }
 130 
 131 void MacroAssembler::extend_sign(Register hi, Register lo) {
 132   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 133   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 134     cdql();
 135   } else {
 136     movl(hi, lo);
 137     sarl(hi, 31);
 138   }
 139 }
 140 
 141 void MacroAssembler::jC2(Register tmp, Label& L) {
 142   // set parity bit if FPU flag C2 is set (via rax)
 143   save_rax(tmp);
 144   fwait(); fnstsw_ax();
 145   sahf();
 146   restore_rax(tmp);
 147   // branch
 148   jcc(Assembler::parity, L);
 149 }
 150 
 151 void MacroAssembler::jnC2(Register tmp, Label& L) {
 152   // set parity bit if FPU flag C2 is set (via rax)
 153   save_rax(tmp);
 154   fwait(); fnstsw_ax();
 155   sahf();
 156   restore_rax(tmp);
 157   // branch
 158   jcc(Assembler::noParity, L);
 159 }
 160 
 161 // 32bit can do a case table jump in one instruction but we no longer allow the base
 162 // to be installed in the Address class
 163 void MacroAssembler::jump(ArrayAddress entry) {
 164   jmp(as_Address(entry));
 165 }
 166 
 167 // Note: y_lo will be destroyed
 168 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 169   // Long compare for Java (semantics as described in JVM spec.)
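       // Result (in x_hi): -1 if x < y, 0 if x == y, +1 if x > y, where
       // x = x_hi:x_lo and y = y_hi:y_lo are treated as signed 64-bit values.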
 170   Label high, low, done;
 171 
 172   cmpl(x_hi, y_hi);
 173   jcc(Assembler::less, low);
 174   jcc(Assembler::greater, high);
 175   // x_hi is the return register
 176   xorl(x_hi, x_hi);
 177   cmpl(x_lo, y_lo);
 178   jcc(Assembler::below, low);
 179   jcc(Assembler::equal, done);
 180 
 181   bind(high);
 182   xorl(x_hi, x_hi);
 183   increment(x_hi);
 184   jmp(done);
 185 
 186   bind(low);
 187   xorl(x_hi, x_hi);
 188   decrementl(x_hi);
 189 
 190   bind(done);
 191 }
 192 
 193 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 194     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 195 }
 196 
 197 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 198   // leal(dst, as_Address(adr));
 199   // see note in movl as to why we must use a move
 200   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 201 }
 202 
 203 void MacroAssembler::leave() {
 204   mov(rsp, rbp);
 205   pop(rbp);
 206 }
 207 
 208 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 209   // Multiplication of two Java long values stored on the stack
 210   // as illustrated below. Result is in rdx:rax.
 211   //
 212   // rsp ---> [  ??  ] \               \
 213   //            ....    | y_rsp_offset  |
 214   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 215   //          [ y_hi ]                  | (in bytes)
 216   //            ....                    |
 217   //          [ x_lo ]                 /
 218   //          [ x_hi ]
 219   //            ....
 220   //
 221   // Basic idea: lo(result) = lo(x_lo * y_lo)
 222   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
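       // (The cross terms hi(x_hi * y_lo), hi(x_lo * y_hi) and the product
       //  x_hi * y_hi only affect bits above bit 63 and are therefore dropped.)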
 223   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 224   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 225   Label quick;
 226   // load x_hi, y_hi and check if quick
 227   // multiplication is possible
 228   movl(rbx, x_hi);
 229   movl(rcx, y_hi);
 230   movl(rax, rbx);
 231   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 232   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 233   // do full multiplication
 234   // 1st step
 235   mull(y_lo);                                    // x_hi * y_lo
 236   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 237   // 2nd step
 238   movl(rax, x_lo);
 239   mull(rcx);                                     // x_lo * y_hi
 240   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 241   // 3rd step
 242   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 243   movl(rax, x_lo);
 244   mull(y_lo);                                    // x_lo * y_lo
 245   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 246 }
 247 
 248 void MacroAssembler::lneg(Register hi, Register lo) {
 249   negl(lo);
 250   adcl(hi, 0);
 251   negl(hi);
 252 }
 253 
 254 void MacroAssembler::lshl(Register hi, Register lo) {
 255   // Java shift left long support (semantics as described in JVM spec., p.305)
 256   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 257   // shift value is in rcx !
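       // Example: for s = 40 the code moves lo into hi and clears lo (a shift
       // by 32); the hardware then shifts by 40 mod 32 = 8, giving 40 in total.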
 258   assert(hi != rcx, "must not use rcx");
 259   assert(lo != rcx, "must not use rcx");
 260   const Register s = rcx;                        // shift count
 261   const int      n = BitsPerWord;
 262   Label L;
 263   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 264   cmpl(s, n);                                    // if (s < n)
 265   jcc(Assembler::less, L);                       // else (s >= n)
 266   movl(hi, lo);                                  // x := x << n
 267   xorl(lo, lo);
 268   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 269   bind(L);                                       // s (mod n) < n
 270   shldl(hi, lo);                                 // x := x << s
 271   shll(lo);
 272 }
 273 
 274 
 275 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 276   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 277   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
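       // As in lshl above, counts >= 32 first move hi into lo (a shift by 32);
       // the hardware then applies the remaining count mod 32.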
 278   assert(hi != rcx, "must not use rcx");
 279   assert(lo != rcx, "must not use rcx");
 280   const Register s = rcx;                        // shift count
 281   const int      n = BitsPerWord;
 282   Label L;
 283   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 284   cmpl(s, n);                                    // if (s < n)
 285   jcc(Assembler::less, L);                       // else (s >= n)
 286   movl(lo, hi);                                  // x := x >> n
 287   if (sign_extension) sarl(hi, 31);
 288   else                xorl(hi, hi);
 289   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 290   bind(L);                                       // s (mod n) < n
 291   shrdl(lo, hi);                                 // x := x >> s
 292   if (sign_extension) sarl(hi);
 293   else                shrl(hi);
 294 }
 295 
 296 void MacroAssembler::movoop(Register dst, jobject obj) {
 297   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 298 }
 299 
 300 void MacroAssembler::movoop(Address dst, jobject obj) {
 301   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 302 }
 303 
 304 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 305   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 309   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 313   // scratch register is not used,
 314   // it is defined to match parameters of 64-bit version of this method.
 315   if (src.is_lval()) {
 316     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 317   } else {
 318     movl(dst, as_Address(src));
 319   }
 320 }
 321 
 322 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 323   movl(as_Address(dst), src);
 324 }
 325 
 326 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 327   movl(dst, as_Address(src));
 328 }
 329 
 330 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 331 void MacroAssembler::movptr(Address dst, intptr_t src) {
 332   movl(dst, src);
 333 }
 334 
 335 
 336 void MacroAssembler::pop_callee_saved_registers() {
 337   pop(rcx);
 338   pop(rdx);
 339   pop(rdi);
 340   pop(rsi);
 341 }
 342 
 343 void MacroAssembler::push_callee_saved_registers() {
 344   push(rsi);
 345   push(rdi);
 346   push(rdx);
 347   push(rcx);
 348 }
 349 
 350 void MacroAssembler::pushoop(jobject obj) {
 351   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 352 }
 353 
 354 void MacroAssembler::pushklass(Metadata* obj) {
 355   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 356 }
 357 
 358 void MacroAssembler::pushptr(AddressLiteral src) {
 359   if (src.is_lval()) {
 360     push_literal32((int32_t)src.target(), src.rspec());
 361   } else {
 362     pushl(as_Address(src));
 363   }
 364 }
 365 
 366 static void pass_arg0(MacroAssembler* masm, Register arg) {
 367   masm->push(arg);
 368 }
 369 
 370 static void pass_arg1(MacroAssembler* masm, Register arg) {
 371   masm->push(arg);
 372 }
 373 
 374 static void pass_arg2(MacroAssembler* masm, Register arg) {
 375   masm->push(arg);
 376 }
 377 
 378 static void pass_arg3(MacroAssembler* masm, Register arg) {
 379   masm->push(arg);
 380 }
 381 
 382 #ifndef PRODUCT
 383 extern "C" void findpc(intptr_t x);
 384 #endif
 385 
 386 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 387   // In order to get locks to work, we need to fake an in_VM state
 388   JavaThread* thread = JavaThread::current();
 389   JavaThreadState saved_state = thread->thread_state();
 390   thread->set_thread_state(_thread_in_vm);
 391   if (ShowMessageBoxOnError) {
 395     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 396       ttyLocker ttyl;
 397       BytecodeCounter::print();
 398     }
 399     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 400     // This is the value of eip which points to where verify_oop will return.
 401     if (os::message_box(msg, "Execution stopped, print registers?")) {
 402       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 403       BREAKPOINT;
 404     }
 405   }
 406   fatal("DEBUG MESSAGE: %s", msg);
 407 }
 408 
 409 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 410   ttyLocker ttyl;
 411   FlagSetting fs(Debugging, true);
 412   tty->print_cr("eip = 0x%08x", eip);
 413 #ifndef PRODUCT
 414   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 415     tty->cr();
 416     findpc(eip);
 417     tty->cr();
 418   }
 419 #endif
 420 #define PRINT_REG(rax) \
 421   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 422   PRINT_REG(rax);
 423   PRINT_REG(rbx);
 424   PRINT_REG(rcx);
 425   PRINT_REG(rdx);
 426   PRINT_REG(rdi);
 427   PRINT_REG(rsi);
 428   PRINT_REG(rbp);
 429   PRINT_REG(rsp);
 430 #undef PRINT_REG
 431   // Print some words near the top of the stack.
 432   int* dump_sp = (int*) rsp;
 433   for (int col1 = 0; col1 < 8; col1++) {
 434     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 435     os::print_location(tty, *dump_sp++);
 436   }
 437   for (int row = 0; row < 16; row++) {
 438     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 439     for (int col = 0; col < 8; col++) {
 440       tty->print(" 0x%08x", *dump_sp++);
 441     }
 442     tty->cr();
 443   }
 444   // Print some instructions around pc:
 445   Disassembler::decode((address)eip-64, (address)eip);
 446   tty->print_cr("--------");
 447   Disassembler::decode((address)eip, (address)eip+32);
 448 }
 449 
 450 void MacroAssembler::stop(const char* msg) {
 451   ExternalAddress message((address)msg);
 452   // push address of message
 453   pushptr(message.addr());
 454   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 455   pusha();                                            // push registers
 456   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 457   hlt();
 458 }
 459 
 460 void MacroAssembler::warn(const char* msg) {
 461   push_CPU_state();
 462 
 463   ExternalAddress message((address) msg);
 464   // push address of message
 465   pushptr(message.addr());
 466 
 467   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 468   addl(rsp, wordSize);       // discard argument
 469   pop_CPU_state();
 470 }
 471 
 472 void MacroAssembler::print_state() {
 473   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 474   pusha();                                            // push registers
 475 
 476   push_CPU_state();
 477   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 478   pop_CPU_state();
 479 
 480   popa();
 481   addl(rsp, wordSize);
 482 }
 483 
 484 #else // _LP64
 485 
 486 // 64 bit versions
 487 
 488 Address MacroAssembler::as_Address(AddressLiteral adr) {
 489   // amd64 always does this as a pc-rel
 490   // we can be absolute or disp based on the instruction type
 491   // jmp/call are displacements others are absolute
 492   assert(!adr.is_lval(), "must be rval");
 493   assert(reachable(adr), "must be");
 494   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 495 
 496 }
 497 
 498 Address MacroAssembler::as_Address(ArrayAddress adr) {
 499   AddressLiteral base = adr.base();
 500   lea(rscratch1, base);
 501   Address index = adr.index();
 502   assert(index._disp == 0, "must not have disp"); // maybe it can?
 503   Address array(rscratch1, index._index, index._scale, index._disp);
 504   return array;
 505 }
 506 
 507 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 508   Label L, E;
 509 
 510 #ifdef _WIN64
 511   // Windows always allocates space for its register args
 512   assert(num_args <= 4, "only register arguments supported");
 513   subq(rsp,  frame::arg_reg_save_area_bytes);
 514 #endif
 515 
 516   // Align stack if necessary
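       // rsp is assumed to be 8-byte aligned here, so if it is not 16-byte
       // aligned it is off by exactly one word; in that case drop it by 8 so
       // the stack is 16-byte aligned at the call, as the ABI requires.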
 517   testl(rsp, 15);
 518   jcc(Assembler::zero, L);
 519 
 520   subq(rsp, 8);
 521   {
 522     call(RuntimeAddress(entry_point));
 523   }
 524   addq(rsp, 8);
 525   jmp(E);
 526 
 527   bind(L);
 528   {
 529     call(RuntimeAddress(entry_point));
 530   }
 531 
 532   bind(E);
 533 
 534 #ifdef _WIN64
 535   // restore stack pointer
 536   addq(rsp, frame::arg_reg_save_area_bytes);
 537 #endif
 538 
 539 }
 540 
 541 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 542   assert(!src2.is_lval(), "should use cmpptr");
 543 
 544   if (reachable(src2)) {
 545     cmpq(src1, as_Address(src2));
 546   } else {
 547     lea(rscratch1, src2);
 548     Assembler::cmpq(src1, Address(rscratch1, 0));
 549   }
 550 }
 551 
 552 int MacroAssembler::corrected_idivq(Register reg) {
 553   // Full implementation of Java ldiv and lrem; checks for special
 554   // case as described in JVM spec., p.243 & p.271.  The function
 555   // returns the (pc) offset of the idivq instruction - may be needed
 556   // for implicit exceptions.
 557   //
 558   //         normal case                           special case
 559   //
 560   // input : rax: dividend                         min_long
 561   //         reg: divisor   (may not be eax/edx)   -1
 562   //
 563   // output: rax: quotient  (= rax idiv reg)       min_long
 564   //         rdx: remainder (= rax irem reg)       0
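       // The special case must be handled separately because min_long / -1
       // overflows and idivq would raise a hardware divide error, while Java
       // defines the result to be min_long with a remainder of 0.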
 565   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 566   static const int64_t min_long = 0x8000000000000000;
 567   Label normal_case, special_case;
 568 
 569   // check for special case
 570   cmp64(rax, ExternalAddress((address) &min_long));
 571   jcc(Assembler::notEqual, normal_case);
 572   xorl(rdx, rdx); // prepare rdx for possible special case (where
 573                   // remainder = 0)
 574   cmpq(reg, -1);
 575   jcc(Assembler::equal, special_case);
 576 
 577   // handle normal case
 578   bind(normal_case);
 579   cdqq();
 580   int idivq_offset = offset();
 581   idivq(reg);
 582 
 583   // normal and special case exit
 584   bind(special_case);
 585 
 586   return idivq_offset;
 587 }
 588 
 589 void MacroAssembler::decrementq(Register reg, int value) {
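       // value == min_jint must be handled first: negating it below would overflow.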
 590   if (value == min_jint) { subq(reg, value); return; }
 591   if (value <  0) { incrementq(reg, -value); return; }
 592   if (value == 0) {                        ; return; }
 593   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 594   /* else */      { subq(reg, value)       ; return; }
 595 }
 596 
 597 void MacroAssembler::decrementq(Address dst, int value) {
 598   if (value == min_jint) { subq(dst, value); return; }
 599   if (value <  0) { incrementq(dst, -value); return; }
 600   if (value == 0) {                        ; return; }
 601   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 602   /* else */      { subq(dst, value)       ; return; }
 603 }
 604 
 605 void MacroAssembler::incrementq(AddressLiteral dst) {
 606   if (reachable(dst)) {
 607     incrementq(as_Address(dst));
 608   } else {
 609     lea(rscratch1, dst);
 610     incrementq(Address(rscratch1, 0));
 611   }
 612 }
 613 
 614 void MacroAssembler::incrementq(Register reg, int value) {
 615   if (value == min_jint) { addq(reg, value); return; }
 616   if (value <  0) { decrementq(reg, -value); return; }
 617   if (value == 0) {                        ; return; }
 618   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 619   /* else */      { addq(reg, value)       ; return; }
 620 }
 621 
 622 void MacroAssembler::incrementq(Address dst, int value) {
 623   if (value == min_jint) { addq(dst, value); return; }
 624   if (value <  0) { decrementq(dst, -value); return; }
 625   if (value == 0) {                        ; return; }
 626   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 627   /* else */      { addq(dst, value)       ; return; }
 628 }
 629 
 630 // 32bit can do a case table jump in one instruction but we no longer allow the base
 631 // to be installed in the Address class
 632 void MacroAssembler::jump(ArrayAddress entry) {
 633   lea(rscratch1, entry.base());
 634   Address dispatch = entry.index();
 635   assert(dispatch._base == noreg, "must be");
 636   dispatch._base = rscratch1;
 637   jmp(dispatch);
 638 }
 639 
 640 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 641   ShouldNotReachHere(); // 64bit doesn't use two regs
 642   cmpq(x_lo, y_lo);
 643 }
 644 
 645 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 646     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 647 }
 648 
 649 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 650   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 651   movptr(dst, rscratch1);
 652 }
 653 
 654 void MacroAssembler::leave() {
 655   // %%% is this really better? Why not on 32bit too?
 656   emit_int8((unsigned char)0xC9); // LEAVE
 657 }
 658 
 659 void MacroAssembler::lneg(Register hi, Register lo) {
 660   ShouldNotReachHere(); // 64bit doesn't use two regs
 661   negq(lo);
 662 }
 663 
 664 void MacroAssembler::movoop(Register dst, jobject obj) {
 665   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 666 }
 667 
 668 void MacroAssembler::movoop(Address dst, jobject obj) {
 669   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 670   movq(dst, rscratch1);
 671 }
 672 
 673 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 674   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 675 }
 676 
 677 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 678   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 679   movq(dst, rscratch1);
 680 }
 681 
 682 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 683   if (src.is_lval()) {
 684     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 685   } else {
 686     if (reachable(src)) {
 687       movq(dst, as_Address(src));
 688     } else {
 689       lea(scratch, src);
 690       movq(dst, Address(scratch, 0));
 691     }
 692   }
 693 }
 694 
 695 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 696   movq(as_Address(dst), src);
 697 }
 698 
 699 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 700   movq(dst, as_Address(src));
 701 }
 702 
 703 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
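     // Only a sign-extended 32-bit immediate can be stored to memory directly;
     // larger values are first materialized in rscratch1.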
 704 void MacroAssembler::movptr(Address dst, intptr_t src) {
 705   if (is_simm32(src)) {
 706     movptr(dst, checked_cast<int32_t>(src));
 707   } else {
 708     mov64(rscratch1, src);
 709     movq(dst, rscratch1);
 710   }
 711 }
 712 
 713 // These are mostly for initializing NULL
 714 void MacroAssembler::movptr(Address dst, int32_t src) {
 715   movslq(dst, src);
 716 }
 717 
 718 void MacroAssembler::movptr(Register dst, int32_t src) {
 719   mov64(dst, (intptr_t)src);
 720 }
 721 
 722 void MacroAssembler::pushoop(jobject obj) {
 723   movoop(rscratch1, obj);
 724   push(rscratch1);
 725 }
 726 
 727 void MacroAssembler::pushklass(Metadata* obj) {
 728   mov_metadata(rscratch1, obj);
 729   push(rscratch1);
 730 }
 731 
 732 void MacroAssembler::pushptr(AddressLiteral src) {
 733   lea(rscratch1, src);
 734   if (src.is_lval()) {
 735     push(rscratch1);
 736   } else {
 737     pushq(Address(rscratch1, 0));
 738   }
 739 }
 740 
 741 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 742   reset_last_Java_frame(r15_thread, clear_fp);
 743 }
 744 
 745 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 746                                          Register last_java_fp,
 747                                          address  last_java_pc) {
 748   vzeroupper();
 749   // determine last_java_sp register
 750   if (!last_java_sp->is_valid()) {
 751     last_java_sp = rsp;
 752   }
 753 
 754   // last_java_fp is optional
 755   if (last_java_fp->is_valid()) {
 756     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 757            last_java_fp);
 758   }
 759 
 760   // last_java_pc is optional
 761   if (last_java_pc != NULL) {
 762     Address java_pc(r15_thread,
 763                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 764     lea(rscratch1, InternalAddress(last_java_pc));
 765     movptr(java_pc, rscratch1);
 766   }
 767 
 768   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 769 }
 770 
 771 static void pass_arg0(MacroAssembler* masm, Register arg) {
 772   if (c_rarg0 != arg ) {
 773     masm->mov(c_rarg0, arg);
 774   }
 775 }
 776 
 777 static void pass_arg1(MacroAssembler* masm, Register arg) {
 778   if (c_rarg1 != arg ) {
 779     masm->mov(c_rarg1, arg);
 780   }
 781 }
 782 
 783 static void pass_arg2(MacroAssembler* masm, Register arg) {
 784   if (c_rarg2 != arg ) {
 785     masm->mov(c_rarg2, arg);
 786   }
 787 }
 788 
 789 static void pass_arg3(MacroAssembler* masm, Register arg) {
 790   if (c_rarg3 != arg ) {
 791     masm->mov(c_rarg3, arg);
 792   }
 793 }
 794 
 795 void MacroAssembler::stop(const char* msg) {
 796   if (ShowMessageBoxOnError) {
 797     address rip = pc();
 798     pusha(); // get regs on stack
 799     lea(c_rarg1, InternalAddress(rip));
 800     movq(c_rarg2, rsp); // pass pointer to regs array
 801   }
 802   lea(c_rarg0, ExternalAddress((address) msg));
 803   andq(rsp, -16); // align stack as required by ABI
 804   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 805   hlt();
 806 }
 807 
 808 void MacroAssembler::warn(const char* msg) {
 809   push(rbp);
 810   movq(rbp, rsp);
 811   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 812   push_CPU_state();   // keeps alignment at 16 bytes
 813   lea(c_rarg0, ExternalAddress((address) msg));
 814   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 815   call(rax);
 816   pop_CPU_state();
 817   mov(rsp, rbp);
 818   pop(rbp);
 819 }
 820 
 821 void MacroAssembler::print_state() {
 822   address rip = pc();
 823   pusha();            // get regs on stack
 824   push(rbp);
 825   movq(rbp, rsp);
 826   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 827   push_CPU_state();   // keeps alignment at 16 bytes
 828 
 829   lea(c_rarg0, InternalAddress(rip));
 830   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 831   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 832 
 833   pop_CPU_state();
 834   mov(rsp, rbp);
 835   pop(rbp);
 836   popa();
 837 }
 838 
 839 #ifndef PRODUCT
 840 extern "C" void findpc(intptr_t x);
 841 #endif
 842 
 843 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 844   // In order to get locks to work, we need to fake an in_VM state
 845   if (ShowMessageBoxOnError) {
 846     JavaThread* thread = JavaThread::current();
 847     JavaThreadState saved_state = thread->thread_state();
 848     thread->set_thread_state(_thread_in_vm);
 849 #ifndef PRODUCT
 850     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 851       ttyLocker ttyl;
 852       BytecodeCounter::print();
 853     }
 854 #endif
 855     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 856     // XXX correct this offset for amd64
 857     // This is the value of eip which points to where verify_oop will return.
 858     if (os::message_box(msg, "Execution stopped, print registers?")) {
 859       print_state64(pc, regs);
 860       BREAKPOINT;
 861     }
 862   }
 863   fatal("DEBUG MESSAGE: %s", msg);
 864 }
 865 
 866 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 867   ttyLocker ttyl;
 868   FlagSetting fs(Debugging, true);
 869   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 870 #ifndef PRODUCT
 871   tty->cr();
 872   findpc(pc);
 873   tty->cr();
 874 #endif
 875 #define PRINT_REG(rax, value) \
 876   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 877   PRINT_REG(rax, regs[15]);
 878   PRINT_REG(rbx, regs[12]);
 879   PRINT_REG(rcx, regs[14]);
 880   PRINT_REG(rdx, regs[13]);
 881   PRINT_REG(rdi, regs[8]);
 882   PRINT_REG(rsi, regs[9]);
 883   PRINT_REG(rbp, regs[10]);
 884   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 885   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 886   PRINT_REG(r8 , regs[7]);
 887   PRINT_REG(r9 , regs[6]);
 888   PRINT_REG(r10, regs[5]);
 889   PRINT_REG(r11, regs[4]);
 890   PRINT_REG(r12, regs[3]);
 891   PRINT_REG(r13, regs[2]);
 892   PRINT_REG(r14, regs[1]);
 893   PRINT_REG(r15, regs[0]);
 894 #undef PRINT_REG
 895   // Print some words near the top of the stack.
 896   int64_t* rsp = &regs[16];
 897   int64_t* dump_sp = rsp;
 898   for (int col1 = 0; col1 < 8; col1++) {
 899     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 900     os::print_location(tty, *dump_sp++);
 901   }
 902   for (int row = 0; row < 25; row++) {
 903     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 904     for (int col = 0; col < 4; col++) {
 905       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 906     }
 907     tty->cr();
 908   }
 909   // Print some instructions around pc:
 910   Disassembler::decode((address)pc-64, (address)pc);
 911   tty->print_cr("--------");
 912   Disassembler::decode((address)pc, (address)pc+32);
 913 }
 914 
 915 // The java_calling_convention describes stack locations as ideal slots on
 916 // a frame with no abi restrictions. Since we must observe abi restrictions
 917 // (like the placement of the register window) the slots must be biased by
 918 // the following value.
 919 static int reg2offset_in(VMReg r) {
 920   // Account for saved rbp and return address
 921   // This should really be in_preserve_stack_slots
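       // (saved rbp and the return address are one word = two 4-byte stack
       //  slots each, hence the bias of 4 slots)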
 922   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 923 }
 924 
 925 static int reg2offset_out(VMReg r) {
 926   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 927 }
 928 
 929 // A long move
 930 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {
 931 
 932   // The calling convention assures us that each VMRegPair is either
 933   // entirely one physical register or adjacent stack slots.
 934 
 935   if (src.is_single_phys_reg() ) {
 936     if (dst.is_single_phys_reg()) {
 937       if (dst.first() != src.first()) {
 938         mov(dst.first()->as_Register(), src.first()->as_Register());
 939       }
 940     } else {
 941       assert(dst.is_single_reg(), "not a stack pair");
 942       movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
 943     }
 944   } else if (dst.is_single_phys_reg()) {
 945     assert(src.is_single_reg(),  "not a stack pair");
 946     movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
 947   } else {
 948     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 949     movq(rax, Address(rbp, reg2offset_in(src.first())));
 950     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 951   }
 952 }
 953 
 954 // A double move
 955 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {
 956 
 957   // The calling convention assures us that each VMRegPair is either
 958   // entirely one physical register or adjacent stack slots.
 959 
 960   if (src.is_single_phys_reg() ) {
 961     if (dst.is_single_phys_reg()) {
 962       // In theory these overlap but the ordering is such that this is likely a nop
 963       if ( src.first() != dst.first()) {
 964         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 965       }
 966     } else {
 967       assert(dst.is_single_reg(), "not a stack pair");
 968       movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
 969     }
 970   } else if (dst.is_single_phys_reg()) {
 971     assert(src.is_single_reg(),  "not a stack pair");
 972     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
 973   } else {
 974     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 975     movq(rax, Address(rbp, reg2offset_in(src.first())));
 976     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 977   }
 978 }
 979 
 980 
 981 // A float arg may have to do float reg int reg conversion
 982 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
 983   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 984 
 985   // The calling convention assures us that each VMRegPair is either
 986   // entirely one physical register or adjacent stack slots.
 987 
 988   if (src.first()->is_stack()) {
 989     if (dst.first()->is_stack()) {
 990       movl(rax, Address(rbp, reg2offset_in(src.first())));
 991       movptr(Address(rsp, reg2offset_out(dst.first())), rax);
 992     } else {
 993       // stack to reg
 994       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 995       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
 996     }
 997   } else if (dst.first()->is_stack()) {
 998     // reg to stack
 999     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1000     movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1001   } else {
1002     // reg to reg
1003     // In theory these overlap but the ordering is such that this is likely a nop
1004     if ( src.first() != dst.first()) {
1005       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
1006     }
1007   }
1008 }
1009 
1010 // On 64 bit we will store integer like items to the stack as
1011 // 64 bits items (x86_32/64 abi) even though java would only store
1012 // 32bits for a parameter. On 32bit it will simply be 32 bits
1013 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
1014 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1015   if (src.first()->is_stack()) {
1016     if (dst.first()->is_stack()) {
1017       // stack to stack
1018       movslq(rax, Address(rbp, reg2offset_in(src.first())));
1019       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1020     } else {
1021       // stack to reg
1022       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1023     }
1024   } else if (dst.first()->is_stack()) {
1025     // reg to stack
1026     // Do we really have to sign extend???
1027     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1028     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1029   } else {
1030     // Do we really have to sign extend???
1031     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1032     if (dst.first() != src.first()) {
1033       movq(dst.first()->as_Register(), src.first()->as_Register());
1034     }
1035   }
1036 }
1037 
1038 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1039   if (src.first()->is_stack()) {
1040     if (dst.first()->is_stack()) {
1041       // stack to stack
1042       movq(rax, Address(rbp, reg2offset_in(src.first())));
1043       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1044     } else {
1045       // stack to reg
1046       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1047     }
1048   } else if (dst.first()->is_stack()) {
1049     // reg to stack
1050     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1051   } else {
1052     if (dst.first() != src.first()) {
1053       movq(dst.first()->as_Register(), src.first()->as_Register());
1054     }
1055   }
1056 }
1057 
1058 // An oop arg. Must pass a handle not the oop itself
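     // The native callee expects either a pointer to a stack slot that holds
     // the oop (a handle) or NULL when the oop itself is NULL.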
1059 void MacroAssembler::object_move(OopMap* map,
1060                         int oop_handle_offset,
1061                         int framesize_in_slots,
1062                         VMRegPair src,
1063                         VMRegPair dst,
1064                         bool is_receiver,
1065                         int* receiver_offset) {
1066 
1067   // must pass a handle. First figure out the location we use as a handle
1068 
1069   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1070 
1071   // See if the oop is NULL; if it is, we need no handle
1072 
1073   if (src.first()->is_stack()) {
1074 
1075     // Oop is already on the stack as an argument
1076     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1077     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1078     if (is_receiver) {
1079       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1080     }
1081 
1082     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1083     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1084     // conditionally move a NULL
1085     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1086   } else {
1087 
1088     // Oop is in a register; we must store it to the space we reserve
1089     // on the stack for oop_handles and pass a handle if the oop is non-NULL
1090 
1091     const Register rOop = src.first()->as_Register();
1092     int oop_slot;
1093     if (rOop == j_rarg0)
1094       oop_slot = 0;
1095     else if (rOop == j_rarg1)
1096       oop_slot = 1;
1097     else if (rOop == j_rarg2)
1098       oop_slot = 2;
1099     else if (rOop == j_rarg3)
1100       oop_slot = 3;
1101     else if (rOop == j_rarg4)
1102       oop_slot = 4;
1103     else {
1104       assert(rOop == j_rarg5, "wrong register");
1105       oop_slot = 5;
1106     }
1107 
1108     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1109     int offset = oop_slot*VMRegImpl::stack_slot_size;
1110 
1111     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1112     // Store oop in handle area, may be NULL
1113     movptr(Address(rsp, offset), rOop);
1114     if (is_receiver) {
1115       *receiver_offset = offset;
1116     }
1117 
1118     cmpptr(rOop, (int32_t)NULL_WORD);
1119     lea(rHandle, Address(rsp, offset));
1120     // conditionally move a NULL from the handle area where it was just stored
1121     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1122   }
1123 
1124   // If the arg is on the stack then place it; otherwise it is already in the correct reg.
1125   if (dst.first()->is_stack()) {
1126     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1127   }
1128 }
1129 
1130 #endif // _LP64
1131 
1132 // Now versions that are common to 32/64 bit
1133 
1134 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1135   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1136 }
1137 
1138 void MacroAssembler::addptr(Register dst, Register src) {
1139   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1140 }
1141 
1142 void MacroAssembler::addptr(Address dst, Register src) {
1143   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1144 }
1145 
1146 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1147   if (reachable(src)) {
1148     Assembler::addsd(dst, as_Address(src));
1149   } else {
1150     lea(rscratch1, src);
1151     Assembler::addsd(dst, Address(rscratch1, 0));
1152   }
1153 }
1154 
1155 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1156   if (reachable(src)) {
1157     addss(dst, as_Address(src));
1158   } else {
1159     lea(rscratch1, src);
1160     addss(dst, Address(rscratch1, 0));
1161   }
1162 }
1163 
1164 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1165   if (reachable(src)) {
1166     Assembler::addpd(dst, as_Address(src));
1167   } else {
1168     lea(rscratch1, src);
1169     Assembler::addpd(dst, Address(rscratch1, 0));
1170   }
1171 }
1172 
1173 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1174 // Stub code is generated once and never copied.
1175 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1176 void MacroAssembler::align64() {
1177   align(64, (unsigned long long) pc());
1178 }
1179 
1180 void MacroAssembler::align32() {
1181   align(32, (unsigned long long) pc());
1182 }
1183 
1184 void MacroAssembler::align(int modulus) {
1185   // 8273459: Ensure alignment is possible with current segment alignment
1186   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1187   align(modulus, offset());
1188 }
1189 
1190 void MacroAssembler::align(int modulus, int target) {
1191   if (target % modulus != 0) {
1192     nop(modulus - (target % modulus));
1193   }
1194 }
1195 
1196 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1197   // Used in sign-masking with aligned address.
1198   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1199   if (reachable(src)) {
1200     Assembler::andpd(dst, as_Address(src));
1201   } else {
1202     lea(scratch_reg, src);
1203     Assembler::andpd(dst, Address(scratch_reg, 0));
1204   }
1205 }
1206 
1207 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1208   // Used in sign-masking with aligned address.
1209   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1210   if (reachable(src)) {
1211     Assembler::andps(dst, as_Address(src));
1212   } else {
1213     lea(scratch_reg, src);
1214     Assembler::andps(dst, Address(scratch_reg, 0));
1215   }
1216 }
1217 
1218 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1219   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1220 }
1221 
1222 void MacroAssembler::atomic_incl(Address counter_addr) {
1223   lock();
1224   incrementl(counter_addr);
1225 }
1226 
1227 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1228   if (reachable(counter_addr)) {
1229     atomic_incl(as_Address(counter_addr));
1230   } else {
1231     lea(scr, counter_addr);
1232     atomic_incl(Address(scr, 0));
1233   }
1234 }
1235 
1236 #ifdef _LP64
1237 void MacroAssembler::atomic_incq(Address counter_addr) {
1238   lock();
1239   incrementq(counter_addr);
1240 }
1241 
1242 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1243   if (reachable(counter_addr)) {
1244     atomic_incq(as_Address(counter_addr));
1245   } else {
1246     lea(scr, counter_addr);
1247     atomic_incq(Address(scr, 0));
1248   }
1249 }
1250 #endif
1251 
1252 // Writes to stack successive pages until offset reached to check for
1253 // stack overflow + shadow pages.  This clobbers tmp.
1254 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1255   movptr(tmp, rsp);
1256   // Bang stack for total size given plus shadow page size.
1257   // Bang one page at a time because large size can bang beyond yellow and
1258   // red zones.
1259   Label loop;
1260   bind(loop);
1261   movl(Address(tmp, (-os::vm_page_size())), size );
1262   subptr(tmp, os::vm_page_size());
1263   subl(size, os::vm_page_size());
1264   jcc(Assembler::greater, loop);
1265 
1266   // Bang down shadow pages too.
1267   // At this point, (tmp-0) is the last address touched, so don't
1268   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1269   // was post-decremented.)  Skip this address by starting at i=1, and
1270   // touch a few more pages below.  N.B.  It is important to touch all
1271   // the way down including all pages in the shadow zone.
1272   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1273     // this could be any sized move, but it can serve as a debugging crumb,
1274     // so the bigger the better.
1275     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1276   }
1277 }
1278 
1279 void MacroAssembler::reserved_stack_check() {
1280     // testing if reserved zone needs to be enabled
1281     Label no_reserved_zone_enabling;
1282     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1283     NOT_LP64(get_thread(rsi);)
1284 
1285     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1286     jcc(Assembler::below, no_reserved_zone_enabling);
1287 
1288     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1289     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1290     should_not_reach_here();
1291 
1292     bind(no_reserved_zone_enabling);
1293 }
1294 
1295 void MacroAssembler::c2bool(Register x) {
1296   // implements x == 0 ? 0 : 1
1297   // note: must only look at least-significant byte of x
1298   //       since C-style booleans are stored in one byte
1299   //       only! (was bug)
1300   andl(x, 0xFF);
1301   setb(Assembler::notZero, x);
1302 }
1303 
1304 // Wouldn't need if AddressLiteral version had new name
1305 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1306   Assembler::call(L, rtype);
1307 }
1308 
1309 void MacroAssembler::call(Register entry) {
1310   Assembler::call(entry);
1311 }
1312 
1313 void MacroAssembler::call(AddressLiteral entry) {
1314   if (reachable(entry)) {
1315     Assembler::call_literal(entry.target(), entry.rspec());
1316   } else {
1317     lea(rscratch1, entry);
1318     Assembler::call(rscratch1);
1319   }
1320 }
1321 
1322 void MacroAssembler::ic_call(address entry, jint method_index) {
1323   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1324   movptr(rax, (intptr_t)Universe::non_oop_word());
1325   call(AddressLiteral(entry, rh));
1326 }
1327 
1328 // Implementation of call_VM versions
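     // Each call_VM below reaches an out-of-line stub (label C) via a real call
     // instruction so that a return address is pushed on the stack; see
     // call_VM_helper for how this is used to recover the last Java pc.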
1329 
1330 void MacroAssembler::call_VM(Register oop_result,
1331                              address entry_point,
1332                              bool check_exceptions) {
1333   Label C, E;
1334   call(C, relocInfo::none);
1335   jmp(E);
1336 
1337   bind(C);
1338   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1339   ret(0);
1340 
1341   bind(E);
1342 }
1343 
1344 void MacroAssembler::call_VM(Register oop_result,
1345                              address entry_point,
1346                              Register arg_1,
1347                              bool check_exceptions) {
1348   Label C, E;
1349   call(C, relocInfo::none);
1350   jmp(E);
1351 
1352   bind(C);
1353   pass_arg1(this, arg_1);
1354   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1355   ret(0);
1356 
1357   bind(E);
1358 }
1359 
1360 void MacroAssembler::call_VM(Register oop_result,
1361                              address entry_point,
1362                              Register arg_1,
1363                              Register arg_2,
1364                              bool check_exceptions) {
1365   Label C, E;
1366   call(C, relocInfo::none);
1367   jmp(E);
1368 
1369   bind(C);
1370 
1371   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1372 
1373   pass_arg2(this, arg_2);
1374   pass_arg1(this, arg_1);
1375   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1376   ret(0);
1377 
1378   bind(E);
1379 }
1380 
1381 void MacroAssembler::call_VM(Register oop_result,
1382                              address entry_point,
1383                              Register arg_1,
1384                              Register arg_2,
1385                              Register arg_3,
1386                              bool check_exceptions) {
1387   Label C, E;
1388   call(C, relocInfo::none);
1389   jmp(E);
1390 
1391   bind(C);
1392 
1393   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1394   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1395   pass_arg3(this, arg_3);
1396 
1397   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1398   pass_arg2(this, arg_2);
1399 
1400   pass_arg1(this, arg_1);
1401   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1402   ret(0);
1403 
1404   bind(E);
1405 }
1406 
1407 void MacroAssembler::call_VM(Register oop_result,
1408                              Register last_java_sp,
1409                              address entry_point,
1410                              int number_of_arguments,
1411                              bool check_exceptions) {
1412   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1413   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1414 }
1415 
1416 void MacroAssembler::call_VM(Register oop_result,
1417                              Register last_java_sp,
1418                              address entry_point,
1419                              Register arg_1,
1420                              bool check_exceptions) {
1421   pass_arg1(this, arg_1);
1422   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1423 }
1424 
1425 void MacroAssembler::call_VM(Register oop_result,
1426                              Register last_java_sp,
1427                              address entry_point,
1428                              Register arg_1,
1429                              Register arg_2,
1430                              bool check_exceptions) {
1431 
1432   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1433   pass_arg2(this, arg_2);
1434   pass_arg1(this, arg_1);
1435   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1436 }
1437 
1438 void MacroAssembler::call_VM(Register oop_result,
1439                              Register last_java_sp,
1440                              address entry_point,
1441                              Register arg_1,
1442                              Register arg_2,
1443                              Register arg_3,
1444                              bool check_exceptions) {
1445   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1446   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1447   pass_arg3(this, arg_3);
1448   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1449   pass_arg2(this, arg_2);
1450   pass_arg1(this, arg_1);
1451   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1452 }
1453 
1454 void MacroAssembler::super_call_VM(Register oop_result,
1455                                    Register last_java_sp,
1456                                    address entry_point,
1457                                    int number_of_arguments,
1458                                    bool check_exceptions) {
1459   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1460   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1461 }
1462 
1463 void MacroAssembler::super_call_VM(Register oop_result,
1464                                    Register last_java_sp,
1465                                    address entry_point,
1466                                    Register arg_1,
1467                                    bool check_exceptions) {
1468   pass_arg1(this, arg_1);
1469   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1470 }
1471 
1472 void MacroAssembler::super_call_VM(Register oop_result,
1473                                    Register last_java_sp,
1474                                    address entry_point,
1475                                    Register arg_1,
1476                                    Register arg_2,
1477                                    bool check_exceptions) {
1478 
1479   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1480   pass_arg2(this, arg_2);
1481   pass_arg1(this, arg_1);
1482   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1483 }
1484 
1485 void MacroAssembler::super_call_VM(Register oop_result,
1486                                    Register last_java_sp,
1487                                    address entry_point,
1488                                    Register arg_1,
1489                                    Register arg_2,
1490                                    Register arg_3,
1491                                    bool check_exceptions) {
1492   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1493   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1494   pass_arg3(this, arg_3);
1495   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1496   pass_arg2(this, arg_2);
1497   pass_arg1(this, arg_1);
1498   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1499 }
1500 
1501 void MacroAssembler::call_VM_base(Register oop_result,
1502                                   Register java_thread,
1503                                   Register last_java_sp,
1504                                   address  entry_point,
1505                                   int      number_of_arguments,
1506                                   bool     check_exceptions) {
1507   // determine java_thread register
1508   if (!java_thread->is_valid()) {
1509 #ifdef _LP64
1510     java_thread = r15_thread;
1511 #else
1512     java_thread = rdi;
1513     get_thread(java_thread);
1514 #endif // LP64
1515   }
1516   // determine last_java_sp register
1517   if (!last_java_sp->is_valid()) {
1518     last_java_sp = rsp;
1519   }
1520   // debugging support
1521   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1522   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1523 #ifdef ASSERT
1524   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1525   // r12 is the heapbase.
1526   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1527 #endif // ASSERT
1528 
1529   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1530   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1531 
1532   // push java thread (becomes first argument of C function)
1533 
1534   NOT_LP64(push(java_thread); number_of_arguments++);
1535   LP64_ONLY(mov(c_rarg0, r15_thread));
1536 
1537   // set last Java frame before call
1538   assert(last_java_sp != rbp, "can't use ebp/rbp");
1539 
1540   // Only interpreter should have to set fp
1541   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1542 
1543   // do the call, remove parameters
1544   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1545 
1546   // restore the thread (cannot use the pushed argument since arguments
1547   // may be overwritten by C code generated by an optimizing compiler);
1548   // however, we can use the register value directly if it is callee saved.
1549   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1550     // rdi & rsi (also r15) are callee saved -> nothing to do
1551 #ifdef ASSERT
1552     guarantee(java_thread != rax, "change this code");
1553     push(rax);
1554     { Label L;
1555       get_thread(rax);
1556       cmpptr(java_thread, rax);
1557       jcc(Assembler::equal, L);
1558       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1559       bind(L);
1560     }
1561     pop(rax);
1562 #endif
1563   } else {
1564     get_thread(java_thread);
1565   }
1566   // reset last Java frame
1567   // Only interpreter should have to clear fp
1568   reset_last_Java_frame(java_thread, true);
1569 
1570   // C++ interp handles this in the interpreter
1571   check_and_handle_popframe(java_thread);
1572   check_and_handle_earlyret(java_thread);
1573 
1574   if (check_exceptions) {
1575     // check for pending exceptions (java_thread is set upon return)
1576     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1577 #ifndef _LP64
1578     jump_cc(Assembler::notEqual,
1579             RuntimeAddress(StubRoutines::forward_exception_entry()));
1580 #else
1581     // This used to be a conditional jump to forward_exception, but after
1582     // relocation the conditional branch might not reach, so we instead jump
1583     // around an unconditional jump that always can.
1584 
1585     Label ok;
1586     jcc(Assembler::equal, ok);
1587     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1588     bind(ok);
1589 #endif // LP64
1590   }
1591 
1592   // get oop result if there is one and reset the value in the thread
1593   if (oop_result->is_valid()) {
1594     get_vm_result(oop_result, java_thread);
1595   }
1596 }
1597 
1598 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1599 
1600   // Calculate the value for last_Java_sp.
1601   // This is somewhat subtle: call_VM does an intermediate call
1602   // which places a return address on the stack just under the
1603   // stack pointer as the user finished with it. This allows
1604   // us to retrieve last_Java_pc from last_Java_sp[-1].
1605   // On 32bit we then have to push additional args on the stack to accomplish
1606   // the actual requested call. On 64bit call_VM can only use register args,
1607   // so the only extra space is the return address that call_VM created.
1608   // This hopefully explains the calculations here.
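  // For illustration (a sketch, not part of the original comment): on 32-bit
  // with number_of_arguments == n, the stack at this point looks roughly like
  //   [rsp + 0 .. rsp + (n-1)*wordSize]  the n args pushed by pass_argN
  //   [rsp + n*wordSize]                 return address from the intermediate call
  // so the sp the user finished with is rsp + (1 + n)*wordSize, which is what
  // the 32-bit lea below computes.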
1609 
1610 #ifdef _LP64
1611   // We've pushed one address, correct last_Java_sp
1612   lea(rax, Address(rsp, wordSize));
1613 #else
1614   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1615 #endif // LP64
1616 
1617   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1618 
1619 }
1620 
1621 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1622 void MacroAssembler::call_VM_leaf0(address entry_point) {
1623   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1624 }
1625 
1626 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1627   call_VM_leaf_base(entry_point, number_of_arguments);
1628 }
1629 
1630 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1631   pass_arg0(this, arg_0);
1632   call_VM_leaf(entry_point, 1);
1633 }
1634 
1635 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1636 
1637   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1638   pass_arg1(this, arg_1);
1639   pass_arg0(this, arg_0);
1640   call_VM_leaf(entry_point, 2);
1641 }
1642 
1643 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1644   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1645   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1646   pass_arg2(this, arg_2);
1647   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1648   pass_arg1(this, arg_1);
1649   pass_arg0(this, arg_0);
1650   call_VM_leaf(entry_point, 3);
1651 }
1652 
1653 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1654   pass_arg0(this, arg_0);
1655   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1656 }
1657 
1658 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1659 
1660   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1661   pass_arg1(this, arg_1);
1662   pass_arg0(this, arg_0);
1663   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1664 }
1665 
1666 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1667   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1668   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1669   pass_arg2(this, arg_2);
1670   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1671   pass_arg1(this, arg_1);
1672   pass_arg0(this, arg_0);
1673   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1674 }
1675 
1676 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1677   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1678   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1679   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1680   pass_arg3(this, arg_3);
1681   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1682   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1683   pass_arg2(this, arg_2);
1684   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1685   pass_arg1(this, arg_1);
1686   pass_arg0(this, arg_0);
1687   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1688 }
1689 
1690 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1691   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1692   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1693   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1694 }
1695 
1696 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1697   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1698   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1699 }
1700 
1701 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1702 }
1703 
1704 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1705 }
1706 
1707 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1708   if (reachable(src1)) {
1709     cmpl(as_Address(src1), imm);
1710   } else {
1711     lea(rscratch1, src1);
1712     cmpl(Address(rscratch1, 0), imm);
1713   }
1714 }
1715 
1716 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1717   assert(!src2.is_lval(), "use cmpptr");
1718   if (reachable(src2)) {
1719     cmpl(src1, as_Address(src2));
1720   } else {
1721     lea(rscratch1, src2);
1722     cmpl(src1, Address(rscratch1, 0));
1723   }
1724 }
1725 
1726 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1727   Assembler::cmpl(src1, imm);
1728 }
1729 
1730 void MacroAssembler::cmp32(Register src1, Address src2) {
1731   Assembler::cmpl(src1, src2);
1732 }
1733 
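// Materialize the result of an SSE double compare as an int in dst:
// -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2. An unordered result
// (NaN in either operand, signalled by the parity flag after ucomisd) becomes
// -1 when unordered_is_less and +1 otherwise, i.e. the dcmpl/dcmpg convention.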
1734 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1735   ucomisd(opr1, opr2);
1736 
1737   Label L;
1738   if (unordered_is_less) {
1739     movl(dst, -1);
1740     jcc(Assembler::parity, L);
1741     jcc(Assembler::below , L);
1742     movl(dst, 0);
1743     jcc(Assembler::equal , L);
1744     increment(dst);
1745   } else { // unordered is greater
1746     movl(dst, 1);
1747     jcc(Assembler::parity, L);
1748     jcc(Assembler::above , L);
1749     movl(dst, 0);
1750     jcc(Assembler::equal , L);
1751     decrementl(dst);
1752   }
1753   bind(L);
1754 }
1755 
1756 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1757   ucomiss(opr1, opr2);
1758 
1759   Label L;
1760   if (unordered_is_less) {
1761     movl(dst, -1);
1762     jcc(Assembler::parity, L);
1763     jcc(Assembler::below , L);
1764     movl(dst, 0);
1765     jcc(Assembler::equal , L);
1766     increment(dst);
1767   } else { // unordered is greater
1768     movl(dst, 1);
1769     jcc(Assembler::parity, L);
1770     jcc(Assembler::above , L);
1771     movl(dst, 0);
1772     jcc(Assembler::equal , L);
1773     decrementl(dst);
1774   }
1775   bind(L);
1776 }
1777 
1778 
1779 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1780   if (reachable(src1)) {
1781     cmpb(as_Address(src1), imm);
1782   } else {
1783     lea(rscratch1, src1);
1784     cmpb(Address(rscratch1, 0), imm);
1785   }
1786 }
1787 
1788 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1789 #ifdef _LP64
1790   if (src2.is_lval()) {
1791     movptr(rscratch1, src2);
1792     Assembler::cmpq(src1, rscratch1);
1793   } else if (reachable(src2)) {
1794     cmpq(src1, as_Address(src2));
1795   } else {
1796     lea(rscratch1, src2);
1797     Assembler::cmpq(src1, Address(rscratch1, 0));
1798   }
1799 #else
1800   if (src2.is_lval()) {
1801     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1802   } else {
1803     cmpl(src1, as_Address(src2));
1804   }
1805 #endif // _LP64
1806 }
1807 
1808 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1809   assert(src2.is_lval(), "not a mem-mem compare");
1810 #ifdef _LP64
1811   // moves src2's literal address
1812   movptr(rscratch1, src2);
1813   Assembler::cmpq(src1, rscratch1);
1814 #else
1815   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1816 #endif // _LP64
1817 }
1818 
1819 void MacroAssembler::cmpoop(Register src1, Register src2) {
1820   cmpptr(src1, src2);
1821 }
1822 
1823 void MacroAssembler::cmpoop(Register src1, Address src2) {
1824   cmpptr(src1, src2);
1825 }
1826 
1827 #ifdef _LP64
1828 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1829   movoop(rscratch1, src2);
1830   cmpptr(src1, rscratch1);
1831 }
1832 #endif
1833 
1834 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1835   if (reachable(adr)) {
1836     lock();
1837     cmpxchgptr(reg, as_Address(adr));
1838   } else {
1839     lea(rscratch1, adr);
1840     lock();
1841     cmpxchgptr(reg, Address(rscratch1, 0));
1842   }
1843 }
1844 
1845 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1846   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1847 }
1848 
1849 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1850   if (reachable(src)) {
1851     Assembler::comisd(dst, as_Address(src));
1852   } else {
1853     lea(rscratch1, src);
1854     Assembler::comisd(dst, Address(rscratch1, 0));
1855   }
1856 }
1857 
1858 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1859   if (reachable(src)) {
1860     Assembler::comiss(dst, as_Address(src));
1861   } else {
1862     lea(rscratch1, src);
1863     Assembler::comiss(dst, Address(rscratch1, 0));
1864   }
1865 }
1866 
1867 
1868 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1869   Condition negated_cond = negate_condition(cond);
1870   Label L;
1871   jcc(negated_cond, L);
1872   pushf(); // Preserve flags
1873   atomic_incl(counter_addr);
1874   popf();
1875   bind(L);
1876 }
1877 
1878 int MacroAssembler::corrected_idivl(Register reg) {
1879   // Full implementation of Java idiv and irem; checks for
1880   // special case as described in JVM spec., p.243 & p.271.
1881   // The function returns the (pc) offset of the idivl
1882   // instruction - may be needed for implicit exceptions.
1883   //
1884   //         normal case                           special case
1885   //
1886   // input : rax: dividend                          min_int
1887   //         reg: divisor   (may not be rax/rdx)    -1
1888   //
1889   // output: rax: quotient  (= rax idiv reg)        min_int
1890   //         rdx: remainder (= rax irem reg)        0
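  //
  // Background on the special case: a hardware idiv of min_int by -1 raises a
  // divide-error (#DE) fault because the quotient 2^31 does not fit in 32 bits,
  // while the JVM spec requires min_int / -1 == min_int and min_int % -1 == 0,
  // hence the explicit check below.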
1891   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1892   const int min_int = 0x80000000;
1893   Label normal_case, special_case;
1894 
1895   // check for special case
1896   cmpl(rax, min_int);
1897   jcc(Assembler::notEqual, normal_case);
1898   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1899   cmpl(reg, -1);
1900   jcc(Assembler::equal, special_case);
1901 
1902   // handle normal case
1903   bind(normal_case);
1904   cdql();
1905   int idivl_offset = offset();
1906   idivl(reg);
1907 
1908   // normal and special case exit
1909   bind(special_case);
1910 
1911   return idivl_offset;
1912 }
1913 
1914 
1915 
1916 void MacroAssembler::decrementl(Register reg, int value) {
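  // value == min_jint is special-cased because negating it overflows (it stays
  // min_jint in two's complement), so we cannot forward to incrementl(reg, -value).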
1917   if (value == min_jint) {subl(reg, value) ; return; }
1918   if (value <  0) { incrementl(reg, -value); return; }
1919   if (value == 0) {                        ; return; }
1920   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1921   /* else */      { subl(reg, value)       ; return; }
1922 }
1923 
1924 void MacroAssembler::decrementl(Address dst, int value) {
1925   if (value == min_jint) {subl(dst, value) ; return; }
1926   if (value <  0) { incrementl(dst, -value); return; }
1927   if (value == 0) {                        ; return; }
1928   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1929   /* else */      { subl(dst, value)       ; return; }
1930 }
1931 
1932 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1933   assert (shift_value > 0, "illegal shift value");
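  // Arithmetic right shift rounds toward negative infinity, while Java integer
  // division truncates toward zero, so a negative dividend is biased by
  // (2^shift_value - 1) first. For example, -7 / 4: (-7 + 3) >> 2 == -1
  // (correct), whereas a plain -7 >> 2 == -2 would be wrong.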
1934   Label _is_positive;
1935   testl (reg, reg);
1936   jcc (Assembler::positive, _is_positive);
1937   int offset = (1 << shift_value) - 1 ;
1938 
1939   if (offset == 1) {
1940     incrementl(reg);
1941   } else {
1942     addl(reg, offset);
1943   }
1944 
1945   bind (_is_positive);
1946   sarl(reg, shift_value);
1947 }
1948 
1949 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1950   if (reachable(src)) {
1951     Assembler::divsd(dst, as_Address(src));
1952   } else {
1953     lea(rscratch1, src);
1954     Assembler::divsd(dst, Address(rscratch1, 0));
1955   }
1956 }
1957 
1958 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1959   if (reachable(src)) {
1960     Assembler::divss(dst, as_Address(src));
1961   } else {
1962     lea(rscratch1, src);
1963     Assembler::divss(dst, Address(rscratch1, 0));
1964   }
1965 }
1966 
1967 void MacroAssembler::enter() {
1968   push(rbp);
1969   mov(rbp, rsp);
1970 }
1971 
1972 // A 5 byte nop that is safe for patching (see patch_verified_entry)
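// (The fallback below emits four segment-override prefixes followed by a
// one-byte nop; prefixes have no effect on nop, so the CPU decodes the whole
// 5-byte sequence as a single instruction.)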
1973 void MacroAssembler::fat_nop() {
1974   if (UseAddressNop) {
1975     addr_nop_5();
1976   } else {
1977     emit_int8(0x26); // es:
1978     emit_int8(0x2e); // cs:
1979     emit_int8(0x64); // fs:
1980     emit_int8(0x65); // gs:
1981     emit_int8((unsigned char)0x90);
1982   }
1983 }
1984 
1985 #ifndef _LP64
1986 void MacroAssembler::fcmp(Register tmp) {
1987   fcmp(tmp, 1, true, true);
1988 }
1989 
1990 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
1991   assert(!pop_right || pop_left, "usage error");
1992   if (VM_Version::supports_cmov()) {
1993     assert(tmp == noreg, "unneeded temp");
1994     if (pop_left) {
1995       fucomip(index);
1996     } else {
1997       fucomi(index);
1998     }
1999     if (pop_right) {
2000       fpop();
2001     }
2002   } else {
2003     assert(tmp != noreg, "need temp");
2004     if (pop_left) {
2005       if (pop_right) {
2006         fcompp();
2007       } else {
2008         fcomp(index);
2009       }
2010     } else {
2011       fcom(index);
2012     }
2013     // convert FPU condition into eflags condition via rax,
2014     save_rax(tmp);
2015     fwait(); fnstsw_ax();
2016     sahf();
2017     restore_rax(tmp);
2018   }
2019   // condition codes set as follows:
2020   //
2021   // CF (corresponds to C0) if x < y
2022   // PF (corresponds to C2) if unordered
2023   // ZF (corresponds to C3) if x = y
2024 }
2025 
2026 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2027   fcmp2int(dst, unordered_is_less, 1, true, true);
2028 }
2029 
2030 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2031   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2032   Label L;
2033   if (unordered_is_less) {
2034     movl(dst, -1);
2035     jcc(Assembler::parity, L);
2036     jcc(Assembler::below , L);
2037     movl(dst, 0);
2038     jcc(Assembler::equal , L);
2039     increment(dst);
2040   } else { // unordered is greater
2041     movl(dst, 1);
2042     jcc(Assembler::parity, L);
2043     jcc(Assembler::above , L);
2044     movl(dst, 0);
2045     jcc(Assembler::equal , L);
2046     decrementl(dst);
2047   }
2048   bind(L);
2049 }
2050 
2051 void MacroAssembler::fld_d(AddressLiteral src) {
2052   fld_d(as_Address(src));
2053 }
2054 
2055 void MacroAssembler::fld_s(AddressLiteral src) {
2056   fld_s(as_Address(src));
2057 }
2058 
2059 void MacroAssembler::fldcw(AddressLiteral src) {
2060   Assembler::fldcw(as_Address(src));
2061 }
2062 
2063 void MacroAssembler::fpop() {
2064   ffree();
2065   fincstp();
2066 }
2067 
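// x87 remainder: fprem may deliver only a partial remainder, in which case it
// leaves C2 set in the FPU status word (visible as the parity flag after
// fnstsw/sahf), so the loop below retries until the reduction is complete.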
2068 void MacroAssembler::fremr(Register tmp) {
2069   save_rax(tmp);
2070   { Label L;
2071     bind(L);
2072     fprem();
2073     fwait(); fnstsw_ax();
2074     sahf();
2075     jcc(Assembler::parity, L);
2076   }
2077   restore_rax(tmp);
2078   // Result is in ST0.
2079   // Note: fxch & fpop to get rid of ST1
2080   // (otherwise FPU stack could overflow eventually)
2081   fxch(1);
2082   fpop();
2083 }
2084 
2085 void MacroAssembler::empty_FPU_stack() {
2086   if (VM_Version::supports_mmx()) {
2087     emms();
2088   } else {
2089     for (int i = 8; i-- > 0; ) ffree(i);
2090   }
2091 }
2092 #endif // !LP64
2093 
2094 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2095   if (reachable(src)) {
2096     Assembler::mulpd(dst, as_Address(src));
2097   } else {
2098     lea(rscratch1, src);
2099     Assembler::mulpd(dst, Address(rscratch1, 0));
2100   }
2101 }
2102 
2103 void MacroAssembler::load_float(Address src) {
2104 #ifdef _LP64
2105   movflt(xmm0, src);
2106 #else
2107   if (UseSSE >= 1) {
2108     movflt(xmm0, src);
2109   } else {
2110     fld_s(src);
2111   }
2112 #endif // LP64
2113 }
2114 
2115 void MacroAssembler::store_float(Address dst) {
2116 #ifdef _LP64
2117   movflt(dst, xmm0);
2118 #else
2119   if (UseSSE >= 1) {
2120     movflt(dst, xmm0);
2121   } else {
2122     fstp_s(dst);
2123   }
2124 #endif // LP64
2125 }
2126 
2127 void MacroAssembler::load_double(Address src) {
2128 #ifdef _LP64
2129   movdbl(xmm0, src);
2130 #else
2131   if (UseSSE >= 2) {
2132     movdbl(xmm0, src);
2133   } else {
2134     fld_d(src);
2135   }
2136 #endif // LP64
2137 }
2138 
2139 void MacroAssembler::store_double(Address dst) {
2140 #ifdef _LP64
2141   movdbl(dst, xmm0);
2142 #else
2143   if (UseSSE >= 2) {
2144     movdbl(dst, xmm0);
2145   } else {
2146     fstp_d(dst);
2147   }
2148 #endif // LP64
2149 }
2150 
2151 // dst = c = a * b + c
2152 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2153   Assembler::vfmadd231sd(c, a, b);
2154   if (dst != c) {
2155     movdbl(dst, c);
2156   }
2157 }
2158 
2159 // dst = c = a * b + c
2160 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2161   Assembler::vfmadd231ss(c, a, b);
2162   if (dst != c) {
2163     movflt(dst, c);
2164   }
2165 }
2166 
2167 // dst = c = a * b + c
2168 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2169   Assembler::vfmadd231pd(c, a, b, vector_len);
2170   if (dst != c) {
2171     vmovdqu(dst, c);
2172   }
2173 }
2174 
2175 // dst = c = a * b + c
2176 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2177   Assembler::vfmadd231ps(c, a, b, vector_len);
2178   if (dst != c) {
2179     vmovdqu(dst, c);
2180   }
2181 }
2182 
2183 // dst = c = a * b + c
2184 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2185   Assembler::vfmadd231pd(c, a, b, vector_len);
2186   if (dst != c) {
2187     vmovdqu(dst, c);
2188   }
2189 }
2190 
2191 // dst = c = a * b + c
2192 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2193   Assembler::vfmadd231ps(c, a, b, vector_len);
2194   if (dst != c) {
2195     vmovdqu(dst, c);
2196   }
2197 }
2198 
2199 void MacroAssembler::incrementl(AddressLiteral dst) {
2200   if (reachable(dst)) {
2201     incrementl(as_Address(dst));
2202   } else {
2203     lea(rscratch1, dst);
2204     incrementl(Address(rscratch1, 0));
2205   }
2206 }
2207 
2208 void MacroAssembler::incrementl(ArrayAddress dst) {
2209   incrementl(as_Address(dst));
2210 }
2211 
2212 void MacroAssembler::incrementl(Register reg, int value) {
2213   if (value == min_jint) {addl(reg, value) ; return; }
2214   if (value <  0) { decrementl(reg, -value); return; }
2215   if (value == 0) {                        ; return; }
2216   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2217   /* else */      { addl(reg, value)       ; return; }
2218 }
2219 
2220 void MacroAssembler::incrementl(Address dst, int value) {
2221   if (value == min_jint) {addl(dst, value) ; return; }
2222   if (value <  0) { decrementl(dst, -value); return; }
2223   if (value == 0) {                        ; return; }
2224   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2225   /* else */      { addl(dst, value)       ; return; }
2226 }
2227 
2228 void MacroAssembler::jump(AddressLiteral dst) {
2229   if (reachable(dst)) {
2230     jmp_literal(dst.target(), dst.rspec());
2231   } else {
2232     lea(rscratch1, dst);
2233     jmp(rscratch1);
2234   }
2235 }
2236 
2237 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2238   if (reachable(dst)) {
2239     InstructionMark im(this);
2240     relocate(dst.reloc());
2241     const int short_size = 2;
2242     const int long_size = 6;
2243     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
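    // offs is measured from the start of the branch instruction; the encoded
    // displacement is relative to the end of the instruction, hence the
    // short_size/long_size adjustments below.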
2244     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2245       // 0111 tttn #8-bit disp
2246       emit_int8(0x70 | cc);
2247       emit_int8((offs - short_size) & 0xFF);
2248     } else {
2249       // 0000 1111 1000 tttn #32-bit disp
2250       emit_int8(0x0F);
2251       emit_int8((unsigned char)(0x80 | cc));
2252       emit_int32(offs - long_size);
2253     }
2254   } else {
2255 #ifdef ASSERT
2256     warning("reversing conditional branch");
2257 #endif /* ASSERT */
2258     Label skip;
2259     jccb(reverse[cc], skip);
2260     lea(rscratch1, dst);
2261     Assembler::jmp(rscratch1);
2262     bind(skip);
2263   }
2264 }
2265 
2266 void MacroAssembler::fld_x(AddressLiteral src) {
2267   Assembler::fld_x(as_Address(src));
2268 }
2269 
2270 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2271   if (reachable(src)) {
2272     Assembler::ldmxcsr(as_Address(src));
2273   } else {
2274     lea(rscratch1, src);
2275     Assembler::ldmxcsr(Address(rscratch1, 0));
2276   }
2277 }
2278 
2279 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2280   int off;
2281   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2282     off = offset();
2283     movsbl(dst, src); // movsxb
2284   } else {
2285     off = load_unsigned_byte(dst, src);
2286     shll(dst, 24);
2287     sarl(dst, 24);
2288   }
2289   return off;
2290 }
2291 
2292 // Note: load_signed_short used to be called load_signed_word.
2293 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2294 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2295 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2296 int MacroAssembler::load_signed_short(Register dst, Address src) {
2297   int off;
2298   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2299     // A sign-extending 16 => 64 bit load would also seem safe, but this is
2300     // what 64bit has always done, which implies that callers only rely on
2301     // the low 32 bits.
2302     off = offset();
2303     movswl(dst, src); // movsxw
2304   } else {
2305     off = load_unsigned_short(dst, src);
2306     shll(dst, 16);
2307     sarl(dst, 16);
2308   }
2309   return off;
2310 }
2311 
2312 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2313   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
2314   // and "3.9 Partial Register Penalties", p. 22).
2315   int off;
2316   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2317     off = offset();
2318     movzbl(dst, src); // movzxb
2319   } else {
2320     xorl(dst, dst);
2321     off = offset();
2322     movb(dst, src);
2323   }
2324   return off;
2325 }
2326 
2327 // Note: load_unsigned_short used to be called load_unsigned_word.
2328 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2329   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
2330   // and "3.9 Partial Register Penalties", p. 22).
2331   int off;
2332   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2333     off = offset();
2334     movzwl(dst, src); // movzxw
2335   } else {
2336     xorl(dst, dst);
2337     off = offset();
2338     movw(dst, src);
2339   }
2340   return off;
2341 }
2342 
2343 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2344   switch (size_in_bytes) {
2345 #ifndef _LP64
2346   case  8:
2347     assert(dst2 != noreg, "second dest register required");
2348     movl(dst,  src);
2349     movl(dst2, src.plus_disp(BytesPerInt));
2350     break;
2351 #else
2352   case  8:  movq(dst, src); break;
2353 #endif
2354   case  4:  movl(dst, src); break;
2355   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2356   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2357   default:  ShouldNotReachHere();
2358   }
2359 }
2360 
2361 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2362   switch (size_in_bytes) {
2363 #ifndef _LP64
2364   case  8:
2365     assert(src2 != noreg, "second source register required");
2366     movl(dst,                        src);
2367     movl(dst.plus_disp(BytesPerInt), src2);
2368     break;
2369 #else
2370   case  8:  movq(dst, src); break;
2371 #endif
2372   case  4:  movl(dst, src); break;
2373   case  2:  movw(dst, src); break;
2374   case  1:  movb(dst, src); break;
2375   default:  ShouldNotReachHere();
2376   }
2377 }
2378 
2379 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2380   if (reachable(dst)) {
2381     movl(as_Address(dst), src);
2382   } else {
2383     lea(rscratch1, dst);
2384     movl(Address(rscratch1, 0), src);
2385   }
2386 }
2387 
2388 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2389   if (reachable(src)) {
2390     movl(dst, as_Address(src));
2391   } else {
2392     lea(rscratch1, src);
2393     movl(dst, Address(rscratch1, 0));
2394   }
2395 }
2396 
2397 // C++ bool manipulation
2398 
2399 void MacroAssembler::movbool(Register dst, Address src) {
2400   if(sizeof(bool) == 1)
2401     movb(dst, src);
2402   else if(sizeof(bool) == 2)
2403     movw(dst, src);
2404   else if(sizeof(bool) == 4)
2405     movl(dst, src);
2406   else
2407     // unsupported
2408     ShouldNotReachHere();
2409 }
2410 
2411 void MacroAssembler::movbool(Address dst, bool boolconst) {
2412   if(sizeof(bool) == 1)
2413     movb(dst, (int) boolconst);
2414   else if(sizeof(bool) == 2)
2415     movw(dst, (int) boolconst);
2416   else if(sizeof(bool) == 4)
2417     movl(dst, (int) boolconst);
2418   else
2419     // unsupported
2420     ShouldNotReachHere();
2421 }
2422 
2423 void MacroAssembler::movbool(Address dst, Register src) {
2424   if(sizeof(bool) == 1)
2425     movb(dst, src);
2426   else if(sizeof(bool) == 2)
2427     movw(dst, src);
2428   else if(sizeof(bool) == 4)
2429     movl(dst, src);
2430   else
2431     // unsupported
2432     ShouldNotReachHere();
2433 }
2434 
2435 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2436   movb(as_Address(dst), src);
2437 }
2438 
2439 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2440   if (reachable(src)) {
2441     movdl(dst, as_Address(src));
2442   } else {
2443     lea(rscratch1, src);
2444     movdl(dst, Address(rscratch1, 0));
2445   }
2446 }
2447 
2448 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2449   if (reachable(src)) {
2450     movq(dst, as_Address(src));
2451   } else {
2452     lea(rscratch1, src);
2453     movq(dst, Address(rscratch1, 0));
2454   }
2455 }
2456 
2457 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2458   if (reachable(src)) {
2459     if (UseXmmLoadAndClearUpper) {
2460       movsd (dst, as_Address(src));
2461     } else {
2462       movlpd(dst, as_Address(src));
2463     }
2464   } else {
2465     lea(rscratch1, src);
2466     if (UseXmmLoadAndClearUpper) {
2467       movsd (dst, Address(rscratch1, 0));
2468     } else {
2469       movlpd(dst, Address(rscratch1, 0));
2470     }
2471   }
2472 }
2473 
2474 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2475   if (reachable(src)) {
2476     movss(dst, as_Address(src));
2477   } else {
2478     lea(rscratch1, src);
2479     movss(dst, Address(rscratch1, 0));
2480   }
2481 }
2482 
2483 void MacroAssembler::movptr(Register dst, Register src) {
2484   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2485 }
2486 
2487 void MacroAssembler::movptr(Register dst, Address src) {
2488   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2489 }
2490 
2491 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2492 void MacroAssembler::movptr(Register dst, intptr_t src) {
2493   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2494 }
2495 
2496 void MacroAssembler::movptr(Address dst, Register src) {
2497   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2498 }
2499 
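// Note on the encoding asserts below (our reading of the checks, not from the
// original comments): xmm16-xmm31 exist only on AVX-512 hardware and require
// an EVEX encoding, which for these move forms needs AVX512VL, so plain
// SSE/AVX encodings are restricted to xmm0-xmm15.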
2500 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2501     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2502     Assembler::movdqu(dst, src);
2503 }
2504 
2505 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2506     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2507     Assembler::movdqu(dst, src);
2508 }
2509 
2510 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2511     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2512     Assembler::movdqu(dst, src);
2513 }
2514 
2515 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2516   if (reachable(src)) {
2517     movdqu(dst, as_Address(src));
2518   } else {
2519     lea(scratchReg, src);
2520     movdqu(dst, Address(scratchReg, 0));
2521   }
2522 }
2523 
2524 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2525     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2526     Assembler::vmovdqu(dst, src);
2527 }
2528 
2529 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2530     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2531     Assembler::vmovdqu(dst, src);
2532 }
2533 
2534 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2535     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2536     Assembler::vmovdqu(dst, src);
2537 }
2538 
2539 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2540   if (reachable(src)) {
2541     vmovdqu(dst, as_Address(src));
2542   }
2543   else {
2544     lea(scratch_reg, src);
2545     vmovdqu(dst, Address(scratch_reg, 0));
2546   }
2547 }
2548 
2549 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) {
2550   assert(vector_len <= AVX_256bit, "AVX2 vector length");
2551   if (vector_len == AVX_256bit) {
2552     vmovdqu(dst, src, scratch_reg);
2553   } else {
2554     movdqu(dst, src, scratch_reg);
2555   }
2556 }
2557 
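// Generic opmask moves: with AVX-512BW the full 64-bit mask moves (kmovq) are
// available and used; otherwise at most 16 mask bits are in play and the
// word-sized kmovw (available with base AVX-512/EVEX) suffices.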
2558 void MacroAssembler::kmov(KRegister dst, Address src) {
2559   if (VM_Version::supports_avx512bw()) {
2560     kmovql(dst, src);
2561   } else {
2562     assert(VM_Version::supports_evex(), "");
2563     kmovwl(dst, src);
2564   }
2565 }
2566 
2567 void MacroAssembler::kmov(Address dst, KRegister src) {
2568   if (VM_Version::supports_avx512bw()) {
2569     kmovql(dst, src);
2570   } else {
2571     assert(VM_Version::supports_evex(), "");
2572     kmovwl(dst, src);
2573   }
2574 }
2575 
2576 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2577   if (VM_Version::supports_avx512bw()) {
2578     kmovql(dst, src);
2579   } else {
2580     assert(VM_Version::supports_evex(), "");
2581     kmovwl(dst, src);
2582   }
2583 }
2584 
2585 void MacroAssembler::kmov(Register dst, KRegister src) {
2586   if (VM_Version::supports_avx512bw()) {
2587     kmovql(dst, src);
2588   } else {
2589     assert(VM_Version::supports_evex(), "");
2590     kmovwl(dst, src);
2591   }
2592 }
2593 
2594 void MacroAssembler::kmov(KRegister dst, Register src) {
2595   if (VM_Version::supports_avx512bw()) {
2596     kmovql(dst, src);
2597   } else {
2598     assert(VM_Version::supports_evex(), "");
2599     kmovwl(dst, src);
2600   }
2601 }
2602 
2603 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2604   if (reachable(src)) {
2605     kmovql(dst, as_Address(src));
2606   } else {
2607     lea(scratch_reg, src);
2608     kmovql(dst, Address(scratch_reg, 0));
2609   }
2610 }
2611 
2612 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2613   if (reachable(src)) {
2614     kmovwl(dst, as_Address(src));
2615   } else {
2616     lea(scratch_reg, src);
2617     kmovwl(dst, Address(scratch_reg, 0));
2618   }
2619 }
2620 
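// AddressLiteral forms of the masked EVEX moves. k0 cannot be encoded as an
// actual write-mask (in EVEX a mask field of 0 means "no masking"), so when
// the caller passes k0 the unmasked form is emitted instead.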
2621 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2622                                int vector_len, Register scratch_reg) {
2623   if (reachable(src)) {
2624     if (mask == k0) {
2625       Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2626     } else {
2627       Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2628     }
2629   } else {
2630     lea(scratch_reg, src);
2631     if (mask == k0) {
2632       Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2633     } else {
2634       Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2635     }
2636   }
2637 }
2638 
2639 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2640                                int vector_len, Register scratch_reg) {
2641   if (reachable(src)) {
2642     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2643   } else {
2644     lea(scratch_reg, src);
2645     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2646   }
2647 }
2648 
2649 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2650                                int vector_len, Register scratch_reg) {
2651   if (reachable(src)) {
2652     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2653   } else {
2654     lea(scratch_reg, src);
2655     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2656   }
2657 }
2658 
2659 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2660                                int vector_len, Register scratch_reg) {
2661   if (reachable(src)) {
2662     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2663   } else {
2664     lea(scratch_reg, src);
2665     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2666   }
2667 }
2668 
2669 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2670   if (reachable(src)) {
2671     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2672   } else {
2673     lea(rscratch, src);
2674     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2675   }
2676 }
2677 
2678 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2679   if (reachable(src)) {
2680     Assembler::movdqa(dst, as_Address(src));
2681   } else {
2682     lea(rscratch1, src);
2683     Assembler::movdqa(dst, Address(rscratch1, 0));
2684   }
2685 }
2686 
2687 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2688   if (reachable(src)) {
2689     Assembler::movsd(dst, as_Address(src));
2690   } else {
2691     lea(rscratch1, src);
2692     Assembler::movsd(dst, Address(rscratch1, 0));
2693   }
2694 }
2695 
2696 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2697   if (reachable(src)) {
2698     Assembler::movss(dst, as_Address(src));
2699   } else {
2700     lea(rscratch1, src);
2701     Assembler::movss(dst, Address(rscratch1, 0));
2702   }
2703 }
2704 
2705 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2706   if (reachable(src)) {
2707     Assembler::vmovddup(dst, as_Address(src), vector_len);
2708   } else {
2709     lea(rscratch, src);
2710     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2711   }
2712 }
2713 
2714 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2715   if (reachable(src)) {
2716     Assembler::mulsd(dst, as_Address(src));
2717   } else {
2718     lea(rscratch1, src);
2719     Assembler::mulsd(dst, Address(rscratch1, 0));
2720   }
2721 }
2722 
2723 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2724   if (reachable(src)) {
2725     Assembler::mulss(dst, as_Address(src));
2726   } else {
2727     lea(rscratch1, src);
2728     Assembler::mulss(dst, Address(rscratch1, 0));
2729   }
2730 }
2731 
2732 void MacroAssembler::null_check(Register reg, int offset) {
2733   if (needs_explicit_null_check(offset)) {
2734     // provoke OS NULL exception if reg = NULL by
2735     // accessing M[reg] w/o changing any (non-CC) registers
2736     // NOTE: cmpl is plenty here to provoke a segv
2737     cmpptr(rax, Address(reg, 0));
2738     // Note: should probably use testl(rax, Address(reg, 0));
2739     //       may be shorter code (however, this version of
2740     //       testl needs to be implemented first)
2741   } else {
2742     // nothing to do, (later) access of M[reg + offset]
2743     // will provoke OS NULL exception if reg = NULL
2744   }
2745 }
2746 
2747 void MacroAssembler::os_breakpoint() {
2748   // Instead of directly emitting a breakpoint, call os::breakpoint() for better debuggability
2749   // (e.g., MSVC can't call ps() otherwise)
2750   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2751 }
2752 
2753 void MacroAssembler::unimplemented(const char* what) {
2754   const char* buf = NULL;
2755   {
2756     ResourceMark rm;
2757     stringStream ss;
2758     ss.print("unimplemented: %s", what);
2759     buf = code_string(ss.as_string());
2760   }
2761   stop(buf);
2762 }
2763 
2764 #ifdef _LP64
2765 #define XSTATE_BV 0x200
2766 #endif
2767 
2768 void MacroAssembler::pop_CPU_state() {
2769   pop_FPU_state();
2770   pop_IU_state();
2771 }
2772 
2773 void MacroAssembler::pop_FPU_state() {
2774 #ifndef _LP64
2775   frstor(Address(rsp, 0));
2776 #else
2777   fxrstor(Address(rsp, 0));
2778 #endif
2779   addptr(rsp, FPUStateSizeInWords * wordSize);
2780 }
2781 
2782 void MacroAssembler::pop_IU_state() {
2783   popa();
2784   LP64_ONLY(addq(rsp, 8));
2785   popf();
2786 }
2787 
2788 // Save Integer and Float state
2789 // Warning: Stack must be 16 byte aligned (64bit)
2790 void MacroAssembler::push_CPU_state() {
2791   push_IU_state();
2792   push_FPU_state();
2793 }
2794 
2795 void MacroAssembler::push_FPU_state() {
2796   subptr(rsp, FPUStateSizeInWords * wordSize);
2797 #ifndef _LP64
2798   fnsave(Address(rsp, 0));
2799   fwait();
2800 #else
2801   fxsave(Address(rsp, 0));
2802 #endif // LP64
2803 }
2804 
2805 void MacroAssembler::push_IU_state() {
2806   // Push flags first because pusha kills them
2807   pushf();
2808   // Make sure rsp stays 16-byte aligned
2809   LP64_ONLY(subq(rsp, 8));
2810   pusha();
2811 }
2812 
2813 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
2814   if (!java_thread->is_valid()) {
2815     java_thread = rdi;
2816     get_thread(java_thread);
2817   }
2818   // we must set sp to zero to clear frame
2819   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2820   // must clear fp, so that compiled frames are not confused; it is
2821   // possible that we need it only for debugging
2822   if (clear_fp) {
2823     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2824   }
2825   // Always clear the pc because it could have been set by make_walkable()
2826   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2827   vzeroupper();
2828 }
2829 
2830 void MacroAssembler::restore_rax(Register tmp) {
2831   if (tmp == noreg) pop(rax);
2832   else if (tmp != rax) mov(rax, tmp);
2833 }
2834 
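// Rounds reg up to the next multiple of modulus (modulus must be a power of
// two for the mask below to work). For example, reg = 13, modulus = 8:
// 13 + 7 = 20, 20 & -8 = 16; a value that is already aligned is unchanged.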
2835 void MacroAssembler::round_to(Register reg, int modulus) {
2836   addptr(reg, modulus - 1);
2837   andptr(reg, -modulus);
2838 }
2839 
2840 void MacroAssembler::save_rax(Register tmp) {
2841   if (tmp == noreg) push(rax);
2842   else if (tmp != rax) mov(tmp, rax);
2843 }
2844 
2845 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
2846   if (at_return) {
2847     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2848     // we may safely use rsp instead to perform the stack watermark check.
2849     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
2850     jcc(Assembler::above, slow_path);
2851     return;
2852   }
2853   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2854   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2855 }
2856 
2857 // Calls to C land
2858 //
2859 // When entering C land, the rbp & rsp of the last Java frame have to be recorded
2860 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2861 // has to be reset to 0. This is required to allow proper stack traversal.
2862 void MacroAssembler::set_last_Java_frame(Register java_thread,
2863                                          Register last_java_sp,
2864                                          Register last_java_fp,
2865                                          address  last_java_pc) {
2866   vzeroupper();
2867   // determine java_thread register
2868   if (!java_thread->is_valid()) {
2869     java_thread = rdi;
2870     get_thread(java_thread);
2871   }
2872   // determine last_java_sp register
2873   if (!last_java_sp->is_valid()) {
2874     last_java_sp = rsp;
2875   }
2876 
2877   // last_java_fp is optional
2878 
2879   if (last_java_fp->is_valid()) {
2880     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2881   }
2882 
2883   // last_java_pc is optional
2884 
2885   if (last_java_pc != NULL) {
2886     lea(Address(java_thread,
2887                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
2888         InternalAddress(last_java_pc));
2889 
2890   }
2891   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2892 }
2893 
2894 void MacroAssembler::shlptr(Register dst, int imm8) {
2895   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
2896 }
2897 
2898 void MacroAssembler::shrptr(Register dst, int imm8) {
2899   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
2900 }
2901 
2902 void MacroAssembler::sign_extend_byte(Register reg) {
2903   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
2904     movsbl(reg, reg); // movsxb
2905   } else {
2906     shll(reg, 24);
2907     sarl(reg, 24);
2908   }
2909 }
2910 
2911 void MacroAssembler::sign_extend_short(Register reg) {
2912   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2913     movswl(reg, reg); // movsxw
2914   } else {
2915     shll(reg, 16);
2916     sarl(reg, 16);
2917   }
2918 }
2919 
2920 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2921   assert(reachable(src), "Address should be reachable");
2922   testl(dst, as_Address(src));
2923 }
2924 
2925 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2926   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2927   Assembler::pcmpeqb(dst, src);
2928 }
2929 
2930 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2931   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2932   Assembler::pcmpeqw(dst, src);
2933 }
2934 
2935 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2936   assert((dst->encoding() < 16),"XMM register should be 0-15");
2937   Assembler::pcmpestri(dst, src, imm8);
2938 }
2939 
2940 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2941   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2942   Assembler::pcmpestri(dst, src, imm8);
2943 }
2944 
2945 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2946   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2947   Assembler::pmovzxbw(dst, src);
2948 }
2949 
2950 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2951   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2952   Assembler::pmovzxbw(dst, src);
2953 }
2954 
2955 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2956   assert((src->encoding() < 16),"XMM register should be 0-15");
2957   Assembler::pmovmskb(dst, src);
2958 }
2959 
2960 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2961   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2962   Assembler::ptest(dst, src);
2963 }
2964 
2965 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
2966   if (reachable(src)) {
2967     Assembler::sqrtsd(dst, as_Address(src));
2968   } else {
2969     lea(rscratch1, src);
2970     Assembler::sqrtsd(dst, Address(rscratch1, 0));
2971   }
2972 }
2973 
2974 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
2975   if (reachable(src)) {
2976     Assembler::sqrtss(dst, as_Address(src));
2977   } else {
2978     lea(rscratch1, src);
2979     Assembler::sqrtss(dst, Address(rscratch1, 0));
2980   }
2981 }
2982 
2983 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
2984   if (reachable(src)) {
2985     Assembler::subsd(dst, as_Address(src));
2986   } else {
2987     lea(rscratch1, src);
2988     Assembler::subsd(dst, Address(rscratch1, 0));
2989   }
2990 }
2991 
2992 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
2993   if (reachable(src)) {
2994     Assembler::roundsd(dst, as_Address(src), rmode);
2995   } else {
2996     lea(scratch_reg, src);
2997     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
2998   }
2999 }
3000 
3001 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3002   if (reachable(src)) {
3003     Assembler::subss(dst, as_Address(src));
3004   } else {
3005     lea(rscratch1, src);
3006     Assembler::subss(dst, Address(rscratch1, 0));
3007   }
3008 }
3009 
3010 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3011   if (reachable(src)) {
3012     Assembler::ucomisd(dst, as_Address(src));
3013   } else {
3014     lea(rscratch1, src);
3015     Assembler::ucomisd(dst, Address(rscratch1, 0));
3016   }
3017 }
3018 
3019 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3020   if (reachable(src)) {
3021     Assembler::ucomiss(dst, as_Address(src));
3022   } else {
3023     lea(rscratch1, src);
3024     Assembler::ucomiss(dst, Address(rscratch1, 0));
3025   }
3026 }
3027 
3028 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3029   // Used in sign-bit flipping with aligned address.
3030   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3031   if (reachable(src)) {
3032     Assembler::xorpd(dst, as_Address(src));
3033   } else {
3034     lea(scratch_reg, src);
3035     Assembler::xorpd(dst, Address(scratch_reg, 0));
3036   }
3037 }
3038 
3039 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3040   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3041     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3042   }
3043   else {
3044     Assembler::xorpd(dst, src);
3045   }
3046 }
3047 
3048 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3049   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3050     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3051   } else {
3052     Assembler::xorps(dst, src);
3053   }
3054 }
3055 
3056 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3057   // Used in sign-bit flipping with aligned address.
3058   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3059   if (reachable(src)) {
3060     Assembler::xorps(dst, as_Address(src));
3061   } else {
3062     lea(scratch_reg, src);
3063     Assembler::xorps(dst, Address(scratch_reg, 0));
3064   }
3065 }
3066 
3067 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3068   // Used in sign-bit flipping with aligned address.
3069   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3070   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3071   if (reachable(src)) {
3072     Assembler::pshufb(dst, as_Address(src));
3073   } else {
3074     lea(rscratch1, src);
3075     Assembler::pshufb(dst, Address(rscratch1, 0));
3076   }
3077 }
3078 
3079 // AVX 3-operands instructions
3080 
3081 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3082   if (reachable(src)) {
3083     vaddsd(dst, nds, as_Address(src));
3084   } else {
3085     lea(rscratch1, src);
3086     vaddsd(dst, nds, Address(rscratch1, 0));
3087   }
3088 }
3089 
3090 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3091   if (reachable(src)) {
3092     vaddss(dst, nds, as_Address(src));
3093   } else {
3094     lea(rscratch1, src);
3095     vaddss(dst, nds, Address(rscratch1, 0));
3096   }
3097 }
3098 
3099 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3100   assert(UseAVX > 0, "requires some form of AVX");
3101   if (reachable(src)) {
3102     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3103   } else {
3104     lea(rscratch, src);
3105     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3106   }
3107 }
3108 
3109 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3110   assert(UseAVX > 0, "requires some form of AVX");
3111   if (reachable(src)) {
3112     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3113   } else {
3114     lea(rscratch, src);
3115     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3116   }
3117 }
3118 
3119 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3120   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3121   vandps(dst, nds, negate_field, vector_len);
3122 }
3123 
3124 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3125   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3126   vandpd(dst, nds, negate_field, vector_len);
3127 }
3128 
3129 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3130   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3131   Assembler::vpaddb(dst, nds, src, vector_len);
3132 }
3133 
3134 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3135   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3136   Assembler::vpaddb(dst, nds, src, vector_len);
3137 }
3138 
3139 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3140   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3141   Assembler::vpaddw(dst, nds, src, vector_len);
3142 }
3143 
3144 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3145   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3146   Assembler::vpaddw(dst, nds, src, vector_len);
3147 }
3148 
3149 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3150   if (reachable(src)) {
3151     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3152   } else {
3153     lea(scratch_reg, src);
3154     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3155   }
3156 }
3157 
3158 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3159   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3160   Assembler::vpbroadcastw(dst, src, vector_len);
3161 }
3162 
3163 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3164   if (reachable(src)) {
3165     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3166   } else {
3167     lea(rscratch, src);
3168     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3169   }
3170 }
3171 
3172 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3173   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3174   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3175 }
3176 
3177 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3178   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3179   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3180 }
3181 
3182 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3183                                AddressLiteral src, int vector_len, Register scratch_reg) {
3184   if (reachable(src)) {
3185     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3186   } else {
3187     lea(scratch_reg, src);
3188     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3189   }
3190 }
3191 
3192 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3193                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3194   if (reachable(src)) {
3195     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3196   } else {
3197     lea(scratch_reg, src);
3198     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3199   }
3200 }
3201 
3202 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3203                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3204   if (reachable(src)) {
3205     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3206   } else {
3207     lea(scratch_reg, src);
3208     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3209   }
3210 }
3211 
3212 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3213                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3214   if (reachable(src)) {
3215     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3216   } else {
3217     lea(scratch_reg, src);
3218     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3219   }
3220 }
3221 
3222 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3223                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3224   if (reachable(src)) {
3225     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3226   } else {
3227     lea(scratch_reg, src);
3228     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3229   }
3230 }
3231 
3232 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3233   if (width == Assembler::Q) {
3234     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3235   } else {
3236     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3237   }
3238 }
3239 
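     // Synthesize the full set of packed-compare predicates from the two encodings the
     // hardware provides (eq and gt): lt and nle reuse the gt compare (with operands
     // swapped for lt), while neq, le and nlt invert an eq/gt result by XOR-ing it with
     // an all-ones vector (vallones + vpxor), e.g. neq = eq ^ ~0.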
3240 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3241   int eq_cond_enc = 0x29;
3242   int gt_cond_enc = 0x37;
3243   if (width != Assembler::Q) {
3244     eq_cond_enc = 0x74 + width;
3245     gt_cond_enc = 0x64 + width;
3246   }
3247   switch (cond) {
3248   case eq:
3249     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3250     break;
3251   case neq:
3252     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3253     vallones(xtmp, vector_len);
3254     vpxor(dst, xtmp, dst, vector_len);
3255     break;
3256   case le:
3257     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3258     vallones(xtmp, vector_len);
3259     vpxor(dst, xtmp, dst, vector_len);
3260     break;
3261   case nlt:
3262     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3263     vallones(xtmp, vector_len);
3264     vpxor(dst, xtmp, dst, vector_len);
3265     break;
3266   case lt:
3267     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3268     break;
3269   case nle:
3270     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3271     break;
3272   default:
3273     assert(false, "Should not reach here");
3274   }
3275 }
3276 
3277 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3278   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3279   Assembler::vpmovzxbw(dst, src, vector_len);
3280 }
3281 
3282 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3283   assert((src->encoding() < 16),"XMM register should be 0-15");
3284   Assembler::vpmovmskb(dst, src, vector_len);
3285 }
3286 
3287 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3288   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3289   Assembler::vpmullw(dst, nds, src, vector_len);
3290 }
3291 
3292 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3293   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3294   Assembler::vpmullw(dst, nds, src, vector_len);
3295 }
3296 
3297 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3298   assert((UseAVX > 0), "AVX support is needed");
3299   if (reachable(src)) {
3300     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3301   } else {
3302     lea(scratch_reg, src);
3303     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3304   }
3305 }
3306 
3307 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3308   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3309   Assembler::vpsubb(dst, nds, src, vector_len);
3310 }
3311 
3312 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3313   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3314   Assembler::vpsubb(dst, nds, src, vector_len);
3315 }
3316 
3317 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3318   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3319   Assembler::vpsubw(dst, nds, src, vector_len);
3320 }
3321 
3322 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3323   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3324   Assembler::vpsubw(dst, nds, src, vector_len);
3325 }
3326 
3327 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3328   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3329   Assembler::vpsraw(dst, nds, shift, vector_len);
3330 }
3331 
3332 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3333   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3334   Assembler::vpsraw(dst, nds, shift, vector_len);
3335 }
3336 
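     // evpsraq is an AVX-512 instruction; without AVX512VL only the 512-bit form is
     // available, so narrower requests are widened to vector_len == 2 (512 bits) below.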
3337 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3338   assert(UseAVX > 2,"");
3339   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3340      vector_len = 2;
3341   }
3342   Assembler::evpsraq(dst, nds, shift, vector_len);
3343 }
3344 
3345 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3346   assert(UseAVX > 2,"");
3347   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3348      vector_len = 2;
3349   }
3350   Assembler::evpsraq(dst, nds, shift, vector_len);
3351 }
3352 
3353 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3354   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3355   Assembler::vpsrlw(dst, nds, shift, vector_len);
3356 }
3357 
3358 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3359   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3360   Assembler::vpsrlw(dst, nds, shift, vector_len);
3361 }
3362 
3363 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3364   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3365   Assembler::vpsllw(dst, nds, shift, vector_len);
3366 }
3367 
3368 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3369   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3370   Assembler::vpsllw(dst, nds, shift, vector_len);
3371 }
3372 
3373 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3374   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3375   Assembler::vptest(dst, src);
3376 }
3377 
3378 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3379   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3380   Assembler::punpcklbw(dst, src);
3381 }
3382 
3383 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3384   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3385   Assembler::pshufd(dst, src, mode);
3386 }
3387 
3388 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3389   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3390   Assembler::pshuflw(dst, src, mode);
3391 }
3392 
3393 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3394   if (reachable(src)) {
3395     vandpd(dst, nds, as_Address(src), vector_len);
3396   } else {
3397     lea(scratch_reg, src);
3398     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3399   }
3400 }
3401 
3402 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3403   if (reachable(src)) {
3404     vandps(dst, nds, as_Address(src), vector_len);
3405   } else {
3406     lea(scratch_reg, src);
3407     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3408   }
3409 }
3410 
3411 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3412                             bool merge, int vector_len, Register scratch_reg) {
3413   if (reachable(src)) {
3414     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3415   } else {
3416     lea(scratch_reg, src);
3417     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3418   }
3419 }
3420 
3421 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3422   if (reachable(src)) {
3423     vdivsd(dst, nds, as_Address(src));
3424   } else {
3425     lea(rscratch1, src);
3426     vdivsd(dst, nds, Address(rscratch1, 0));
3427   }
3428 }
3429 
3430 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3431   if (reachable(src)) {
3432     vdivss(dst, nds, as_Address(src));
3433   } else {
3434     lea(rscratch1, src);
3435     vdivss(dst, nds, Address(rscratch1, 0));
3436   }
3437 }
3438 
3439 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3440   if (reachable(src)) {
3441     vmulsd(dst, nds, as_Address(src));
3442   } else {
3443     lea(rscratch1, src);
3444     vmulsd(dst, nds, Address(rscratch1, 0));
3445   }
3446 }
3447 
3448 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3449   if (reachable(src)) {
3450     vmulss(dst, nds, as_Address(src));
3451   } else {
3452     lea(rscratch1, src);
3453     vmulss(dst, nds, Address(rscratch1, 0));
3454   }
3455 }
3456 
3457 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3458   if (reachable(src)) {
3459     vsubsd(dst, nds, as_Address(src));
3460   } else {
3461     lea(rscratch1, src);
3462     vsubsd(dst, nds, Address(rscratch1, 0));
3463   }
3464 }
3465 
3466 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3467   if (reachable(src)) {
3468     vsubss(dst, nds, as_Address(src));
3469   } else {
3470     lea(rscratch1, src);
3471     vsubss(dst, nds, Address(rscratch1, 0));
3472   }
3473 }
3474 
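     // vnegatess/vnegatesd flip the sign bit by XOR-ing with a mask; src is expected to
     // be an AddressLiteral pointing at the matching float/double sign-flip constant.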
3475 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3476   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3477   vxorps(dst, nds, src, Assembler::AVX_128bit);
3478 }
3479 
3480 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3481   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3482   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3483 }
3484 
3485 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3486   if (reachable(src)) {
3487     vxorpd(dst, nds, as_Address(src), vector_len);
3488   } else {
3489     lea(scratch_reg, src);
3490     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3491   }
3492 }
3493 
3494 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3495   if (reachable(src)) {
3496     vxorps(dst, nds, as_Address(src), vector_len);
3497   } else {
3498     lea(scratch_reg, src);
3499     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3500   }
3501 }
3502 
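     // AVX1 has no 256-bit integer vpxor, so a 256-bit request on AVX1-only hardware
     // falls back to the bitwise-equivalent floating-point XOR (vxorpd).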
3503 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3504   if (UseAVX > 1 || (vector_len < 1)) {
3505     if (reachable(src)) {
3506       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3507     } else {
3508       lea(scratch_reg, src);
3509       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3510     }
3511   }
3512   else {
3513     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3514   }
3515 }
3516 
3517 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3518   if (reachable(src)) {
3519     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3520   } else {
3521     lea(scratch_reg, src);
3522     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3523   }
3524 }
3525 
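     // JNI handle tagging: the low bit of a handle (JNIHandles::weak_tag_mask) marks it
     // as a jweak. clear_jweak_tag strips that bit; resolve_jobject below tests it to
     // choose between a phantom-ref load (jweak) and a plain IN_NATIVE load (jobject).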
3526 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3527   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3528   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3529   // The inverted mask is sign-extended
3530   andptr(possibly_jweak, inverted_jweak_mask);
3531 }
3532 
3533 void MacroAssembler::resolve_jobject(Register value,
3534                                      Register thread,
3535                                      Register tmp) {
3536   assert_different_registers(value, thread, tmp);
3537   Label done, not_weak;
3538   testptr(value, value);
3539   jcc(Assembler::zero, done);                // Use NULL as-is.
3540   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3541   jcc(Assembler::zero, not_weak);
3542   // Resolve jweak.
3543   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3544                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3545   verify_oop(value);
3546   jmp(done);
3547   bind(not_weak);
3548   // Resolve (untagged) jobject.
3549   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3550   verify_oop(value);
3551   bind(done);
3552 }
3553 
3554 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3555   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3556 }
3557 
3558 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3559 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3560   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3561 }
3562 
3563 void MacroAssembler::subptr(Register dst, Register src) {
3564   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3565 }
3566 
3567 // C++ bool manipulation
3568 void MacroAssembler::testbool(Register dst) {
3569   if (sizeof(bool) == 1)
3570     testb(dst, 0xff);
3571   else if (sizeof(bool) == 2) {
3572     // testw implementation needed for two byte bools
3573     ShouldNotReachHere();
3574   } else if (sizeof(bool) == 4)
3575     testl(dst, dst);
3576   else
3577     // unsupported
3578     ShouldNotReachHere();
3579 }
3580 
3581 void MacroAssembler::testptr(Register dst, Register src) {
3582   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3583 }
3584 
3585 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3586 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3587                                    Register var_size_in_bytes,
3588                                    int con_size_in_bytes,
3589                                    Register t1,
3590                                    Register t2,
3591                                    Label& slow_case) {
3592   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3593   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3594 }
3595 
3596 // Defines obj, preserves var_size_in_bytes
3597 void MacroAssembler::eden_allocate(Register thread, Register obj,
3598                                    Register var_size_in_bytes,
3599                                    int con_size_in_bytes,
3600                                    Register t1,
3601                                    Label& slow_case) {
3602   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3603   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3604 }
3605 
3606 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
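     // Each loop iteration clears 8 bytes (one word on 64-bit, two words on 32-bit);
     // after the initial shift, length_in_bytes is reused as a count of 8-byte chunks.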
3607 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3608   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3609   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3610   Label done;
3611 
3612   testptr(length_in_bytes, length_in_bytes);
3613   jcc(Assembler::zero, done);
3614 
3615   // initialize topmost word, divide index by 2, check if odd and test if zero
3616   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3617 #ifdef ASSERT
3618   {
3619     Label L;
3620     testptr(length_in_bytes, BytesPerWord - 1);
3621     jcc(Assembler::zero, L);
3622     stop("length must be a multiple of BytesPerWord");
3623     bind(L);
3624   }
3625 #endif
3626   Register index = length_in_bytes;
3627   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3628   if (UseIncDec) {
3629     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
3630   } else {
3631     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3632     shrptr(index, 1);
3633   }
3634 #ifndef _LP64
3635   // index might not have been a multiple of 8 (i.e., bit 2 was set)
3636   {
3637     Label even;
3638     // note: if index was a multiple of 8, then it cannot
3639     //       be 0 now, otherwise it must have been 0 before
3640     //       => if it is even, we don't need to check for 0 again
3641     jcc(Assembler::carryClear, even);
3642     // clear topmost word (no jump would be needed if conditional assignment worked here)
3643     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3644     // index could be 0 now, must check again
3645     jcc(Assembler::zero, done);
3646     bind(even);
3647   }
3648 #endif // !_LP64
3649   // initialize remaining object fields: index is a multiple of 2 now
3650   {
3651     Label loop;
3652     bind(loop);
3653     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3654     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3655     decrement(index);
3656     jcc(Assembler::notZero, loop);
3657   }
3658 
3659   bind(done);
3660 }
3661 
3662 // Look up the method for a megamorphic invokeinterface call.
3663 // The target method is determined by <intf_klass, itable_index>.
3664 // The receiver klass is in recv_klass.
3665 // On success, the result will be in method_result, and execution falls through.
3666 // On failure, execution transfers to the given label.
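     // The itable scan is peeled once: the first itableOffsetEntry is tested up front
     // with a short branch to found_method, and only a miss enters the scan loop proper.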
3667 void MacroAssembler::lookup_interface_method(Register recv_klass,
3668                                              Register intf_klass,
3669                                              RegisterOrConstant itable_index,
3670                                              Register method_result,
3671                                              Register scan_temp,
3672                                              Label& L_no_such_interface,
3673                                              bool return_method) {
3674   assert_different_registers(recv_klass, intf_klass, scan_temp);
3675   assert_different_registers(method_result, intf_klass, scan_temp);
3676   assert(recv_klass != method_result || !return_method,
3677          "recv_klass can be destroyed when method isn't needed");
3678 
3679   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3680          "caller must use same register for non-constant itable index as for method");
3681 
3682   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3683   int vtable_base = in_bytes(Klass::vtable_start_offset());
3684   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3685   int scan_step   = itableOffsetEntry::size() * wordSize;
3686   int vte_size    = vtableEntry::size_in_bytes();
3687   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3688   assert(vte_size == wordSize, "else adjust times_vte_scale");
3689 
3690   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3691 
3692   // %%% Could store the aligned, prescaled offset in the klassoop.
3693   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3694 
3695   if (return_method) {
3696     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3697     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3698     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3699   }
3700 
3701   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3702   //   if (scan->interface() == intf) {
3703   //     result = (klass + scan->offset() + itable_index);
3704   //   }
3705   // }
3706   Label search, found_method;
3707 
3708   for (int peel = 1; peel >= 0; peel--) {
3709     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3710     cmpptr(intf_klass, method_result);
3711 
3712     if (peel) {
3713       jccb(Assembler::equal, found_method);
3714     } else {
3715       jccb(Assembler::notEqual, search);
3716       // (invert the test to fall through to found_method...)
3717     }
3718 
3719     if (!peel)  break;
3720 
3721     bind(search);
3722 
3723     // Check that the previous entry is non-null.  A null entry means that
3724     // the receiver class doesn't implement the interface, and wasn't the
3725     // same as when the caller was compiled.
3726     testptr(method_result, method_result);
3727     jcc(Assembler::zero, L_no_such_interface);
3728     addptr(scan_temp, scan_step);
3729   }
3730 
3731   bind(found_method);
3732 
3733   if (return_method) {
3734     // Got a hit.
3735     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3736     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3737   }
3738 }
3739 
3740 
3741 // virtual method calling
3742 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3743                                            RegisterOrConstant vtable_index,
3744                                            Register method_result) {
3745   const int base = in_bytes(Klass::vtable_start_offset());
3746   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3747   Address vtable_entry_addr(recv_klass,
3748                             vtable_index, Address::times_ptr,
3749                             base + vtableEntry::method_offset_in_bytes());
3750   movptr(method_result, vtable_entry_addr);
3751 }
3752 
3753 
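     // Convenience subtype check that only needs a success label: it runs the fast path,
     // then the slow path, branching to L_success on a hit and falling through on failure.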
3754 void MacroAssembler::check_klass_subtype(Register sub_klass,
3755                            Register super_klass,
3756                            Register temp_reg,
3757                            Label& L_success) {
3758   Label L_failure;
3759   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3760   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3761   bind(L_failure);
3762 }
3763 
3764 
3765 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3766                                                    Register super_klass,
3767                                                    Register temp_reg,
3768                                                    Label* L_success,
3769                                                    Label* L_failure,
3770                                                    Label* L_slow_path,
3771                                         RegisterOrConstant super_check_offset) {
3772   assert_different_registers(sub_klass, super_klass, temp_reg);
3773   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3774   if (super_check_offset.is_register()) {
3775     assert_different_registers(sub_klass, super_klass,
3776                                super_check_offset.as_register());
3777   } else if (must_load_sco) {
3778     assert(temp_reg != noreg, "supply either a temp or a register offset");
3779   }
3780 
3781   Label L_fallthrough;
3782   int label_nulls = 0;
3783   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3784   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3785   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3786   assert(label_nulls <= 1, "at most one NULL in the batch");
3787 
3788   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3789   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3790   Address super_check_offset_addr(super_klass, sco_offset);
3791 
3792   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3793   // range of a jccb.  If this routine grows larger, reconsider at
3794   // least some of these.
3795 #define local_jcc(assembler_cond, label)                                \
3796   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3797   else                             jcc( assembler_cond, label) /*omit semi*/
3798 
3799   // Hacked jmp, which may only be used just before L_fallthrough.
3800 #define final_jmp(label)                                                \
3801   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3802   else                            jmp(label)                /*omit semi*/
3803 
3804   // If the pointers are equal, we are done (e.g., String[] elements).
3805   // This self-check enables sharing of secondary supertype arrays among
3806   // non-primary types such as array-of-interface.  Otherwise, each such
3807   // type would need its own customized SSA.
3808   // We move this check to the front of the fast path because many
3809   // type checks are in fact trivially successful in this manner,
3810   // so we get a nicely predicted branch right at the start of the check.
3811   cmpptr(sub_klass, super_klass);
3812   local_jcc(Assembler::equal, *L_success);
3813 
3814   // Check the supertype display:
3815   if (must_load_sco) {
3816     // Positive movl does right thing on LP64.
3817     movl(temp_reg, super_check_offset_addr);
3818     super_check_offset = RegisterOrConstant(temp_reg);
3819   }
3820   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3821   cmpptr(super_klass, super_check_addr); // load displayed supertype
3822 
3823   // This check has worked decisively for primary supers.
3824   // Secondary supers are sought in the super_cache ('super_cache_addr').
3825   // (Secondary supers are interfaces and very deeply nested subtypes.)
3826   // This works in the same check above because of a tricky aliasing
3827   // between the super_cache and the primary super display elements.
3828   // (The 'super_check_addr' can address either, as the case requires.)
3829   // Note that the cache is updated below if it does not help us find
3830   // what we need immediately.
3831   // So if it was a primary super, we can just fail immediately.
3832   // Otherwise, it's the slow path for us (no success at this point).
3833 
3834   if (super_check_offset.is_register()) {
3835     local_jcc(Assembler::equal, *L_success);
3836     cmpl(super_check_offset.as_register(), sc_offset);
3837     if (L_failure == &L_fallthrough) {
3838       local_jcc(Assembler::equal, *L_slow_path);
3839     } else {
3840       local_jcc(Assembler::notEqual, *L_failure);
3841       final_jmp(*L_slow_path);
3842     }
3843   } else if (super_check_offset.as_constant() == sc_offset) {
3844     // Need a slow path; fast failure is impossible.
3845     if (L_slow_path == &L_fallthrough) {
3846       local_jcc(Assembler::equal, *L_success);
3847     } else {
3848       local_jcc(Assembler::notEqual, *L_slow_path);
3849       final_jmp(*L_success);
3850     }
3851   } else {
3852     // No slow path; it's a fast decision.
3853     if (L_failure == &L_fallthrough) {
3854       local_jcc(Assembler::equal, *L_success);
3855     } else {
3856       local_jcc(Assembler::notEqual, *L_failure);
3857       final_jmp(*L_success);
3858     }
3859   }
3860 
3861   bind(L_fallthrough);
3862 
3863 #undef local_jcc
3864 #undef final_jmp
3865 }
3866 
3867 
3868 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3869                                                    Register super_klass,
3870                                                    Register temp_reg,
3871                                                    Register temp2_reg,
3872                                                    Label* L_success,
3873                                                    Label* L_failure,
3874                                                    bool set_cond_codes) {
3875   assert_different_registers(sub_klass, super_klass, temp_reg);
3876   if (temp2_reg != noreg)
3877     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
3878 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
3879 
3880   Label L_fallthrough;
3881   int label_nulls = 0;
3882   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3883   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3884   assert(label_nulls <= 1, "at most one NULL in the batch");
3885 
3886   // a couple of useful fields in sub_klass:
3887   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3888   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3889   Address secondary_supers_addr(sub_klass, ss_offset);
3890   Address super_cache_addr(     sub_klass, sc_offset);
3891 
3892   // Do a linear scan of the secondary super-klass chain.
3893   // This code is rarely used, so simplicity is a virtue here.
3894   // The repne_scan instruction uses fixed registers, which we must spill.
3895   // Don't worry too much about pre-existing connections with the input regs.
3896 
3897   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
3898   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
3899 
3900   // Get super_klass value into rax (even if it was in rdi or rcx).
3901   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
3902   if (super_klass != rax || UseCompressedOops) {
3903     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
3904     mov(rax, super_klass);
3905   }
3906   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
3907   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
3908 
3909 #ifndef PRODUCT
3910   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
3911   ExternalAddress pst_counter_addr((address) pst_counter);
3912   NOT_LP64(  incrementl(pst_counter_addr) );
3913   LP64_ONLY( lea(rcx, pst_counter_addr) );
3914   LP64_ONLY( incrementl(Address(rcx, 0)) );
3915 #endif //PRODUCT
3916 
3917   // We will consult the secondary-super array.
3918   movptr(rdi, secondary_supers_addr);
3919   // Load the array length.  (Positive movl does right thing on LP64.)
3920   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
3921   // Skip to start of data.
3922   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
3923 
3924   // Scan RCX words at [RDI] for an occurrence of RAX.
3925   // Set NZ/Z based on last compare.
3926   // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
3927   // not change flags; only the repeated scas instruction sets them.
3928   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
3929 
3930   testptr(rax, rax); // Set Z = 0
3931   repne_scan();
3932 
3933   // Unspill the temp. registers:
3934   if (pushed_rdi)  pop(rdi);
3935   if (pushed_rcx)  pop(rcx);
3936   if (pushed_rax)  pop(rax);
3937 
3938   if (set_cond_codes) {
3939     // Special hack for the AD files:  rdi is guaranteed non-zero.
3940     assert(!pushed_rdi, "rdi must be left non-NULL");
3941     // Also, the condition codes are properly set Z/NZ on succeed/failure.
3942   }
3943 
3944   if (L_failure == &L_fallthrough)
3945         jccb(Assembler::notEqual, *L_failure);
3946   else  jcc(Assembler::notEqual, *L_failure);
3947 
3948   // Success.  Cache the super we found and proceed in triumph.
3949   movptr(super_cache_addr, super_klass);
3950 
3951   if (L_success != &L_fallthrough) {
3952     jmp(*L_success);
3953   }
3954 
3955 #undef IS_A_TEMP
3956 
3957   bind(L_fallthrough);
3958 }
3959 
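     // Class-initialization barrier: take the fast path when the klass is fully
     // initialized or is currently being initialized by this thread; otherwise branch
     // to (or fall through into) the slow path.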
3960 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
3961   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
3962 
3963   Label L_fallthrough;
3964   if (L_fast_path == NULL) {
3965     L_fast_path = &L_fallthrough;
3966   } else if (L_slow_path == NULL) {
3967     L_slow_path = &L_fallthrough;
3968   }
3969 
3970   // Fast path check: class is fully initialized
3971   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
3972   jcc(Assembler::equal, *L_fast_path);
3973 
3974   // Fast path check: current thread is initializer thread
3975   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
3976   if (L_slow_path == &L_fallthrough) {
3977     jcc(Assembler::equal, *L_fast_path);
3978     bind(*L_slow_path);
3979   } else if (L_fast_path == &L_fallthrough) {
3980     jcc(Assembler::notEqual, *L_slow_path);
3981     bind(*L_fast_path);
3982   } else {
3983     Unimplemented();
3984   }
3985 }
3986 
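     // cmov32 emulates cmovl on processors without CMOV support by branching around a
     // plain movl; on CMOV-capable hardware it emits the real instruction.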
3987 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
3988   if (VM_Version::supports_cmov()) {
3989     cmovl(cc, dst, src);
3990   } else {
3991     Label L;
3992     jccb(negate_condition(cc), L);
3993     movl(dst, src);
3994     bind(L);
3995   }
3996 }
3997 
3998 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
3999   if (VM_Version::supports_cmov()) {
4000     cmovl(cc, dst, src);
4001   } else {
4002     Label L;
4003     jccb(negate_condition(cc), L);
4004     movl(dst, src);
4005     bind(L);
4006   }
4007 }
4008 
4009 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4010   if (!VerifyOops) return;
4011 
4012   // Pass register number to verify_oop_subroutine
4013   const char* b = NULL;
4014   {
4015     ResourceMark rm;
4016     stringStream ss;
4017     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4018     b = code_string(ss.as_string());
4019   }
4020   BLOCK_COMMENT("verify_oop {");
4021 #ifdef _LP64
4022   push(rscratch1);                    // save r10, trashed by movptr()
4023 #endif
4024   push(rax);                          // save rax,
4025   push(reg);                          // pass register argument
4026   ExternalAddress buffer((address) b);
4027   // avoid using pushptr, as it modifies scratch registers
4028   // and our contract is not to modify anything
4029   movptr(rax, buffer.addr());
4030   push(rax);
4031   // call indirectly to solve generation ordering problem
4032   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4033   call(rax);
4034   // Caller pops the arguments (oop, message) and restores rax, r10
4035   BLOCK_COMMENT("} verify_oop");
4036 }
4037 
4038 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
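     // Fill dst with all ones: vpternlogd with immediate 0xFF is the constant-true
     // ternary function; pre-AVX512VL hardware gets the same effect from comparing the
     // register with itself for equality (vpcmpeqb dst, dst, dst).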
4039   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4040     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4041   } else {
4042     assert(UseAVX > 0, "");
4043     vpcmpeqb(dst, dst, dst, vector_len);
4044   }
4045 }
4046 
4047 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4048                                          int extra_slot_offset) {
4049   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4050   int stackElementSize = Interpreter::stackElementSize;
4051   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4052 #ifdef ASSERT
4053   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4054   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4055 #endif
4056   Register             scale_reg    = noreg;
4057   Address::ScaleFactor scale_factor = Address::no_scale;
4058   if (arg_slot.is_constant()) {
4059     offset += arg_slot.as_constant() * stackElementSize;
4060   } else {
4061     scale_reg    = arg_slot.as_register();
4062     scale_factor = Address::times(stackElementSize);
4063   }
4064   offset += wordSize;           // return PC is on stack
4065   return Address(rsp, scale_reg, scale_factor, offset);
4066 }
4067 
4068 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4069   if (!VerifyOops) return;
4070 
4071   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4072   // Pass register number to verify_oop_subroutine
4073   const char* b = NULL;
4074   {
4075     ResourceMark rm;
4076     stringStream ss;
4077     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4078     b = code_string(ss.as_string());
4079   }
4080 #ifdef _LP64
4081   push(rscratch1);                    // save r10, trashed by movptr()
4082 #endif
4083   push(rax);                          // save rax,
4084   // addr may contain rsp so we will have to adjust it based on the push
4085   // we just did (and on 64 bit we do two pushes)
4086   // NOTE: the 64-bit code seemed to have had a bug in that it did movq(addr, rax),
4087   // which stores rax into addr, the reverse of what was intended.
4088   if (addr.uses(rsp)) {
4089     lea(rax, addr);
4090     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4091   } else {
4092     pushptr(addr);
4093   }
4094 
4095   ExternalAddress buffer((address) b);
4096   // pass msg argument
4097   // avoid using pushptr, as it modifies scratch registers
4098   // and our contract is not to modify anything
4099   movptr(rax, buffer.addr());
4100   push(rax);
4101 
4102   // call indirectly to solve generation ordering problem
4103   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4104   call(rax);
4105   // Caller pops the arguments (addr, message) and restores rax, r10.
4106 }
4107 
4108 void MacroAssembler::verify_tlab() {
4109 #ifdef ASSERT
4110   if (UseTLAB && VerifyOops) {
4111     Label next, ok;
4112     Register t1 = rsi;
4113     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4114 
4115     push(t1);
4116     NOT_LP64(push(thread_reg));
4117     NOT_LP64(get_thread(thread_reg));
4118 
4119     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4120     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4121     jcc(Assembler::aboveEqual, next);
4122     STOP("assert(top >= start)");
4123     should_not_reach_here();
4124 
4125     bind(next);
4126     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4127     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4128     jcc(Assembler::aboveEqual, ok);
4129     STOP("assert(top <= end)");
4130     should_not_reach_here();
4131 
4132     bind(ok);
4133     NOT_LP64(pop(thread_reg));
4134     pop(t1);
4135   }
4136 #endif
4137 }
4138 
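     // The helper classes below (ControlWord .. CPU_State) decode the register image
     // saved by push_CPU_state(); they back the debugging aids print_CPU_state() and,
     // on 32-bit, _verify_FPU() further down.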
4139 class ControlWord {
4140  public:
4141   int32_t _value;
4142 
4143   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4144   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4145   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4146   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4147   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4148   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4149   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4150   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4151 
4152   void print() const {
4153     // rounding control
4154     const char* rc;
4155     switch (rounding_control()) {
4156       case 0: rc = "round near"; break;
4157       case 1: rc = "round down"; break;
4158       case 2: rc = "round up  "; break;
4159       case 3: rc = "chop      "; break;
4160       default:
4161         rc = NULL; // silence compiler warnings
4162         fatal("Unknown rounding control: %d", rounding_control());
4163     };
4164     // precision control
4165     const char* pc;
4166     switch (precision_control()) {
4167       case 0: pc = "24 bits "; break;
4168       case 1: pc = "reserved"; break;
4169       case 2: pc = "53 bits "; break;
4170       case 3: pc = "64 bits "; break;
4171       default:
4172         pc = NULL; // silence compiler warnings
4173         fatal("Unknown precision control: %d", precision_control());
4174     };
4175     // flags
4176     char f[9];
4177     f[0] = ' ';
4178     f[1] = ' ';
4179     f[2] = (precision   ()) ? 'P' : 'p';
4180     f[3] = (underflow   ()) ? 'U' : 'u';
4181     f[4] = (overflow    ()) ? 'O' : 'o';
4182     f[5] = (zero_divide ()) ? 'Z' : 'z';
4183     f[6] = (denormalized()) ? 'D' : 'd';
4184     f[7] = (invalid     ()) ? 'I' : 'i';
4185     f[8] = '\x0';
4186     // output
4187     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4188   }
4189 
4190 };
4191 
4192 class StatusWord {
4193  public:
4194   int32_t _value;
4195 
4196   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4197   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4198   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4199   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4200   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4201   int  top() const                     { return  (_value >> 11) & 7      ; }
4202   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4203   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4204   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4205   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4206   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4207   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4208   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4209   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4210 
4211   void print() const {
4212     // condition codes
4213     char c[5];
4214     c[0] = (C3()) ? '3' : '-';
4215     c[1] = (C2()) ? '2' : '-';
4216     c[2] = (C1()) ? '1' : '-';
4217     c[3] = (C0()) ? '0' : '-';
4218     c[4] = '\x0';
4219     // flags
4220     char f[9];
4221     f[0] = (error_status()) ? 'E' : '-';
4222     f[1] = (stack_fault ()) ? 'S' : '-';
4223     f[2] = (precision   ()) ? 'P' : '-';
4224     f[3] = (underflow   ()) ? 'U' : '-';
4225     f[4] = (overflow    ()) ? 'O' : '-';
4226     f[5] = (zero_divide ()) ? 'Z' : '-';
4227     f[6] = (denormalized()) ? 'D' : '-';
4228     f[7] = (invalid     ()) ? 'I' : '-';
4229     f[8] = '\x0';
4230     // output
4231     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4232   }
4233 
4234 };
4235 
4236 class TagWord {
4237  public:
4238   int32_t _value;
4239 
4240   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4241 
4242   void print() const {
4243     printf("%04x", _value & 0xFFFF);
4244   }
4245 
4246 };
4247 
4248 class FPU_Register {
4249  public:
4250   int32_t _m0;
4251   int32_t _m1;
4252   int16_t _ex;
4253 
4254   bool is_indefinite() const           {
4255     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4256   }
4257 
4258   void print() const {
4259     char  sign = (_ex < 0) ? '-' : '+';
4260     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4261     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4262   };
4263 
4264 };
4265 
4266 class FPU_State {
4267  public:
4268   enum {
4269     register_size       = 10,
4270     number_of_registers =  8,
4271     register_mask       =  7
4272   };
4273 
4274   ControlWord  _control_word;
4275   StatusWord   _status_word;
4276   TagWord      _tag_word;
4277   int32_t      _error_offset;
4278   int32_t      _error_selector;
4279   int32_t      _data_offset;
4280   int32_t      _data_selector;
4281   int8_t       _register[register_size * number_of_registers];
4282 
4283   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4284   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4285 
4286   const char* tag_as_string(int tag) const {
4287     switch (tag) {
4288       case 0: return "valid";
4289       case 1: return "zero";
4290       case 2: return "special";
4291       case 3: return "empty";
4292     }
4293     ShouldNotReachHere();
4294     return NULL;
4295   }
4296 
4297   void print() const {
4298     // print computation registers
4299     { int t = _status_word.top();
4300       for (int i = 0; i < number_of_registers; i++) {
4301         int j = (i - t) & register_mask;
4302         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4303         st(j)->print();
4304         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4305       }
4306     }
4307     printf("\n");
4308     // print control registers
4309     printf("ctrl = "); _control_word.print(); printf("\n");
4310     printf("stat = "); _status_word .print(); printf("\n");
4311     printf("tags = "); _tag_word    .print(); printf("\n");
4312   }
4313 
4314 };
4315 
4316 class Flag_Register {
4317  public:
4318   int32_t _value;
4319 
4320   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4321   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4322   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4323   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4324   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4325   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4326   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4327 
4328   void print() const {
4329     // flags
4330     char f[8];
4331     f[0] = (overflow       ()) ? 'O' : '-';
4332     f[1] = (direction      ()) ? 'D' : '-';
4333     f[2] = (sign           ()) ? 'S' : '-';
4334     f[3] = (zero           ()) ? 'Z' : '-';
4335     f[4] = (auxiliary_carry()) ? 'A' : '-';
4336     f[5] = (parity         ()) ? 'P' : '-';
4337     f[6] = (carry          ()) ? 'C' : '-';
4338     f[7] = '\x0';
4339     // output
4340     printf("%08x  flags = %s", _value, f);
4341   }
4342 
4343 };
4344 
4345 class IU_Register {
4346  public:
4347   int32_t _value;
4348 
4349   void print() const {
4350     printf("%08x  %11d", _value, _value);
4351   }
4352 
4353 };
4354 
4355 class IU_State {
4356  public:
4357   Flag_Register _eflags;
4358   IU_Register   _rdi;
4359   IU_Register   _rsi;
4360   IU_Register   _rbp;
4361   IU_Register   _rsp;
4362   IU_Register   _rbx;
4363   IU_Register   _rdx;
4364   IU_Register   _rcx;
4365   IU_Register   _rax;
4366 
4367   void print() const {
4368     // computation registers
4369     printf("rax,  = "); _rax.print(); printf("\n");
4370     printf("rbx,  = "); _rbx.print(); printf("\n");
4371     printf("rcx  = "); _rcx.print(); printf("\n");
4372     printf("rdx  = "); _rdx.print(); printf("\n");
4373     printf("rdi  = "); _rdi.print(); printf("\n");
4374     printf("rsi  = "); _rsi.print(); printf("\n");
4375     printf("rbp,  = "); _rbp.print(); printf("\n");
4376     printf("rsp  = "); _rsp.print(); printf("\n");
4377     printf("\n");
4378     // control registers
4379     printf("flgs = "); _eflags.print(); printf("\n");
4380   }
4381 };
4382 
4383 
4384 class CPU_State {
4385  public:
4386   FPU_State _fpu_state;
4387   IU_State  _iu_state;
4388 
4389   void print() const {
4390     printf("--------------------------------------------------\n");
4391     _iu_state .print();
4392     printf("\n");
4393     _fpu_state.print();
4394     printf("--------------------------------------------------\n");
4395   }
4396 
4397 };
4398 
4399 
4400 static void _print_CPU_state(CPU_State* state) {
4401   state->print();
4402 };
4403 
4404 
4405 void MacroAssembler::print_CPU_state() {
4406   push_CPU_state();
4407   push(rsp);                // pass CPU state
4408   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4409   addptr(rsp, wordSize);       // discard argument
4410   pop_CPU_state();
4411 }
4412 
4413 
4414 #ifndef _LP64
4415 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4416   static int counter = 0;
4417   FPU_State* fs = &state->_fpu_state;
4418   counter++;
4419   // For leaf calls, only verify that the top few elements remain empty.
4420   // We only need 1 empty at the top for C2 code.
4421   if( stack_depth < 0 ) {
4422     if( fs->tag_for_st(7) != 3 ) {
4423       printf("FPR7 not empty\n");
4424       state->print();
4425       assert(false, "error");
4426       return false;
4427     }
4428     return true;                // All other stack states do not matter
4429   }
4430 
4431   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4432          "bad FPU control word");
4433 
4434   // compute stack depth
4435   int i = 0;
4436   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4437   int d = i;
4438   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4439   // verify findings
4440   if (i != FPU_State::number_of_registers) {
4441     // stack not contiguous
4442     printf("%s: stack not contiguous at ST%d\n", s, i);
4443     state->print();
4444     assert(false, "error");
4445     return false;
4446   }
4447   // check if computed stack depth corresponds to expected stack depth
4448   if (stack_depth < 0) {
4449     // expected stack depth is -stack_depth or less
4450     if (d > -stack_depth) {
4451       // too many elements on the stack
4452       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4453       state->print();
4454       assert(false, "error");
4455       return false;
4456     }
4457   } else {
4458     // expected stack depth is stack_depth
4459     if (d != stack_depth) {
4460       // wrong stack depth
4461       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4462       state->print();
4463       assert(false, "error");
4464       return false;
4465     }
4466   }
4467   // everything is cool
4468   return true;
4469 }
4470 
4471 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4472   if (!VerifyFPU) return;
4473   push_CPU_state();
4474   push(rsp);                // pass CPU state
4475   ExternalAddress msg((address) s);
4476   // pass message string s
4477   pushptr(msg.addr());
4478   push(stack_depth);        // pass stack depth
4479   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4480   addptr(rsp, 3 * wordSize);   // discard arguments
4481   // check for error
4482   { Label L;
4483     testl(rax, rax);
4484     jcc(Assembler::notZero, L);
4485     int3();                  // break if error condition
4486     bind(L);
4487   }
4488   pop_CPU_state();
4489 }
4490 #endif // _LP64
4491 
4492 void MacroAssembler::restore_cpu_control_state_after_jni() {
4493   // Either restore the MXCSR register after returning from the JNI Call
4494   // or verify that it wasn't changed (with -Xcheck:jni flag).
4495   if (VM_Version::supports_sse()) {
4496     if (RestoreMXCSROnJNICalls) {
4497       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4498     } else if (CheckJNICalls) {
4499       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4500     }
4501   }
4502   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4503   vzeroupper();
4504   // Reset k1 to 0xffff.
4505 
4506 #ifdef COMPILER2
4507   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4508     push(rcx);
4509     movl(rcx, 0xffff);
4510     kmovwl(k1, rcx);
4511     pop(rcx);
4512   }
4513 #endif // COMPILER2
4514 
4515 #ifndef _LP64
4516   // Either restore the x87 floating pointer control word after returning
4517   // from the JNI call or verify that it wasn't changed.
4518   if (CheckJNICalls) {
4519     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4520   }
4521 #endif // _LP64
4522 }
4523 
4524 // ((OopHandle)result).resolve();
4525 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4526   assert_different_registers(result, tmp);
4527 
4528   // Only 64 bit platforms support GCs that require a tmp register
4529   // Only IN_HEAP loads require a thread_tmp register
4530   // OopHandle::resolve is an indirection like jobject.
4531   access_load_at(T_OBJECT, IN_NATIVE,
4532                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4533 }
4534 
4535 // ((WeakHandle)result).resolve();
4536 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4537   assert_different_registers(rresult, rtmp);
4538   Label resolved;
4539 
4540   // A null weak handle resolves to null.
4541   cmpptr(rresult, 0);
4542   jcc(Assembler::equal, resolved);
4543 
4544   // Only 64 bit platforms support GCs that require a tmp register
4545   // Only IN_HEAP loads require a thread_tmp register
4546   // WeakHandle::resolve is an indirection like jweak.
4547   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4548                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4549   bind(resolved);
4550 }
4551 
4552 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4553   // get mirror
4554   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4555   load_method_holder(mirror, method);
4556   movptr(mirror, Address(mirror, mirror_offset));
4557   resolve_oop_handle(mirror, tmp);
4558 }
4559 
4560 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4561   load_method_holder(rresult, rmethod);
4562   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4563 }
4564 
4565 void MacroAssembler::load_method_holder(Register holder, Register method) {
4566   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4567   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4568   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4569 }
4570 
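     // Note: load_klass/store_klass below move the Klass* field of an object.
     // With compressed class pointers the 32-bit narrow klass is loaded or
     // stored and decoded/encoded through 'tmp'; otherwise a plain
     // pointer-sized access is emitted.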
4571 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4572   assert_different_registers(src, tmp);
4573   assert_different_registers(dst, tmp);
4574 #ifdef _LP64
4575   if (UseCompressedClassPointers) {
4576     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4577     decode_klass_not_null(dst, tmp);
4578   } else
4579 #endif
4580     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4581 }
4582 
4583 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4584   assert_different_registers(src, tmp);
4585   assert_different_registers(dst, tmp);
4586 #ifdef _LP64
4587   if (UseCompressedClassPointers) {
4588     encode_klass_not_null(src, tmp);
4589     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4590   } else
4591 #endif
4592     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4593 }
4594 
4595 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4596                                     Register tmp1, Register thread_tmp) {
4597   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4598   decorators = AccessInternal::decorator_fixup(decorators);
4599   bool as_raw = (decorators & AS_RAW) != 0;
4600   if (as_raw) {
4601     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4602   } else {
4603     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4604   }
4605 }
4606 
4607 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4608                                      Register tmp1, Register tmp2) {
4609   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4610   decorators = AccessInternal::decorator_fixup(decorators);
4611   bool as_raw = (decorators & AS_RAW) != 0;
4612   if (as_raw) {
4613     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4614   } else {
4615     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4616   }
4617 }
4618 
4619 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4620                                    Register thread_tmp, DecoratorSet decorators) {
4621   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4622 }
4623 
4624 // Doesn't do verification, generates fixed size code
4625 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4626                                             Register thread_tmp, DecoratorSet decorators) {
4627   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4628 }
4629 
4630 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4631                                     Register tmp2, DecoratorSet decorators) {
4632   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4633 }
4634 
4635 // Used for storing NULLs.
4636 void MacroAssembler::store_heap_oop_null(Address dst) {
4637   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4638 }
4639 
4640 #ifdef _LP64
4641 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4642   if (UseCompressedClassPointers) {
4643     // Store to klass gap in destination
4644     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4645   }
4646 }
4647 
4648 #ifdef ASSERT
4649 void MacroAssembler::verify_heapbase(const char* msg) {
4650   assert (UseCompressedOops, "should be compressed");
4651   assert (Universe::heap() != NULL, "java heap should be initialized");
4652   if (CheckCompressedOops) {
4653     Label ok;
4654     push(rscratch1); // cmpptr trashes rscratch1
4655     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4656     jcc(Assembler::equal, ok);
4657     STOP(msg);
4658     bind(ok);
4659     pop(rscratch1);
4660   }
4661 }
4662 #endif
4663 
4664 // Algorithm must match oop.inline.hpp encode_heap_oop.
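     // Sketch of the value transformation (illustrative, not generated code):
     //   narrow = (oop == NULL) ? 0 : (oop - heap_base) >> LogMinObjAlignmentInBytes
     // with the subtraction and/or shift dropped when base or shift is zero.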
4665 void MacroAssembler::encode_heap_oop(Register r) {
4666 #ifdef ASSERT
4667   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4668 #endif
4669   verify_oop_msg(r, "broken oop in encode_heap_oop");
4670   if (CompressedOops::base() == NULL) {
4671     if (CompressedOops::shift() != 0) {
4672       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4673       shrq(r, LogMinObjAlignmentInBytes);
4674     }
4675     return;
4676   }
4677   testq(r, r);
4678   cmovq(Assembler::equal, r, r12_heapbase);
4679   subq(r, r12_heapbase);
4680   shrq(r, LogMinObjAlignmentInBytes);
4681 }
4682 
4683 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4684 #ifdef ASSERT
4685   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4686   if (CheckCompressedOops) {
4687     Label ok;
4688     testq(r, r);
4689     jcc(Assembler::notEqual, ok);
4690     STOP("null oop passed to encode_heap_oop_not_null");
4691     bind(ok);
4692   }
4693 #endif
4694   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4695   if (CompressedOops::base() != NULL) {
4696     subq(r, r12_heapbase);
4697   }
4698   if (CompressedOops::shift() != 0) {
4699     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4700     shrq(r, LogMinObjAlignmentInBytes);
4701   }
4702 }
4703 
4704 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4705 #ifdef ASSERT
4706   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4707   if (CheckCompressedOops) {
4708     Label ok;
4709     testq(src, src);
4710     jcc(Assembler::notEqual, ok);
4711     STOP("null oop passed to encode_heap_oop_not_null2");
4712     bind(ok);
4713   }
4714 #endif
4715   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4716   if (dst != src) {
4717     movq(dst, src);
4718   }
4719   if (CompressedOops::base() != NULL) {
4720     subq(dst, r12_heapbase);
4721   }
4722   if (CompressedOops::shift() != 0) {
4723     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4724     shrq(dst, LogMinObjAlignmentInBytes);
4725   }
4726 }
4727 
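     // Inverse of encode_heap_oop. Sketch (illustrative, not generated code):
     //   oop = (narrow == 0) ? NULL : heap_base + ((uintptr_t)narrow << LogMinObjAlignmentInBytes)
     // with the addition and/or shift dropped when base or shift is zero.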
4728 void  MacroAssembler::decode_heap_oop(Register r) {
4729 #ifdef ASSERT
4730   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4731 #endif
4732   if (CompressedOops::base() == NULL) {
4733     if (CompressedOops::shift() != 0) {
4734       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4735       shlq(r, LogMinObjAlignmentInBytes);
4736     }
4737   } else {
4738     Label done;
4739     shlq(r, LogMinObjAlignmentInBytes);
4740     jccb(Assembler::equal, done);
4741     addq(r, r12_heapbase);
4742     bind(done);
4743   }
4744   verify_oop_msg(r, "broken oop in decode_heap_oop");
4745 }
4746 
4747 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4748   // Note: it will change flags
4749   assert (UseCompressedOops, "should only be used for compressed headers");
4750   assert (Universe::heap() != NULL, "java heap should be initialized");
4751   // Cannot assert, unverified entry point counts instructions (see .ad file)
4752   // vtableStubs also counts instructions in pd_code_size_limit.
4753   // Also do not verify_oop as this is called by verify_oop.
4754   if (CompressedOops::shift() != 0) {
4755     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4756     shlq(r, LogMinObjAlignmentInBytes);
4757     if (CompressedOops::base() != NULL) {
4758       addq(r, r12_heapbase);
4759     }
4760   } else {
4761     assert (CompressedOops::base() == NULL, "sanity");
4762   }
4763 }
4764 
4765 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4766   // Note: it will change flags
4767   assert (UseCompressedOops, "should only be used for compressed headers");
4768   assert (Universe::heap() != NULL, "java heap should be initialized");
4769   // Cannot assert, unverified entry point counts instructions (see .ad file)
4770   // vtableStubs also counts instructions in pd_code_size_limit.
4771   // Also do not verify_oop as this is called by verify_oop.
4772   if (CompressedOops::shift() != 0) {
4773     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4774     if (LogMinObjAlignmentInBytes == Address::times_8) {
4775       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4776     } else {
4777       if (dst != src) {
4778         movq(dst, src);
4779       }
4780       shlq(dst, LogMinObjAlignmentInBytes);
4781       if (CompressedOops::base() != NULL) {
4782         addq(dst, r12_heapbase);
4783       }
4784     }
4785   } else {
4786     assert (CompressedOops::base() == NULL, "sanity");
4787     if (dst != src) {
4788       movq(dst, src);
4789     }
4790   }
4791 }
4792 
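     // The klass encode/decode routines below use the same base/shift scheme as
     // compressed oops, but against CompressedKlassPointers::base()/shift(), and
     // never have to handle NULL (hence "_not_null"). Sketch:
     //   narrow_klass = (Klass* - klass_base) >> LogKlassAlignmentInBytes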
4793 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4794   assert_different_registers(r, tmp);
4795   if (CompressedKlassPointers::base() != NULL) {
4796     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4797     subq(r, tmp);
4798   }
4799   if (CompressedKlassPointers::shift() != 0) {
4800     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4801     shrq(r, LogKlassAlignmentInBytes);
4802   }
4803 }
4804 
4805 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4806   assert_different_registers(src, dst);
4807   if (CompressedKlassPointers::base() != NULL) {
4808     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4809     addq(dst, src);
4810   } else {
4811     movptr(dst, src);
4812   }
4813   if (CompressedKlassPointers::shift() != 0) {
4814     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4815     shrq(dst, LogKlassAlignmentInBytes);
4816   }
4817 }
4818 
4819 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4820   assert_different_registers(r, tmp);
4821   // Note: it will change flags
4822   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4823   // Cannot assert, unverified entry point counts instructions (see .ad file)
4824   // vtableStubs also counts instructions in pd_code_size_limit.
4825   // Also do not verify_oop as this is called by verify_oop.
4826   if (CompressedKlassPointers::shift() != 0) {
4827     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4828     shlq(r, LogKlassAlignmentInBytes);
4829   }
4830   if (CompressedKlassPointers::base() != NULL) {
4831     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4832     addq(r, tmp);
4833   }
4834 }
4835 
4836 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
4837   assert_different_registers(src, dst);
4838   // Note: it will change flags
4839   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4840   // Cannot assert, unverified entry point counts instructions (see .ad file)
4841   // vtableStubs also counts instructions in pd_code_size_limit.
4842   // Also do not verify_oop as this is called by verify_oop.
4843 
4844   if (CompressedKlassPointers::base() == NULL &&
4845       CompressedKlassPointers::shift() == 0) {
4846     // The best case scenario is that there is no base or shift. Then it is already
4847     // a pointer that needs nothing but a register rename.
4848     movl(dst, src);
4849   } else {
4850     if (CompressedKlassPointers::base() != NULL) {
4851       mov64(dst, (int64_t)CompressedKlassPointers::base());
4852     } else {
4853       xorq(dst, dst);
4854     }
4855     if (CompressedKlassPointers::shift() != 0) {
4856       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4857       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
4858       leaq(dst, Address(dst, src, Address::times_8, 0));
4859     } else {
4860       addq(dst, src);
4861     }
4862   }
4863 }
4864 
4865 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4866   assert (UseCompressedOops, "should only be used for compressed headers");
4867   assert (Universe::heap() != NULL, "java heap should be initialized");
4868   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4869   int oop_index = oop_recorder()->find_index(obj);
4870   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4871   mov_narrow_oop(dst, oop_index, rspec);
4872 }
4873 
4874 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
4875   assert (UseCompressedOops, "should only be used for compressed headers");
4876   assert (Universe::heap() != NULL, "java heap should be initialized");
4877   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4878   int oop_index = oop_recorder()->find_index(obj);
4879   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4880   mov_narrow_oop(dst, oop_index, rspec);
4881 }
4882 
4883 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4884   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4885   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4886   int klass_index = oop_recorder()->find_index(k);
4887   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4888   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4889 }
4890 
4891 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
4892   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4893   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4894   int klass_index = oop_recorder()->find_index(k);
4895   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4896   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4897 }
4898 
4899 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
4900   assert (UseCompressedOops, "should only be used for compressed headers");
4901   assert (Universe::heap() != NULL, "java heap should be initialized");
4902   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4903   int oop_index = oop_recorder()->find_index(obj);
4904   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4905   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4906 }
4907 
4908 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
4909   assert (UseCompressedOops, "should only be used for compressed headers");
4910   assert (Universe::heap() != NULL, "java heap should be initialized");
4911   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4912   int oop_index = oop_recorder()->find_index(obj);
4913   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4914   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4915 }
4916 
4917 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
4918   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4919   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4920   int klass_index = oop_recorder()->find_index(k);
4921   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4922   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4923 }
4924 
4925 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
4926   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4927   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4928   int klass_index = oop_recorder()->find_index(k);
4929   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4930   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4931 }
4932 
4933 void MacroAssembler::reinit_heapbase() {
4934   if (UseCompressedOops) {
4935     if (Universe::heap() != NULL) {
4936       if (CompressedOops::base() == NULL) {
4937         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
4938       } else {
4939         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
4940       }
4941     } else {
4942       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4943     }
4944   }
4945 }
4946 
4947 #endif // _LP64
4948 
4949 // C2 compiled method's prolog code.
4950 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
4951 
4952   // WARNING: Initial instruction MUST be 5 bytes or longer so that
4953   // NativeJump::patch_verified_entry will be able to patch out the entry
4954   // code safely. The push to verify stack depth is ok at 5 bytes,
4955   // the frame allocation can be either 3 or 6 bytes. So if we don't do
4956   // stack bang then we must use the 6 byte frame allocation even if
4957   // we have no frame. :-(
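       // In outline, the prolog emitted below (sketch only):
       //   - bangs the stack pages when stack_bang_size > 0,
       //   - saves the caller's rbp (push, or store into the new frame),
       //   - allocates 'framesize' bytes of frame on rsp,
       //   - and re-establishes rbp when PreserveFramePointer is set.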
4958   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
4959 
4960   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
4961   // Remove word for return addr
4962   framesize -= wordSize;
4963   stack_bang_size -= wordSize;
4964 
4965   // Calls to C2R adapters often do not accept exceptional returns.
4966   // We require that their callers bang the stack for them.  But be careful:
4967   // some VM calls (such as call site linkage) can use several kilobytes of
4968   // stack.  The stack safety zone should account for that, though.
4969   // See bugs 4446381, 4468289, 4497237.
4970   if (stack_bang_size > 0) {
4971     generate_stack_overflow_check(stack_bang_size);
4972 
4973     // We always push rbp, so that on return to the interpreter rbp will be
4974     // restored correctly and we can correct the stack.
4975     push(rbp);
4976     // Save caller's stack pointer into RBP if the frame pointer is preserved.
4977     if (PreserveFramePointer) {
4978       mov(rbp, rsp);
4979     }
4980     // Remove word for ebp
4981     framesize -= wordSize;
4982 
4983     // Create frame
4984     if (framesize) {
4985       subptr(rsp, framesize);
4986     }
4987   } else {
4988     // Create frame (force generation of a 4 byte immediate value)
4989     subptr_imm32(rsp, framesize);
4990 
4991     // Save RBP register now.
4992     framesize -= wordSize;
4993     movptr(Address(rsp, framesize), rbp);
4994     // Save caller's stack pointer into RBP if the frame pointer is preserved.
4995     if (PreserveFramePointer) {
4996       movptr(rbp, rsp);
4997       if (framesize > 0) {
4998         addptr(rbp, framesize);
4999       }
5000     }
5001   }
5002 
5003   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5004     framesize -= wordSize;
5005     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5006   }
5007 
5008 #ifndef _LP64
5009   // If method sets FPU control word do it now
5010   if (fp_mode_24b) {
5011     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5012   }
5013   if (UseSSE >= 2 && VerifyFPU) {
5014     verify_FPU(0, "FPU stack must be clean on entry");
5015   }
5016 #endif
5017 
5018 #ifdef ASSERT
5019   if (VerifyStackAtCalls) {
5020     Label L;
5021     push(rax);
5022     mov(rax, rsp);
5023     andptr(rax, StackAlignmentInBytes-1);
5024     cmpptr(rax, StackAlignmentInBytes-wordSize);
5025     pop(rax);
5026     jcc(Assembler::equal, L);
5027     STOP("Stack is not properly aligned!");
5028     bind(L);
5029   }
5030 #endif
5031 
5032   if (!is_stub) {
5033     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5034     bs->nmethod_entry_barrier(this);
5035   }
5036 }
5037 
5038 #if COMPILER2_OR_JVMCI
5039 
5040 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5041 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5042   // cnt - number of qwords (8-byte words).
5043   // base - start address, qword aligned.
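       // Strategy (sketch, not generated code):
       //   while (cnt >= 8) { store 64 zero bytes at base; base += 64; cnt -= 8; }
       //   then clear the remaining 0..7 qwords with 32-byte, 8-byte or masked stores.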
5044   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5045   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5046   if (use64byteVector) {
5047     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5048   } else if (MaxVectorSize >= 32) {
5049     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5050   } else {
5051     pxor(xtmp, xtmp);
5052   }
5053   jmp(L_zero_64_bytes);
5054 
5055   BIND(L_loop);
5056   if (MaxVectorSize >= 32) {
5057     fill64(base, 0, xtmp, use64byteVector);
5058   } else {
5059     movdqu(Address(base,  0), xtmp);
5060     movdqu(Address(base, 16), xtmp);
5061     movdqu(Address(base, 32), xtmp);
5062     movdqu(Address(base, 48), xtmp);
5063   }
5064   addptr(base, 64);
5065 
5066   BIND(L_zero_64_bytes);
5067   subptr(cnt, 8);
5068   jccb(Assembler::greaterEqual, L_loop);
5069 
5070   // Clear the tail (fewer than 64 bytes)
5071   if (use64byteVector) {
5072     addptr(cnt, 8);
5073     jccb(Assembler::equal, L_end);
5074     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5075     jmp(L_end);
5076   } else {
5077     addptr(cnt, 4);
5078     jccb(Assembler::less, L_tail);
5079     if (MaxVectorSize >= 32) {
5080       vmovdqu(Address(base, 0), xtmp);
5081     } else {
5082       movdqu(Address(base,  0), xtmp);
5083       movdqu(Address(base, 16), xtmp);
5084     }
5085   }
5086   addptr(base, 32);
5087   subptr(cnt, 4);
5088 
5089   BIND(L_tail);
5090   addptr(cnt, 4);
5091   jccb(Assembler::lessEqual, L_end);
5092   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5093     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5094   } else {
5095     decrement(cnt);
5096 
5097     BIND(L_sloop);
5098     movq(Address(base, 0), xtmp);
5099     addptr(base, 8);
5100     decrement(cnt);
5101     jccb(Assembler::greaterEqual, L_sloop);
5102   }
5103   BIND(L_end);
5104 }
5105 
5106 // Clearing constant sized memory using YMM/ZMM registers.
5107 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5108   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5109   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5110 
5111   int vector64_count = (cnt & (~0x7)) >> 3;
5112   cnt = cnt & 0x7;
5113 
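       // 'vector64_count' full 64-byte (8-qword) stores are emitted by the loop
       // below; the remaining 0..7 qwords left in 'cnt' are handled by the tail switch.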
5114   // 64 byte initialization loop.
5115   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5116   for (int i = 0; i < vector64_count; i++) {
5117     fill64(base, i * 64, xtmp, use64byteVector);
5118   }
5119 
5120   // Clear remaining 64 byte tail.
5121   int disp = vector64_count * 64;
5122   if (cnt) {
5123     switch (cnt) {
5124       case 1:
5125         movq(Address(base, disp), xtmp);
5126         break;
5127       case 2:
5128         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5129         break;
5130       case 3:
5131         movl(rtmp, 0x7);
5132         kmovwl(mask, rtmp);
5133         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5134         break;
5135       case 4:
5136         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5137         break;
5138       case 5:
5139         if (use64byteVector) {
5140           movl(rtmp, 0x1F);
5141           kmovwl(mask, rtmp);
5142           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5143         } else {
5144           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5145           movq(Address(base, disp + 32), xtmp);
5146         }
5147         break;
5148       case 6:
5149         if (use64byteVector) {
5150           movl(rtmp, 0x3F);
5151           kmovwl(mask, rtmp);
5152           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5153         } else {
5154           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5155           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5156         }
5157         break;
5158       case 7:
5159         if (use64byteVector) {
5160           movl(rtmp, 0x7F);
5161           kmovwl(mask, rtmp);
5162           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5163         } else {
5164           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5165           movl(rtmp, 0x7);
5166           kmovwl(mask, rtmp);
5167           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5168         }
5169         break;
5170       default:
5171         fatal("Unexpected length : %d\n",cnt);
5172         break;
5173     }
5174   }
5175 }
5176 
5177 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5178                                bool is_large, KRegister mask) {
5179   // cnt      - number of qwords (8-byte words).
5180   // base     - start address, qword aligned.
5181   // is_large - if optimizers know cnt is larger than InitArrayShortSize
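       // Strategy (sketch): short arrays use a simple store loop; larger ones use
       // rep stosb (UseFastStosb), XMM/YMM/ZMM stores (UseXMMForObjInit) or rep stos.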
5182   assert(base==rdi, "base register must be edi for rep stos");
5183   assert(tmp==rax,   "tmp register must be eax for rep stos");
5184   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5185   assert(InitArrayShortSize % BytesPerLong == 0,
5186     "InitArrayShortSize should be a multiple of BytesPerLong");
5187 
5188   Label DONE;
5189   if (!is_large || !UseXMMForObjInit) {
5190     xorptr(tmp, tmp);
5191   }
5192 
5193   if (!is_large) {
5194     Label LOOP, LONG;
5195     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5196     jccb(Assembler::greater, LONG);
5197 
5198     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5199 
5200     decrement(cnt);
5201     jccb(Assembler::negative, DONE); // Zero length
5202 
5203     // Use individual pointer-sized stores for small counts:
5204     BIND(LOOP);
5205     movptr(Address(base, cnt, Address::times_ptr), tmp);
5206     decrement(cnt);
5207     jccb(Assembler::greaterEqual, LOOP);
5208     jmpb(DONE);
5209 
5210     BIND(LONG);
5211   }
5212 
5213   // Use longer rep-prefixed ops for non-small counts:
5214   if (UseFastStosb) {
5215     shlptr(cnt, 3); // convert to number of bytes
5216     rep_stosb();
5217   } else if (UseXMMForObjInit) {
5218     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5219   } else {
5220     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5221     rep_stos();
5222   }
5223 
5224   BIND(DONE);
5225 }
5226 
5227 #endif //COMPILER2_OR_JVMCI
5228 
5229 
5230 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5231                                    Register to, Register value, Register count,
5232                                    Register rtmp, XMMRegister xtmp) {
5233   ShortBranchVerifier sbv(this);
5234   assert_different_registers(to, value, count, rtmp);
5235   Label L_exit;
5236   Label L_fill_2_bytes, L_fill_4_bytes;
5237 
5238 #if defined(COMPILER2) && defined(_LP64)
5239   if (MaxVectorSize >= 32 &&
5240      VM_Version::supports_avx512vlbw() &&
5241      VM_Version::supports_bmi2()) {
5242     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5243     return;
5244   }
5245 #endif
5246 
5247   int shift = -1;
5248   switch (t) {
5249     case T_BYTE:
5250       shift = 2;
5251       break;
5252     case T_SHORT:
5253       shift = 1;
5254       break;
5255     case T_INT:
5256       shift = 0;
5257       break;
5258     default: ShouldNotReachHere();
5259   }
5260 
5261   if (t == T_BYTE) {
5262     andl(value, 0xff);
5263     movl(rtmp, value);
5264     shll(rtmp, 8);
5265     orl(value, rtmp);
5266   }
5267   if (t == T_SHORT) {
5268     andl(value, 0xffff);
5269   }
5270   if (t == T_BYTE || t == T_SHORT) {
5271     movl(rtmp, value);
5272     shll(rtmp, 16);
5273     orl(value, rtmp);
5274   }
5275 
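       // At this point 'value' holds the fill pattern replicated to 32 bits
       // (e.g. a byte 0xAB has become 0xABABABAB), so the wider stores below
       // can simply repeat this 32-bit value.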
5276   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5277   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5278   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5279     Label L_skip_align2;
5280     // align source address at 4 bytes address boundary
5281     if (t == T_BYTE) {
5282       Label L_skip_align1;
5283       // One byte misalignment happens only for byte arrays
5284       testptr(to, 1);
5285       jccb(Assembler::zero, L_skip_align1);
5286       movb(Address(to, 0), value);
5287       increment(to);
5288       decrement(count);
5289       BIND(L_skip_align1);
5290     }
5291     // Two bytes misalignment happens only for byte and short (char) arrays
5292     testptr(to, 2);
5293     jccb(Assembler::zero, L_skip_align2);
5294     movw(Address(to, 0), value);
5295     addptr(to, 2);
5296     subl(count, 1<<(shift-1));
5297     BIND(L_skip_align2);
5298   }
5299   if (UseSSE < 2) {
5300     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5301     // Fill 32-byte chunks
5302     subl(count, 8 << shift);
5303     jcc(Assembler::less, L_check_fill_8_bytes);
5304     align(16);
5305 
5306     BIND(L_fill_32_bytes_loop);
5307 
5308     for (int i = 0; i < 32; i += 4) {
5309       movl(Address(to, i), value);
5310     }
5311 
5312     addptr(to, 32);
5313     subl(count, 8 << shift);
5314     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5315     BIND(L_check_fill_8_bytes);
5316     addl(count, 8 << shift);
5317     jccb(Assembler::zero, L_exit);
5318     jmpb(L_fill_8_bytes);
5319 
5320     //
5321     // length is too short, just fill qwords
5322     //
5323     BIND(L_fill_8_bytes_loop);
5324     movl(Address(to, 0), value);
5325     movl(Address(to, 4), value);
5326     addptr(to, 8);
5327     BIND(L_fill_8_bytes);
5328     subl(count, 1 << (shift + 1));
5329     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5330     // fall through to fill 4 bytes
5331   } else {
5332     Label L_fill_32_bytes;
5333     if (!UseUnalignedLoadStores) {
5334       // align to 8 bytes, we know we are 4 byte aligned to start
5335       testptr(to, 4);
5336       jccb(Assembler::zero, L_fill_32_bytes);
5337       movl(Address(to, 0), value);
5338       addptr(to, 4);
5339       subl(count, 1<<shift);
5340     }
5341     BIND(L_fill_32_bytes);
5342     {
5343       assert( UseSSE >= 2, "supported cpu only" );
5344       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5345       movdl(xtmp, value);
5346       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5347         Label L_check_fill_32_bytes;
5348         if (UseAVX > 2) {
5349           // Fill 64-byte chunks
5350           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5351 
5352           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
5353           cmpl(count, VM_Version::avx3_threshold());
5354           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5355 
5356           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5357 
5358           subl(count, 16 << shift);
5359           jccb(Assembler::less, L_check_fill_32_bytes);
5360           align(16);
5361 
5362           BIND(L_fill_64_bytes_loop_avx3);
5363           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5364           addptr(to, 64);
5365           subl(count, 16 << shift);
5366           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5367           jmpb(L_check_fill_32_bytes);
5368 
5369           BIND(L_check_fill_64_bytes_avx2);
5370         }
5371         // Fill 64-byte chunks
5372         Label L_fill_64_bytes_loop;
5373         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5374 
5375         subl(count, 16 << shift);
5376         jcc(Assembler::less, L_check_fill_32_bytes);
5377         align(16);
5378 
5379         BIND(L_fill_64_bytes_loop);
5380         vmovdqu(Address(to, 0), xtmp);
5381         vmovdqu(Address(to, 32), xtmp);
5382         addptr(to, 64);
5383         subl(count, 16 << shift);
5384         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5385 
5386         BIND(L_check_fill_32_bytes);
5387         addl(count, 8 << shift);
5388         jccb(Assembler::less, L_check_fill_8_bytes);
5389         vmovdqu(Address(to, 0), xtmp);
5390         addptr(to, 32);
5391         subl(count, 8 << shift);
5392 
5393         BIND(L_check_fill_8_bytes);
5394         // clean upper bits of YMM registers
5395         movdl(xtmp, value);
5396         pshufd(xtmp, xtmp, 0);
5397       } else {
5398         // Fill 32-byte chunks
5399         pshufd(xtmp, xtmp, 0);
5400 
5401         subl(count, 8 << shift);
5402         jcc(Assembler::less, L_check_fill_8_bytes);
5403         align(16);
5404 
5405         BIND(L_fill_32_bytes_loop);
5406 
5407         if (UseUnalignedLoadStores) {
5408           movdqu(Address(to, 0), xtmp);
5409           movdqu(Address(to, 16), xtmp);
5410         } else {
5411           movq(Address(to, 0), xtmp);
5412           movq(Address(to, 8), xtmp);
5413           movq(Address(to, 16), xtmp);
5414           movq(Address(to, 24), xtmp);
5415         }
5416 
5417         addptr(to, 32);
5418         subl(count, 8 << shift);
5419         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5420 
5421         BIND(L_check_fill_8_bytes);
5422       }
5423       addl(count, 8 << shift);
5424       jccb(Assembler::zero, L_exit);
5425       jmpb(L_fill_8_bytes);
5426 
5427       //
5428       // length is too short, just fill qwords
5429       //
5430       BIND(L_fill_8_bytes_loop);
5431       movq(Address(to, 0), xtmp);
5432       addptr(to, 8);
5433       BIND(L_fill_8_bytes);
5434       subl(count, 1 << (shift + 1));
5435       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5436     }
5437   }
5438   // fill trailing 4 bytes
5439   BIND(L_fill_4_bytes);
5440   testl(count, 1<<shift);
5441   jccb(Assembler::zero, L_fill_2_bytes);
5442   movl(Address(to, 0), value);
5443   if (t == T_BYTE || t == T_SHORT) {
5444     Label L_fill_byte;
5445     addptr(to, 4);
5446     BIND(L_fill_2_bytes);
5447     // fill trailing 2 bytes
5448     testl(count, 1<<(shift-1));
5449     jccb(Assembler::zero, L_fill_byte);
5450     movw(Address(to, 0), value);
5451     if (t == T_BYTE) {
5452       addptr(to, 2);
5453       BIND(L_fill_byte);
5454       // fill trailing byte
5455       testl(count, 1);
5456       jccb(Assembler::zero, L_exit);
5457       movb(Address(to, 0), value);
5458     } else {
5459       BIND(L_fill_byte);
5460     }
5461   } else {
5462     BIND(L_fill_2_bytes);
5463   }
5464   BIND(L_exit);
5465 }
5466 
5467 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
5468   switch(type) {
5469     case T_BYTE:
5470     case T_BOOLEAN:
5471       evpbroadcastb(dst, src, vector_len);
5472       break;
5473     case T_SHORT:
5474     case T_CHAR:
5475       evpbroadcastw(dst, src, vector_len);
5476       break;
5477     case T_INT:
5478     case T_FLOAT:
5479       evpbroadcastd(dst, src, vector_len);
5480       break;
5481     case T_LONG:
5482     case T_DOUBLE:
5483       evpbroadcastq(dst, src, vector_len);
5484       break;
5485     default:
5486       fatal("Unhandled type : %s", type2name(type));
5487       break;
5488   }
5489 }
5490 
5491 // encode char[] to byte[] in ISO_8859_1 or ASCII
5492 //   @IntrinsicCandidate
5493 //   private static int implEncodeISOArray(byte[] sa, int sp,
5494 //                                         byte[] da, int dp, int len) {
5495 //     int i = 0;
5496 //     for (; i < len; i++) {
5497 //       char c = StringUTF16.getChar(sa, sp++);
5498 //       if (c > '\u00FF')
5499 //         break;
5500 //       da[dp++] = (byte)c;
5501 //     }
5502 //     return i;
5503 //   }
5504 //
5505 //   @IntrinsicCandidate
5506 //   private static int implEncodeAsciiArray(char[] sa, int sp,
5507 //                                           byte[] da, int dp, int len) {
5508 //     int i = 0;
5509 //     for (; i < len; i++) {
5510 //       char c = sa[sp++];
5511 //       if (c >= '\u0080')
5512 //         break;
5513 //       da[dp++] = (byte)c;
5514 //     }
5515 //     return i;
5516 //   }
5517 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5518   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5519   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5520   Register tmp5, Register result, bool ascii) {
5521 
5522   // rsi: src
5523   // rdi: dst
5524   // rdx: len
5525   // rcx: tmp5
5526   // rax: result
5527   ShortBranchVerifier sbv(this);
5528   assert_different_registers(src, dst, len, tmp5, result);
5529   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5530 
5531   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5532   int short_mask = ascii ? 0xff80 : 0xff00;
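       // The masks select bits that must be zero for a char to fit in one byte:
       // any char >= 0x80 fails the ASCII check, any char > 0xFF fails the
       // ISO-8859-1 check.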
5533 
5534   // set result
5535   xorl(result, result);
5536   // check for zero length
5537   testl(len, len);
5538   jcc(Assembler::zero, L_done);
5539 
5540   movl(result, len);
5541 
5542   // Setup pointers
5543   lea(src, Address(src, len, Address::times_2)); // char[]
5544   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5545   negptr(len);
5546 
5547   if (UseSSE42Intrinsics || UseAVX >= 2) {
5548     Label L_copy_8_chars, L_copy_8_chars_exit;
5549     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5550 
5551     if (UseAVX >= 2) {
5552       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5553       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5554       movdl(tmp1Reg, tmp5);
5555       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5556       jmp(L_chars_32_check);
5557 
5558       bind(L_copy_32_chars);
5559       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5560       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5561       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5562       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5563       jccb(Assembler::notZero, L_copy_32_chars_exit);
5564       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5565       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5566       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5567 
5568       bind(L_chars_32_check);
5569       addptr(len, 32);
5570       jcc(Assembler::lessEqual, L_copy_32_chars);
5571 
5572       bind(L_copy_32_chars_exit);
5573       subptr(len, 16);
5574       jccb(Assembler::greater, L_copy_16_chars_exit);
5575 
5576     } else if (UseSSE42Intrinsics) {
5577       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5578       movdl(tmp1Reg, tmp5);
5579       pshufd(tmp1Reg, tmp1Reg, 0);
5580       jmpb(L_chars_16_check);
5581     }
5582 
5583     bind(L_copy_16_chars);
5584     if (UseAVX >= 2) {
5585       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5586       vptest(tmp2Reg, tmp1Reg);
5587       jcc(Assembler::notZero, L_copy_16_chars_exit);
5588       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5589       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5590     } else {
5591       if (UseAVX > 0) {
5592         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5593         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5594         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5595       } else {
5596         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5597         por(tmp2Reg, tmp3Reg);
5598         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5599         por(tmp2Reg, tmp4Reg);
5600       }
5601       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5602       jccb(Assembler::notZero, L_copy_16_chars_exit);
5603       packuswb(tmp3Reg, tmp4Reg);
5604     }
5605     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5606 
5607     bind(L_chars_16_check);
5608     addptr(len, 16);
5609     jcc(Assembler::lessEqual, L_copy_16_chars);
5610 
5611     bind(L_copy_16_chars_exit);
5612     if (UseAVX >= 2) {
5613       // clean upper bits of YMM registers
5614       vpxor(tmp2Reg, tmp2Reg);
5615       vpxor(tmp3Reg, tmp3Reg);
5616       vpxor(tmp4Reg, tmp4Reg);
5617       movdl(tmp1Reg, tmp5);
5618       pshufd(tmp1Reg, tmp1Reg, 0);
5619     }
5620     subptr(len, 8);
5621     jccb(Assembler::greater, L_copy_8_chars_exit);
5622 
5623     bind(L_copy_8_chars);
5624     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5625     ptest(tmp3Reg, tmp1Reg);
5626     jccb(Assembler::notZero, L_copy_8_chars_exit);
5627     packuswb(tmp3Reg, tmp1Reg);
5628     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5629     addptr(len, 8);
5630     jccb(Assembler::lessEqual, L_copy_8_chars);
5631 
5632     bind(L_copy_8_chars_exit);
5633     subptr(len, 8);
5634     jccb(Assembler::zero, L_done);
5635   }
5636 
5637   bind(L_copy_1_char);
5638   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5639   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
5640   jccb(Assembler::notZero, L_copy_1_char_exit);
5641   movb(Address(dst, len, Address::times_1, 0), tmp5);
5642   addptr(len, 1);
5643   jccb(Assembler::less, L_copy_1_char);
5644 
5645   bind(L_copy_1_char_exit);
5646   addptr(result, len); // len is the negative count of unprocessed elements
5647 
5648   bind(L_done);
5649 }
5650 
5651 #ifdef _LP64
5652 /**
5653  * Helper for multiply_to_len().
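      * Computes the 128-bit accumulation dest_hi:dest_lo += src1 + src2,
      * propagating carries out of the low word into the high word.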
5654  */
5655 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5656   addq(dest_lo, src1);
5657   adcq(dest_hi, 0);
5658   addq(dest_lo, src2);
5659   adcq(dest_hi, 0);
5660 }
5661 
5662 /**
5663  * Multiply 64 bit by 64 bit first loop.
5664  */
5665 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5666                                            Register y, Register y_idx, Register z,
5667                                            Register carry, Register product,
5668                                            Register idx, Register kdx) {
5669   //
5670   //  jlong carry, x[], y[], z[];
5671   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5672   //    huge_128 product = y[idx] * x[xstart] + carry;
5673   //    z[kdx] = (jlong)product;
5674   //    carry  = (jlong)(product >>> 64);
5675   //  }
5676   //  z[xstart] = carry;
5677   //
5678 
5679   Label L_first_loop, L_first_loop_exit;
5680   Label L_one_x, L_one_y, L_multiply;
5681 
5682   decrementl(xstart);
5683   jcc(Assembler::negative, L_one_x);
5684 
5685   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5686   rorq(x_xstart, 32); // convert big-endian to little-endian
5687 
5688   bind(L_first_loop);
5689   decrementl(idx);
5690   jcc(Assembler::negative, L_first_loop_exit);
5691   decrementl(idx);
5692   jcc(Assembler::negative, L_one_y);
5693   movq(y_idx, Address(y, idx, Address::times_4,  0));
5694   rorq(y_idx, 32); // convert big-endian to little-endian
5695   bind(L_multiply);
5696   movq(product, x_xstart);
5697   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5698   addq(product, carry);
5699   adcq(rdx, 0);
5700   subl(kdx, 2);
5701   movl(Address(z, kdx, Address::times_4,  4), product);
5702   shrq(product, 32);
5703   movl(Address(z, kdx, Address::times_4,  0), product);
5704   movq(carry, rdx);
5705   jmp(L_first_loop);
5706 
5707   bind(L_one_y);
5708   movl(y_idx, Address(y,  0));
5709   jmp(L_multiply);
5710 
5711   bind(L_one_x);
5712   movl(x_xstart, Address(x,  0));
5713   jmp(L_first_loop);
5714 
5715   bind(L_first_loop_exit);
5716 }
5717 
5718 /**
5719  * Multiply 64 bit by 64 bit and add 128 bit.
5720  */
5721 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5722                                             Register yz_idx, Register idx,
5723                                             Register carry, Register product, int offset) {
5724   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5725   //     z[kdx] = (jlong)product;
5726 
5727   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5728   rorq(yz_idx, 32); // convert big-endian to little-endian
5729   movq(product, x_xstart);
5730   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5731   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5732   rorq(yz_idx, 32); // convert big-endian to little-endian
5733 
5734   add2_with_carry(rdx, product, carry, yz_idx);
5735 
5736   movl(Address(z, idx, Address::times_4,  offset+4), product);
5737   shrq(product, 32);
5738   movl(Address(z, idx, Address::times_4,  offset), product);
5739 
5740 }
5741 
5742 /**
5743  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5744  */
5745 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5746                                              Register yz_idx, Register idx, Register jdx,
5747                                              Register carry, Register product,
5748                                              Register carry2) {
5749   //   jlong carry, x[], y[], z[];
5750   //   int kdx = ystart+1;
5751   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5752   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5753   //     z[kdx+idx+1] = (jlong)product;
5754   //     jlong carry2  = (jlong)(product >>> 64);
5755   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5756   //     z[kdx+idx] = (jlong)product;
5757   //     carry  = (jlong)(product >>> 64);
5758   //   }
5759   //   idx += 2;
5760   //   if (idx > 0) {
5761   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5762   //     z[kdx+idx] = (jlong)product;
5763   //     carry  = (jlong)(product >>> 64);
5764   //   }
5765   //
5766 
5767   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5768 
5769   movl(jdx, idx);
5770   andl(jdx, 0xFFFFFFFC);
5771   shrl(jdx, 2);
5772 
5773   bind(L_third_loop);
5774   subl(jdx, 1);
5775   jcc(Assembler::negative, L_third_loop_exit);
5776   subl(idx, 4);
5777 
5778   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5779   movq(carry2, rdx);
5780 
5781   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5782   movq(carry, rdx);
5783   jmp(L_third_loop);
5784 
5785   bind (L_third_loop_exit);
5786 
5787   andl (idx, 0x3);
5788   jcc(Assembler::zero, L_post_third_loop_done);
5789 
5790   Label L_check_1;
5791   subl(idx, 2);
5792   jcc(Assembler::negative, L_check_1);
5793 
5794   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5795   movq(carry, rdx);
5796 
5797   bind (L_check_1);
5798   addl (idx, 0x2);
5799   andl (idx, 0x1);
5800   subl(idx, 1);
5801   jcc(Assembler::negative, L_post_third_loop_done);
5802 
5803   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5804   movq(product, x_xstart);
5805   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5806   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5807 
5808   add2_with_carry(rdx, product, yz_idx, carry);
5809 
5810   movl(Address(z, idx, Address::times_4,  0), product);
5811   shrq(product, 32);
5812 
5813   shlq(rdx, 32);
5814   orq(product, rdx);
5815   movq(carry, product);
5816 
5817   bind(L_post_third_loop_done);
5818 }
5819 
5820 /**
5821  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5822  *
5823  */
5824 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
5825                                                   Register carry, Register carry2,
5826                                                   Register idx, Register jdx,
5827                                                   Register yz_idx1, Register yz_idx2,
5828                                                   Register tmp, Register tmp3, Register tmp4) {
5829   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
5830 
5831   //   jlong carry, x[], y[], z[];
5832   //   int kdx = ystart+1;
5833   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5834   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
5835   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5836   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
5837   //     carry  = (jlong)(tmp4 >>> 64);
5838   //     z[kdx+idx+1] = (jlong)tmp3;
5839   //     z[kdx+idx] = (jlong)tmp4;
5840   //   }
5841   //   idx += 2;
5842   //   if (idx > 0) {
5843   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
5844   //     z[kdx+idx] = (jlong)yz_idx1;
5845   //     carry  = (jlong)(yz_idx1 >>> 64);
5846   //   }
5847   //
5848 
5849   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5850 
5851   movl(jdx, idx);
5852   andl(jdx, 0xFFFFFFFC);
5853   shrl(jdx, 2);
5854 
5855   bind(L_third_loop);
5856   subl(jdx, 1);
5857   jcc(Assembler::negative, L_third_loop_exit);
5858   subl(idx, 4);
5859 
5860   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
5861   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5862   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
5863   rorxq(yz_idx2, yz_idx2, 32);
5864 
5865   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
5866   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
5867 
5868   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
5869   rorxq(yz_idx1, yz_idx1, 32);
5870   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5871   rorxq(yz_idx2, yz_idx2, 32);
5872 
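       // With ADX, adcx/adox maintain two independent carry chains (CF and OF),
       // so the two 128-bit accumulations below can be interleaved without
       // serializing on a single carry flag.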
5873   if (VM_Version::supports_adx()) {
5874     adcxq(tmp3, carry);
5875     adoxq(tmp3, yz_idx1);
5876 
5877     adcxq(tmp4, tmp);
5878     adoxq(tmp4, yz_idx2);
5879 
5880     movl(carry, 0); // does not affect flags
5881     adcxq(carry2, carry);
5882     adoxq(carry2, carry);
5883   } else {
5884     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
5885     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
5886   }
5887   movq(carry, carry2);
5888 
5889   movl(Address(z, idx, Address::times_4, 12), tmp3);
5890   shrq(tmp3, 32);
5891   movl(Address(z, idx, Address::times_4,  8), tmp3);
5892 
5893   movl(Address(z, idx, Address::times_4,  4), tmp4);
5894   shrq(tmp4, 32);
5895   movl(Address(z, idx, Address::times_4,  0), tmp4);
5896 
5897   jmp(L_third_loop);
5898 
5899   bind (L_third_loop_exit);
5900 
5901   andl (idx, 0x3);
5902   jcc(Assembler::zero, L_post_third_loop_done);
5903 
5904   Label L_check_1;
5905   subl(idx, 2);
5906   jcc(Assembler::negative, L_check_1);
5907 
5908   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
5909   rorxq(yz_idx1, yz_idx1, 32);
5910   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
5911   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5912   rorxq(yz_idx2, yz_idx2, 32);
5913 
5914   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
5915 
5916   movl(Address(z, idx, Address::times_4,  4), tmp3);
5917   shrq(tmp3, 32);
5918   movl(Address(z, idx, Address::times_4,  0), tmp3);
5919   movq(carry, tmp4);
5920 
5921   bind (L_check_1);
5922   addl (idx, 0x2);
5923   andl (idx, 0x1);
5924   subl(idx, 1);
5925   jcc(Assembler::negative, L_post_third_loop_done);
5926   movl(tmp4, Address(y, idx, Address::times_4,  0));
5927   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
5928   movl(tmp4, Address(z, idx, Address::times_4,  0));
5929 
5930   add2_with_carry(carry2, tmp3, tmp4, carry);
5931 
5932   movl(Address(z, idx, Address::times_4,  0), tmp3);
5933   shrq(tmp3, 32);
5934 
5935   shlq(carry2, 32);
5936   orq(tmp3, carry2);
5937   movq(carry, tmp3);
5938 
5939   bind(L_post_third_loop_done);
5940 }
5941 
5942 /**
5943  * Code for BigInteger::multiplyToLen() intrinsic.
5944  *
5945  * rdi: x
5946  * rax: xlen
5947  * rsi: y
5948  * rcx: ylen
5949  * r8:  z
5950  * r11: zlen
5951  * r12: tmp1
5952  * r13: tmp2
5953  * r14: tmp3
5954  * r15: tmp4
5955  * rbx: tmp5
5956  *
5957  */
5958 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
5959                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
5960   ShortBranchVerifier sbv(this);
5961   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
5962 
5963   push(tmp1);
5964   push(tmp2);
5965   push(tmp3);
5966   push(tmp4);
5967   push(tmp5);
5968 
5969   push(xlen);
5970   push(zlen);
5971 
5972   const Register idx = tmp1;
5973   const Register kdx = tmp2;
5974   const Register xstart = tmp3;
5975 
5976   const Register y_idx = tmp4;
5977   const Register carry = tmp5;
5978   const Register product  = xlen;
5979   const Register x_xstart = zlen;  // reuse register
5980 
5981   // First Loop.
5982   //
5983   //  final static long LONG_MASK = 0xffffffffL;
5984   //  int xstart = xlen - 1;
5985   //  int ystart = ylen - 1;
5986   //  long carry = 0;
5987   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5988   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5989   //    z[kdx] = (int)product;
5990   //    carry = product >>> 32;
5991   //  }
5992   //  z[xstart] = (int)carry;
5993   //
5994 
5995   movl(idx, ylen);      // idx = ylen;
5996   movl(kdx, zlen);      // kdx = xlen+ylen;
5997   xorq(carry, carry);   // carry = 0;
5998 
5999   Label L_done;
6000 
6001   movl(xstart, xlen);
6002   decrementl(xstart);
6003   jcc(Assembler::negative, L_done);
6004 
6005   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6006 
6007   Label L_second_loop;
6008   testl(kdx, kdx);
6009   jcc(Assembler::zero, L_second_loop);
6010 
6011   Label L_carry;
6012   subl(kdx, 1);
6013   jcc(Assembler::zero, L_carry);
6014 
6015   movl(Address(z, kdx, Address::times_4,  0), carry);
6016   shrq(carry, 32);
6017   subl(kdx, 1);
6018 
6019   bind(L_carry);
6020   movl(Address(z, kdx, Address::times_4,  0), carry);
6021 
6022   // Second and third (nested) loops.
6023   //
6024   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6025   //   carry = 0;
6026   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6027   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6028   //                    (z[k] & LONG_MASK) + carry;
6029   //     z[k] = (int)product;
6030   //     carry = product >>> 32;
6031   //   }
6032   //   z[i] = (int)carry;
6033   // }
6034   //
6035   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6036 
6037   const Register jdx = tmp1;
6038 
6039   bind(L_second_loop);
6040   xorl(carry, carry);    // carry = 0;
6041   movl(jdx, ylen);       // j = ystart+1
6042 
6043   subl(xstart, 1);       // i = xstart-1;
6044   jcc(Assembler::negative, L_done);
6045 
6046   push (z);
6047 
6048   Label L_last_x;
6049   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6050   subl(xstart, 1);       // i = xstart-1;
6051   jcc(Assembler::negative, L_last_x);
6052 
6053   if (UseBMI2Instructions) {
6054     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6055     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6056   } else {
6057     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6058     rorq(x_xstart, 32);  // convert big-endian to little-endian
6059   }
6060 
6061   Label L_third_loop_prologue;
6062   bind(L_third_loop_prologue);
6063 
6064   push (x);
6065   push (xstart);
6066   push (ylen);
6067 
6068 
6069   if (UseBMI2Instructions) {
6070     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6071   } else { // !UseBMI2Instructions
6072     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6073   }
6074 
6075   pop(ylen);
6076   pop(xlen);
6077   pop(x);
6078   pop(z);
6079 
6080   movl(tmp3, xlen);
6081   addl(tmp3, 1);
6082   movl(Address(z, tmp3, Address::times_4,  0), carry);
6083   subl(tmp3, 1);
6084   jccb(Assembler::negative, L_done);
6085 
6086   shrq(carry, 32);
6087   movl(Address(z, tmp3, Address::times_4,  0), carry);
6088   jmp(L_second_loop);
6089 
6090   // The following infrequent code is moved outside the loops.
6091   bind(L_last_x);
6092   if (UseBMI2Instructions) {
6093     movl(rdx, Address(x,  0));
6094   } else {
6095     movl(x_xstart, Address(x,  0));
6096   }
6097   jmp(L_third_loop_prologue);
6098 
6099   bind(L_done);
6100 
6101   pop(zlen);
6102   pop(xlen);
6103 
6104   pop(tmp5);
6105   pop(tmp4);
6106   pop(tmp3);
6107   pop(tmp2);
6108   pop(tmp1);
6109 }
6110 
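
// Reference sketch (illustrative only, not emitted code): the stub generated below
// compares 'length' array elements of obja and objb, where each element is
// (1 << log2_array_indxscale) bytes wide, and leaves in 'result' the index of the
// first mismatching element, or -1 if the two ranges are identical. Roughly:
//
//   int64_t vectorized_mismatch_ref(const uint8_t* a, const uint8_t* b,
//                                   int64_t length, int log2scale) {
//     int64_t bytes = length << log2scale;
//     for (int64_t i = 0; i < bytes; i++) {
//       if (a[i] != b[i]) {
//         return i >> log2scale;   // index of the first mismatching element
//       }
//     }
//     return -1;                   // same till end
//   }
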
6111 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6112   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6113   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6114   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6115   Label VECTOR8_TAIL, VECTOR4_TAIL;
6116   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6117   Label SAME_TILL_END, DONE;
6118   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6119 
6120   //scale is in rcx in both Win64 and Unix
6121   ShortBranchVerifier sbv(this);
6122 
6123   shlq(length);
6124   xorq(result, result);
6125 
6126   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6127       VM_Version::supports_avx512vlbw()) {
6128     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6129 
6130     cmpq(length, 64);
6131     jcc(Assembler::less, VECTOR32_TAIL);
6132 
6133     movq(tmp1, length);
6134     andq(tmp1, 0x3F);      // tail count
6135     andq(length, ~(0x3F)); //vector count
6136 
6137     bind(VECTOR64_LOOP);
6138     // AVX512 code to compare 64 byte vectors.
6139     evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
6140     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6141     kortestql(k7, k7);
6142     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6143     addq(result, 64);
6144     subq(length, 64);
6145     jccb(Assembler::notZero, VECTOR64_LOOP);
6146 
6148     testq(tmp1, tmp1);
6149     jcc(Assembler::zero, SAME_TILL_END);
6150 
6151     //bind(VECTOR64_TAIL);
6152     // AVX512 code to compare up to 63 byte vectors.
6153     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6154     shlxq(tmp2, tmp2, tmp1);
6155     notq(tmp2);
6156     kmovql(k3, tmp2);
6157 
6158     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6159     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6160 
6161     ktestql(k7, k3);
6162     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6163 
6164     bind(VECTOR64_NOT_EQUAL);
6165     kmovql(tmp1, k7);
6166     notq(tmp1);
6167     tzcntq(tmp1, tmp1);
6168     addq(result, tmp1);
6169     shrq(result);
6170     jmp(DONE);
6171     bind(VECTOR32_TAIL);
6172   }
6173 
6174   cmpq(length, 8);
6175   jcc(Assembler::equal, VECTOR8_LOOP);
6176   jcc(Assembler::less, VECTOR4_TAIL);
6177 
6178   if (UseAVX >= 2) {
6179     Label VECTOR16_TAIL, VECTOR32_LOOP;
6180 
6181     cmpq(length, 16);
6182     jcc(Assembler::equal, VECTOR16_LOOP);
6183     jcc(Assembler::less, VECTOR8_LOOP);
6184 
6185     cmpq(length, 32);
6186     jccb(Assembler::less, VECTOR16_TAIL);
6187 
6188     subq(length, 32);
6189     bind(VECTOR32_LOOP);
6190     vmovdqu(rymm0, Address(obja, result));
6191     vmovdqu(rymm1, Address(objb, result));
6192     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6193     vptest(rymm2, rymm2);
6194     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6195     addq(result, 32);
6196     subq(length, 32);
6197     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6198     addq(length, 32);
6199     jcc(Assembler::equal, SAME_TILL_END);
6200     //falling through if less than 32 bytes left; close the branch here.
6201 
6202     bind(VECTOR16_TAIL);
6203     cmpq(length, 16);
6204     jccb(Assembler::less, VECTOR8_TAIL);
6205     bind(VECTOR16_LOOP);
6206     movdqu(rymm0, Address(obja, result));
6207     movdqu(rymm1, Address(objb, result));
6208     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6209     ptest(rymm2, rymm2);
6210     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6211     addq(result, 16);
6212     subq(length, 16);
6213     jcc(Assembler::equal, SAME_TILL_END);
6214     //falling through if less than 16 bytes left
6215   } else {//regular intrinsics
6216 
6217     cmpq(length, 16);
6218     jccb(Assembler::less, VECTOR8_TAIL);
6219 
6220     subq(length, 16);
6221     bind(VECTOR16_LOOP);
6222     movdqu(rymm0, Address(obja, result));
6223     movdqu(rymm1, Address(objb, result));
6224     pxor(rymm0, rymm1);
6225     ptest(rymm0, rymm0);
6226     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6227     addq(result, 16);
6228     subq(length, 16);
6229     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6230     addq(length, 16);
6231     jcc(Assembler::equal, SAME_TILL_END);
6232     //falling through if less than 16 bytes left
6233   }
6234 
6235   bind(VECTOR8_TAIL);
6236   cmpq(length, 8);
6237   jccb(Assembler::less, VECTOR4_TAIL);
6238   bind(VECTOR8_LOOP);
6239   movq(tmp1, Address(obja, result));
6240   movq(tmp2, Address(objb, result));
6241   xorq(tmp1, tmp2);
6242   testq(tmp1, tmp1);
6243   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6244   addq(result, 8);
6245   subq(length, 8);
6246   jcc(Assembler::equal, SAME_TILL_END);
6247   //falling through if less than 8 bytes left
6248 
6249   bind(VECTOR4_TAIL);
6250   cmpq(length, 4);
6251   jccb(Assembler::less, BYTES_TAIL);
6252   bind(VECTOR4_LOOP);
6253   movl(tmp1, Address(obja, result));
6254   xorl(tmp1, Address(objb, result));
6255   testl(tmp1, tmp1);
6256   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6257   addq(result, 4);
6258   subq(length, 4);
6259   jcc(Assembler::equal, SAME_TILL_END);
6260   //falling through if less than 4 bytes left
6261 
6262   bind(BYTES_TAIL);
6263   bind(BYTES_LOOP);
6264   load_unsigned_byte(tmp1, Address(obja, result));
6265   load_unsigned_byte(tmp2, Address(objb, result));
6266   xorl(tmp1, tmp2);
6267   testl(tmp1, tmp1);
6268   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6269   decq(length);
6270   jcc(Assembler::zero, SAME_TILL_END);
6271   incq(result);
6272   load_unsigned_byte(tmp1, Address(obja, result));
6273   load_unsigned_byte(tmp2, Address(objb, result));
6274   xorl(tmp1, tmp2);
6275   testl(tmp1, tmp1);
6276   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6277   decq(length);
6278   jcc(Assembler::zero, SAME_TILL_END);
6279   incq(result);
6280   load_unsigned_byte(tmp1, Address(obja, result));
6281   load_unsigned_byte(tmp2, Address(objb, result));
6282   xorl(tmp1, tmp2);
6283   testl(tmp1, tmp1);
6284   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6285   jmp(SAME_TILL_END);
6286 
6287   if (UseAVX >= 2) {
6288     bind(VECTOR32_NOT_EQUAL);
6289     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6290     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6291     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6292     vpmovmskb(tmp1, rymm0);
6293     bsfq(tmp1, tmp1);
6294     addq(result, tmp1);
6295     shrq(result);
6296     jmp(DONE);
6297   }
6298 
6299   bind(VECTOR16_NOT_EQUAL);
6300   if (UseAVX >= 2) {
6301     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6302     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6303     pxor(rymm0, rymm2);
6304   } else {
6305     pcmpeqb(rymm2, rymm2);
6306     pxor(rymm0, rymm1);
6307     pcmpeqb(rymm0, rymm1);
6308     pxor(rymm0, rymm2);
6309   }
6310   pmovmskb(tmp1, rymm0);
6311   bsfq(tmp1, tmp1);
6312   addq(result, tmp1);
6313   shrq(result);
6314   jmpb(DONE);
6315 
6316   bind(VECTOR8_NOT_EQUAL);
6317   bind(VECTOR4_NOT_EQUAL);
6318   bsfq(tmp1, tmp1);
6319   shrq(tmp1, 3);
6320   addq(result, tmp1);
6321   bind(BYTES_NOT_EQUAL);
6322   shrq(result);
6323   jmpb(DONE);
6324 
6325   bind(SAME_TILL_END);
6326   mov64(result, -1);
6327 
6328   bind(DONE);
6329 }
6330 
6331 //Helper functions for square_to_len()
6332 
6333 /**
6334  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6335  * Preserves x and z and modifies rest of the registers.
6336  */
6337 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6338   // Perform square and right shift by 1
6339   // Handle odd xlen case first, then for even xlen do the following
6340   // jlong carry = 0;
6341   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6342   //     huge_128 product = x[j:j+1] * x[j:j+1];
6343   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6344   //     z[i+2:i+3] = (jlong)(product >>> 1);
6345   //     carry = (jlong)product;
6346   // }
6347 
6348   xorq(tmp5, tmp5);     // carry
6349   xorq(rdxReg, rdxReg);
6350   xorl(tmp1, tmp1);     // index for x
6351   xorl(tmp4, tmp4);     // index for z
6352 
6353   Label L_first_loop, L_first_loop_exit;
6354 
6355   testl(xlen, 1);
6356   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6357 
6358   // Square and right shift by 1 the odd element using 32 bit multiply
6359   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6360   imulq(raxReg, raxReg);
6361   shrq(raxReg, 1);
6362   adcq(tmp5, 0);
6363   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6364   incrementl(tmp1);
6365   addl(tmp4, 2);
6366 
6367   // Square and  right shift by 1 the rest using 64 bit multiply
6368   bind(L_first_loop);
6369   cmpptr(tmp1, xlen);
6370   jccb(Assembler::equal, L_first_loop_exit);
6371 
6372   // Square
6373   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6374   rorq(raxReg, 32);    // convert big-endian to little-endian
6375   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6376 
6377   // Right shift by 1 and save carry
6378   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6379   rcrq(rdxReg, 1);
6380   rcrq(raxReg, 1);
6381   adcq(tmp5, 0);
6382 
6383   // Store result in z
6384   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6385   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6386 
6387   // Update indices for x and z
6388   addl(tmp1, 2);
6389   addl(tmp4, 4);
6390   jmp(L_first_loop);
6391 
6392   bind(L_first_loop_exit);
6393 }
6394 
6395 
6396 /**
6397  * Perform the following multiply add operation using BMI2 instructions
6398  * carry:sum = sum + op1*op2 + carry
6399  * op2 should be in rdx
6400  * op2 is preserved, all other registers are modified
6401  */
6402 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6403   // assert op2 is rdx
6404   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6405   addq(sum, carry);
6406   adcq(tmp2, 0);
6407   addq(sum, op1);
6408   adcq(tmp2, 0);
6409   movq(carry, tmp2);
6410 }
6411 
6412 /**
6413  * Perform the following multiply add operation:
6414  * carry:sum = sum + op1*op2 + carry
6415  * Preserves op1, op2 and modifies rest of registers
6416  */
6417 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6418   // rdx:rax = op1 * op2
6419   movq(raxReg, op2);
6420   mulq(op1);
6421 
6422   //  rdx:rax = sum + carry + rdx:rax
6423   addq(sum, carry);
6424   adcq(rdxReg, 0);
6425   addq(sum, raxReg);
6426   adcq(rdxReg, 0);
6427 
6428   // carry:sum = rdx:sum
6429   movq(carry, rdxReg);
6430 }
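
// Both multiply-add helpers above compute the same step: carry:sum = sum + op1*op2 + carry.
// A minimal reference sketch (illustrative only), assuming a compiler-provided
// unsigned __int128 type:
//
//   static inline void multiply_add_64_ref(uint64_t& sum, uint64_t op1, uint64_t op2,
//                                          uint64_t& carry) {
//     unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
//     sum   = (uint64_t)t;          // low 64 bits, written back to z[] by the caller
//     carry = (uint64_t)(t >> 64);  // high 64 bits, carried into the next iteration
//   }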
6431 
6432 /**
6433  * Add 64 bit long carry into z[] with carry propagation.
6434  * Preserves z and carry register values and modifies rest of registers.
6435  *
6436  */
6437 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6438   Label L_fourth_loop, L_fourth_loop_exit;
6439 
6440   movl(tmp1, 1);
6441   subl(zlen, 2);
6442   addq(Address(z, zlen, Address::times_4, 0), carry);
6443 
6444   bind(L_fourth_loop);
6445   jccb(Assembler::carryClear, L_fourth_loop_exit);
6446   subl(zlen, 2);
6447   jccb(Assembler::negative, L_fourth_loop_exit);
6448   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6449   jmp(L_fourth_loop);
6450   bind(L_fourth_loop_exit);
6451 }
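
// A rough reference for the carry-propagation loop above (illustrative only); chunk(i)
// and add_overflows() are hypothetical helpers standing for the 64-bit memory add on
// ints z[i], z[i+1] and its resulting carry flag:
//
//   zlen -= 2;
//   bool c = add_overflows(chunk(zlen), carry);   // z[zlen:zlen+1] += carry
//   while (c) {
//     zlen -= 2;
//     if (zlen < 0) break;
//     c = add_overflows(chunk(zlen), 1);          // keep propagating while the add wraps
//   }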
6452 
6453 /**
6454  * Shift z[] left by 1 bit.
6455  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6456  *
6457  */
6458 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6459 
6460   Label L_fifth_loop, L_fifth_loop_exit;
6461 
6462   // Fifth loop
6463   // Perform primitiveLeftShift(z, zlen, 1)
6464 
6465   const Register prev_carry = tmp1;
6466   const Register new_carry = tmp4;
6467   const Register value = tmp2;
6468   const Register zidx = tmp3;
6469 
6470   // int zidx, carry;
6471   // long value;
6472   // carry = 0;
6473   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6474   //    (carry:value)  = (z[i] << 1) | carry ;
6475   //    z[i] = value;
6476   // }
6477 
6478   movl(zidx, zlen);
6479   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6480 
6481   bind(L_fifth_loop);
6482   decl(zidx);  // Use decl to preserve carry flag
6483   decl(zidx);
6484   jccb(Assembler::negative, L_fifth_loop_exit);
6485 
6486   if (UseBMI2Instructions) {
6487      movq(value, Address(z, zidx, Address::times_4, 0));
6488      rclq(value, 1);
6489      rorxq(value, value, 32);
6490      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6491   }
6492   else {
6493     // clear new_carry
6494     xorl(new_carry, new_carry);
6495 
6496     // Shift z[i] by 1, or in previous carry and save new carry
6497     movq(value, Address(z, zidx, Address::times_4, 0));
6498     shlq(value, 1);
6499     adcl(new_carry, 0);
6500 
6501     orq(value, prev_carry);
6502     rorq(value, 0x20);
6503     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6504 
6505     // Set previous carry = new carry
6506     movl(prev_carry, new_carry);
6507   }
6508   jmp(L_fifth_loop);
6509 
6510   bind(L_fifth_loop_exit);
6511 }
6512 
6513 
6514 /**
6515  * Code for BigInteger::squareToLen() intrinsic
6516  *
6517  * rdi: x
6518  * rsi: len
6519  * r8:  z
6520  * rcx: zlen
6521  * r12: tmp1
6522  * r13: tmp2
6523  * r14: tmp3
6524  * r15: tmp4
6525  * rbx: tmp5
6526  *
6527  */
6528 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6529 
6530   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6531   push(tmp1);
6532   push(tmp2);
6533   push(tmp3);
6534   push(tmp4);
6535   push(tmp5);
6536 
6537   // First loop
6538   // Store the squares, right shifted one bit (i.e., divided by 2).
6539   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6540 
6541   // Add in off-diagonal sums.
6542   //
6543   // Second, third (nested) and fourth loops.
6544   // zlen +=2;
6545   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6546   //    carry = 0;
6547   //    long op2 = x[xidx:xidx+1];
6548   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6549   //       k -= 2;
6550   //       long op1 = x[j:j+1];
6551   //       long sum = z[k:k+1];
6552   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6553   //       z[k:k+1] = sum;
6554   //    }
6555   //    add_one_64(z, k, carry, tmp_regs);
6556   // }
6557 
6558   const Register carry = tmp5;
6559   const Register sum = tmp3;
6560   const Register op1 = tmp4;
6561   Register op2 = tmp2;
6562 
6563   push(zlen);
6564   push(len);
6565   addl(zlen,2);
6566   bind(L_second_loop);
6567   xorq(carry, carry);
6568   subl(zlen, 4);
6569   subl(len, 2);
6570   push(zlen);
6571   push(len);
6572   cmpl(len, 0);
6573   jccb(Assembler::lessEqual, L_second_loop_exit);
6574 
6575   // Multiply an array by one 64 bit long.
6576   if (UseBMI2Instructions) {
6577     op2 = rdxReg;
6578     movq(op2, Address(x, len, Address::times_4,  0));
6579     rorxq(op2, op2, 32);
6580   }
6581   else {
6582     movq(op2, Address(x, len, Address::times_4,  0));
6583     rorq(op2, 32);
6584   }
6585 
6586   bind(L_third_loop);
6587   decrementl(len);
6588   jccb(Assembler::negative, L_third_loop_exit);
6589   decrementl(len);
6590   jccb(Assembler::negative, L_last_x);
6591 
6592   movq(op1, Address(x, len, Address::times_4,  0));
6593   rorq(op1, 32);
6594 
6595   bind(L_multiply);
6596   subl(zlen, 2);
6597   movq(sum, Address(z, zlen, Address::times_4,  0));
6598 
6599   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
6600   if (UseBMI2Instructions) {
6601     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6602   }
6603   else {
6604     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6605   }
6606 
6607   movq(Address(z, zlen, Address::times_4, 0), sum);
6608 
6609   jmp(L_third_loop);
6610   bind(L_third_loop_exit);
6611 
6612   // Fourth loop
6613   // Add 64 bit long carry into z with carry propagation.
6614   // Uses the offset-adjusted zlen.
6615   add_one_64(z, zlen, carry, tmp1);
6616 
6617   pop(len);
6618   pop(zlen);
6619   jmp(L_second_loop);
6620 
6621   // The following infrequent code is moved outside the loops.
6622   bind(L_last_x);
6623   movl(op1, Address(x, 0));
6624   jmp(L_multiply);
6625 
6626   bind(L_second_loop_exit);
6627   pop(len);
6628   pop(zlen);
6629   pop(len);
6630   pop(zlen);
6631 
6632   // Fifth loop
6633   // Shift z left 1 bit.
6634   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6635 
6636   // z[zlen-1] |= x[len-1] & 1;
6637   movl(tmp3, Address(x, len, Address::times_4, -4));
6638   andl(tmp3, 1);
6639   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6640 
6641   pop(tmp5);
6642   pop(tmp4);
6643   pop(tmp3);
6644   pop(tmp2);
6645   pop(tmp1);
6646 }
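
// The routine above follows the usual squareToLen decomposition: every off-diagonal
// product x[i]*x[j] (i != j) occurs twice in x*x, so the code stores the diagonal
// squares halved, adds each off-diagonal product once, doubles the whole result, and
// finally restores the low bit lost in the initial right shift. Roughly (illustrative only):
//
//   z  = (sum of x[i]^2 terms) >> 1;         // square_rshift
//   z += sum of x[i]*x[j] terms with i < j;  // second/third/fourth loops
//   z <<= 1;                                 // lshift_by_1
//   z[zlen-1] |= x[len-1] & 1;               // recover the dropped low bit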
6647 
6648 /**
6649  * Helper function for mul_add()
6650  * Multiply the in[] by int k and add to out[] starting at offset offs using
6651  * 128 bit by 32 bit multiply and return the carry in tmp5.
6652  * Only the quad-int-aligned portion of in[] is processed by this function.
6653  * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
6654  * This function preserves the out, in and k registers.
6655  * len and offset point to the appropriate index in "in" and "out" respectively.
6656  * tmp5 has the carry.
6657  * other registers are temporary and are modified.
6658  *
6659  */
6660 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6661   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6662   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6663 
6664   Label L_first_loop, L_first_loop_exit;
6665 
6666   movl(tmp1, len);
6667   shrl(tmp1, 2);
6668 
6669   bind(L_first_loop);
6670   subl(tmp1, 1);
6671   jccb(Assembler::negative, L_first_loop_exit);
6672 
6673   subl(len, 4);
6674   subl(offset, 4);
6675 
6676   Register op2 = tmp2;
6677   const Register sum = tmp3;
6678   const Register op1 = tmp4;
6679   const Register carry = tmp5;
6680 
6681   if (UseBMI2Instructions) {
6682     op2 = rdxReg;
6683   }
6684 
6685   movq(op1, Address(in, len, Address::times_4,  8));
6686   rorq(op1, 32);
6687   movq(sum, Address(out, offset, Address::times_4,  8));
6688   rorq(sum, 32);
6689   if (UseBMI2Instructions) {
6690     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6691   }
6692   else {
6693     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6694   }
6695   // Store back in big endian from little endian
6696   rorq(sum, 0x20);
6697   movq(Address(out, offset, Address::times_4,  8), sum);
6698 
6699   movq(op1, Address(in, len, Address::times_4,  0));
6700   rorq(op1, 32);
6701   movq(sum, Address(out, offset, Address::times_4,  0));
6702   rorq(sum, 32);
6703   if (UseBMI2Instructions) {
6704     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6705   }
6706   else {
6707     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6708   }
6709   // Store back in big endian from little endian
6710   rorq(sum, 0x20);
6711   movq(Address(out, offset, Address::times_4,  0), sum);
6712 
6713   jmp(L_first_loop);
6714   bind(L_first_loop_exit);
6715 }
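
// Reference for the unrolled loop above (illustrative only): each iteration consumes
// four ints of in[] and out[] as two big-endian 64-bit chunks, higher-addressed chunk
// first, with the carry flowing from one multiply-add into the next:
//
//   for (int cnt = len >> 2; cnt > 0; cnt--) {
//     len -= 4; offs -= 4;
//     carry:out[offs+2:offs+3] = out[offs+2:offs+3] + in[len+2:len+3] * k + carry;
//     carry:out[offs  :offs+1] = out[offs  :offs+1] + in[len  :len+1] * k + carry;
//   }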
6716 
6717 /**
6718  * Code for BigInteger::mulAdd() intrinsic
6719  *
6720  * rdi: out
6721  * rsi: in
6722  * r11: offs (out.length - offset)
6723  * rcx: len
6724  * r8:  k
6725  * r12: tmp1
6726  * r13: tmp2
6727  * r14: tmp3
6728  * r15: tmp4
6729  * rbx: tmp5
6730  * Multiply the in[] by word k and add to out[], return the carry in rax
6731  */
6732 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6733    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6734    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6735 
6736   Label L_carry, L_last_in, L_done;
6737 
6738 // carry = 0;
6739 // for (int j=len-1; j >= 0; j--) {
6740 //    long product = (in[j] & LONG_MASK) * kLong +
6741 //                   (out[offs] & LONG_MASK) + carry;
6742 //    out[offs--] = (int)product;
6743 //    carry = product >>> 32;
6744 // }
6745 //
6746   push(tmp1);
6747   push(tmp2);
6748   push(tmp3);
6749   push(tmp4);
6750   push(tmp5);
6751 
6752   Register op2 = tmp2;
6753   const Register sum = tmp3;
6754   const Register op1 = tmp4;
6755   const Register carry =  tmp5;
6756 
6757   if (UseBMI2Instructions) {
6758     op2 = rdxReg;
6759     movl(op2, k);
6760   }
6761   else {
6762     movl(op2, k);
6763   }
6764 
6765   xorq(carry, carry);
6766 
6767   //First loop
6768 
6769   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6770   //The carry is in tmp5
6771   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6772 
6773   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6774   decrementl(len);
6775   jccb(Assembler::negative, L_carry);
6776   decrementl(len);
6777   jccb(Assembler::negative, L_last_in);
6778 
6779   movq(op1, Address(in, len, Address::times_4,  0));
6780   rorq(op1, 32);
6781 
6782   subl(offs, 2);
6783   movq(sum, Address(out, offs, Address::times_4,  0));
6784   rorq(sum, 32);
6785 
6786   if (UseBMI2Instructions) {
6787     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6788   }
6789   else {
6790     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6791   }
6792 
6793   // Store back in big endian from little endian
6794   rorq(sum, 0x20);
6795   movq(Address(out, offs, Address::times_4,  0), sum);
6796 
6797   testl(len, len);
6798   jccb(Assembler::zero, L_carry);
6799 
6800   //Multiply the last in[] entry, if any
6801   bind(L_last_in);
6802   movl(op1, Address(in, 0));
6803   movl(sum, Address(out, offs, Address::times_4,  -4));
6804 
6805   movl(raxReg, k);
6806   mull(op1); //tmp4 * eax -> edx:eax
6807   addl(sum, carry);
6808   adcl(rdxReg, 0);
6809   addl(sum, raxReg);
6810   adcl(rdxReg, 0);
6811   movl(carry, rdxReg);
6812 
6813   movl(Address(out, offs, Address::times_4,  -4), sum);
6814 
6815   bind(L_carry);
6816   //return tmp5/carry as carry in rax
6817   movl(rax, carry);
6818 
6819   bind(L_done);
6820   pop(tmp5);
6821   pop(tmp4);
6822   pop(tmp3);
6823   pop(tmp2);
6824   pop(tmp1);
6825 }
6826 #endif
6827 
6828 /**
6829  * Emits code to update CRC-32 with a byte value according to constants in table
6830  *
6831  * @param [in,out]crc   Register containing the crc.
6832  * @param [in]val       Register containing the byte to fold into the CRC.
6833  * @param [in]table     Register containing the table of crc constants.
6834  *
6835  * uint32_t crc;
6836  * val = crc_table[(val ^ crc) & 0xFF];
6837  * crc = val ^ (crc >> 8);
6838  *
6839  */
6840 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6841   xorl(val, crc);
6842   andl(val, 0xFF);
6843   shrl(crc, 8); // unsigned shift
6844   xorl(crc, Address(table, val, Address::times_4, 0));
6845 }
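
// Usage sketch (illustrative only): repeated over a buffer, the update above is the
// classic byte-at-a-time table-driven CRC-32, e.g. in C:
//
//   uint32_t crc32_bytes_ref(uint32_t crc, const uint8_t* buf, size_t len,
//                            const uint32_t table[256]) {
//     while (len--) {
//       crc = table[(crc ^ *buf++) & 0xFF] ^ (crc >> 8);
//     }
//     return crc;
//   }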
6846 
6847 /**
6848  * Fold 128-bit data chunk
6849  */
6850 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
6851   if (UseAVX > 0) {
6852     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
6853     vpclmulldq(xcrc, xK, xcrc); // [63:0]
6854     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
6855     pxor(xcrc, xtmp);
6856   } else {
6857     movdqa(xtmp, xcrc);
6858     pclmulhdq(xtmp, xK);   // [123:64]
6859     pclmulldq(xcrc, xK);   // [63:0]
6860     pxor(xcrc, xtmp);
6861     movdqu(xtmp, Address(buf, offset));
6862     pxor(xcrc, xtmp);
6863   }
6864 }
6865 
6866 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
6867   if (UseAVX > 0) {
6868     vpclmulhdq(xtmp, xK, xcrc);
6869     vpclmulldq(xcrc, xK, xcrc);
6870     pxor(xcrc, xbuf);
6871     pxor(xcrc, xtmp);
6872   } else {
6873     movdqa(xtmp, xcrc);
6874     pclmulhdq(xtmp, xK);
6875     pclmulldq(xcrc, xK);
6876     pxor(xcrc, xbuf);
6877     pxor(xcrc, xtmp);
6878   }
6879 }
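
// In GF(2) terms, each fold above replaces the 128-bit accumulator xcrc with
//
//   clmul(high64(xcrc), high64(xK)) ^ clmul(low64(xcrc), low64(xK)) ^ next_128_bits
//
// where clmul is the 64x64-bit carry-less multiply, so one fold advances the CRC
// state past 128 bits of input (an informal summary, not emitted code).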
6880 
6881 /**
6882  * 8-bit folds to compute 32-bit CRC
6883  *
6884  * uint64_t xcrc;
6885  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
6886  */
6887 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
6888   movdl(tmp, xcrc);
6889   andl(tmp, 0xFF);
6890   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
6891   psrldq(xcrc, 1); // unsigned shift one byte
6892   pxor(xcrc, xtmp);
6893 }
6894 
6895 /**
6896  * uint32_t crc;
6897  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
6898  */
6899 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
6900   movl(tmp, crc);
6901   andl(tmp, 0xFF);
6902   shrl(crc, 8);
6903   xorl(crc, Address(table, tmp, Address::times_4, 0));
6904 }
6905 
6906 /**
6907  * @param crc   register containing existing CRC (32-bit)
6908  * @param buf   register pointing to input byte buffer (byte*)
6909  * @param len   register containing number of bytes
6910  * @param table register that will contain address of CRC table
6911  * @param tmp   scratch register
6912  */
6913 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
6914   assert_different_registers(crc, buf, len, table, tmp, rax);
6915 
6916   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
6917   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
6918 
6919   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
6920   // context for the registers used, where all instructions below are using 128-bit mode
6921   // On EVEX without VL and BW, these instructions will all be AVX.
6922   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
6923   notl(crc); // ~crc
6924   cmpl(len, 16);
6925   jcc(Assembler::less, L_tail);
6926 
6927   // Align buffer to 16 bytes
6928   movl(tmp, buf);
6929   andl(tmp, 0xF);
6930   jccb(Assembler::zero, L_aligned);
6931   subl(tmp,  16);
6932   addl(len, tmp);
6933 
6934   align(4);
6935   BIND(L_align_loop);
6936   movsbl(rax, Address(buf, 0)); // load byte with sign extension
6937   update_byte_crc32(crc, rax, table);
6938   increment(buf);
6939   incrementl(tmp);
6940   jccb(Assembler::less, L_align_loop);
6941 
6942   BIND(L_aligned);
6943   movl(tmp, len); // save
6944   shrl(len, 4);
6945   jcc(Assembler::zero, L_tail_restore);
6946 
6947   // Fold crc into first bytes of vector
6948   movdqa(xmm1, Address(buf, 0));
6949   movdl(rax, xmm1);
6950   xorl(crc, rax);
6951   if (VM_Version::supports_sse4_1()) {
6952     pinsrd(xmm1, crc, 0);
6953   } else {
6954     pinsrw(xmm1, crc, 0);
6955     shrl(crc, 16);
6956     pinsrw(xmm1, crc, 1);
6957   }
6958   addptr(buf, 16);
6959   subl(len, 4); // len > 0
6960   jcc(Assembler::less, L_fold_tail);
6961 
6962   movdqa(xmm2, Address(buf,  0));
6963   movdqa(xmm3, Address(buf, 16));
6964   movdqa(xmm4, Address(buf, 32));
6965   addptr(buf, 48);
6966   subl(len, 3);
6967   jcc(Assembler::lessEqual, L_fold_512b);
6968 
6969   // Fold total 512 bits of polynomial on each iteration,
6970   // 128 bits per each of 4 parallel streams.
6971   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
6972 
6973   align32();
6974   BIND(L_fold_512b_loop);
6975   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
6976   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
6977   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
6978   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
6979   addptr(buf, 64);
6980   subl(len, 4);
6981   jcc(Assembler::greater, L_fold_512b_loop);
6982 
6983   // Fold 512 bits to 128 bits.
6984   BIND(L_fold_512b);
6985   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6986   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
6987   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
6988   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
6989 
6990   // Fold the rest of 128 bits data chunks
6991   BIND(L_fold_tail);
6992   addl(len, 3);
6993   jccb(Assembler::lessEqual, L_fold_128b);
6994   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6995 
6996   BIND(L_fold_tail_loop);
6997   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
6998   addptr(buf, 16);
6999   decrementl(len);
7000   jccb(Assembler::greater, L_fold_tail_loop);
7001 
7002   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7003   BIND(L_fold_128b);
7004   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7005   if (UseAVX > 0) {
7006     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7007     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7008     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7009   } else {
7010     movdqa(xmm2, xmm0);
7011     pclmulqdq(xmm2, xmm1, 0x1);
7012     movdqa(xmm3, xmm0);
7013     pand(xmm3, xmm2);
7014     pclmulqdq(xmm0, xmm3, 0x1);
7015   }
7016   psrldq(xmm1, 8);
7017   psrldq(xmm2, 4);
7018   pxor(xmm0, xmm1);
7019   pxor(xmm0, xmm2);
7020 
7021   // 8 8-bit folds to compute 32-bit CRC.
7022   for (int j = 0; j < 4; j++) {
7023     fold_8bit_crc32(xmm0, table, xmm1, rax);
7024   }
7025   movdl(crc, xmm0); // mov 32 bits to general register
7026   for (int j = 0; j < 4; j++) {
7027     fold_8bit_crc32(crc, table, rax);
7028   }
7029 
7030   BIND(L_tail_restore);
7031   movl(len, tmp); // restore
7032   BIND(L_tail);
7033   andl(len, 0xf);
7034   jccb(Assembler::zero, L_exit);
7035 
7036   // Fold the rest of bytes
7037   align(4);
7038   BIND(L_tail_loop);
7039   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7040   update_byte_crc32(crc, rax, table);
7041   increment(buf);
7042   decrementl(len);
7043   jccb(Assembler::greater, L_tail_loop);
7044 
7045   BIND(L_exit);
7046   notl(crc); // ~crc
7047 }
7048 
7049 #ifdef _LP64
7050 // Helper function for AVX 512 CRC32
7051 // Fold 512-bit data chunks
7052 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7053                                              Register pos, int offset) {
7054   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7055   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7056   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7057   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7058   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7059 }
7060 
7061 // Helper function for AVX 512 CRC32
7062 // Compute CRC32 for < 256B buffers
7063 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7064                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7065                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7066 
7067   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7068   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7069   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7070 
7071   // check if there is enough buffer to be able to fold 16B at a time
7072   cmpl(len, 32);
7073   jcc(Assembler::less, L_less_than_32);
7074 
7075   // if there is, load the constants
7076   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7077   movdl(xmm0, crc);                        // get the initial crc value
7078   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7079   pxor(xmm7, xmm0);
7080 
7081   // update the buffer pointer
7082   addl(pos, 16);
7083   // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7084   subl(len, 32);
7085   jmp(L_16B_reduction_loop);
7086 
7087   bind(L_less_than_32);
7088   // move the initial crc to the return value; this is necessary for zero-length buffers.
7089   movl(rax, crc);
7090   testl(len, len);
7091   jcc(Assembler::equal, L_cleanup);
7092 
7093   movdl(xmm0, crc);                        //get the initial crc value
7094 
7095   cmpl(len, 16);
7096   jcc(Assembler::equal, L_exact_16_left);
7097   jcc(Assembler::less, L_less_than_16_left);
7098 
7099   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7100   pxor(xmm7, xmm0);                       //xor the initial crc value
7101   addl(pos, 16);
7102   subl(len, 16);
7103   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7104   jmp(L_get_last_two_xmms);
7105 
7106   bind(L_less_than_16_left);
7107   // use stack space to load data of less than 16 bytes; zero out the 16B in memory first.
7108   pxor(xmm1, xmm1);
7109   movptr(tmp1, rsp);
7110   movdqu(Address(tmp1, 0 * 16), xmm1);
7111 
7112   cmpl(len, 4);
7113   jcc(Assembler::less, L_only_less_than_4);
7114 
7115   //backup the counter value
7116   movl(tmp2, len);
7117   cmpl(len, 8);
7118   jcc(Assembler::less, L_less_than_8_left);
7119 
7120   //load 8 Bytes
7121   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7122   movq(Address(tmp1, 0 * 16), rax);
7123   addptr(tmp1, 8);
7124   subl(len, 8);
7125   addl(pos, 8);
7126 
7127   bind(L_less_than_8_left);
7128   cmpl(len, 4);
7129   jcc(Assembler::less, L_less_than_4_left);
7130 
7131   //load 4 Bytes
7132   movl(rax, Address(buf, pos, Address::times_1, 0));
7133   movl(Address(tmp1, 0 * 16), rax);
7134   addptr(tmp1, 4);
7135   subl(len, 4);
7136   addl(pos, 4);
7137 
7138   bind(L_less_than_4_left);
7139   cmpl(len, 2);
7140   jcc(Assembler::less, L_less_than_2_left);
7141 
7142   // load 2 Bytes
7143   movw(rax, Address(buf, pos, Address::times_1, 0));
7144   movl(Address(tmp1, 0 * 16), rax);
7145   addptr(tmp1, 2);
7146   subl(len, 2);
7147   addl(pos, 2);
7148 
7149   bind(L_less_than_2_left);
7150   cmpl(len, 1);
7151   jcc(Assembler::less, L_zero_left);
7152 
7153   // load 1 Byte
7154   movb(rax, Address(buf, pos, Address::times_1, 0));
7155   movb(Address(tmp1, 0 * 16), rax);
7156 
7157   bind(L_zero_left);
7158   movdqu(xmm7, Address(rsp, 0));
7159   pxor(xmm7, xmm0);                       //xor the initial crc value
7160 
7161   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7162   movdqu(xmm0, Address(rax, tmp2));
7163   pshufb(xmm7, xmm0);
7164   jmp(L_128_done);
7165 
7166   bind(L_exact_16_left);
7167   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7168   pxor(xmm7, xmm0);                       //xor the initial crc value
7169   jmp(L_128_done);
7170 
7171   bind(L_only_less_than_4);
7172   cmpl(len, 3);
7173   jcc(Assembler::less, L_only_less_than_3);
7174 
7175   // load 3 Bytes
7176   movb(rax, Address(buf, pos, Address::times_1, 0));
7177   movb(Address(tmp1, 0), rax);
7178 
7179   movb(rax, Address(buf, pos, Address::times_1, 1));
7180   movb(Address(tmp1, 1), rax);
7181 
7182   movb(rax, Address(buf, pos, Address::times_1, 2));
7183   movb(Address(tmp1, 2), rax);
7184 
7185   movdqu(xmm7, Address(rsp, 0));
7186   pxor(xmm7, xmm0);                     //xor the initial crc value
7187 
7188   pslldq(xmm7, 0x5);
7189   jmp(L_barrett);
7190   bind(L_only_less_than_3);
7191   cmpl(len, 2);
7192   jcc(Assembler::less, L_only_less_than_2);
7193 
7194   // load 2 Bytes
7195   movb(rax, Address(buf, pos, Address::times_1, 0));
7196   movb(Address(tmp1, 0), rax);
7197 
7198   movb(rax, Address(buf, pos, Address::times_1, 1));
7199   movb(Address(tmp1, 1), rax);
7200 
7201   movdqu(xmm7, Address(rsp, 0));
7202   pxor(xmm7, xmm0);                     //xor the initial crc value
7203 
7204   pslldq(xmm7, 0x6);
7205   jmp(L_barrett);
7206 
7207   bind(L_only_less_than_2);
7208   //load 1 Byte
7209   movb(rax, Address(buf, pos, Address::times_1, 0));
7210   movb(Address(tmp1, 0), rax);
7211 
7212   movdqu(xmm7, Address(rsp, 0));
7213   pxor(xmm7, xmm0);                     //xor the initial crc value
7214 
7215   pslldq(xmm7, 0x7);
7216 }
7217 
7218 /**
7219 * Compute CRC32 using AVX512 instructions
7220 * param crc   register containing existing CRC (32-bit)
7221 * param buf   register pointing to input byte buffer (byte*)
7222 * param len   register containing number of bytes
7223 * param table address of crc or crc32c table
7224 * param tmp1  scratch register
7225 * param tmp2  scratch register
7226 * return rax  result register
7227 *
7228 * This routine is identical for crc32c with the exception of the precomputed constant
7229 * table which will be passed as the table argument.  The calculation steps are
7230 * the same for both variants.
7231 */
7232 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7233   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7234 
7235   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7236   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7237   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7238   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7239   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7240 
7241   const Register pos = r12;
7242   push(r12);
7243   subptr(rsp, 16 * 2 + 8);
7244 
7245   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7246   // context for the registers used, where all instructions below are using 128-bit mode
7247   // On EVEX without VL and BW, these instructions will all be AVX.
7248   movl(pos, 0);
7249 
7250   // check if smaller than 256B
7251   cmpl(len, 256);
7252   jcc(Assembler::less, L_less_than_256);
7253 
7254   // load the initial crc value
7255   movdl(xmm10, crc);
7256 
7257   // receive the initial 64B data, xor the initial crc value
7258   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7259   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7260   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7261   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7262 
7263   subl(len, 256);
7264   cmpl(len, 256);
7265   jcc(Assembler::less, L_fold_128_B_loop);
7266 
7267   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7268   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7269   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7270   subl(len, 256);
7271 
7272   bind(L_fold_256_B_loop);
7273   addl(pos, 256);
7274   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7275   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7276   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7277   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7278 
7279   subl(len, 256);
7280   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7281 
7282   // Fold 256 into 128
7283   addl(pos, 256);
7284   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7285   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7286   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7287 
7288   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7289   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7290   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7291 
7292   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7293   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7294 
7295   addl(len, 128);
7296   jmp(L_fold_128_B_register);
7297 
7298   // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
7299   // below folds 128B at a time until only 128 + y bytes of buffer remain.
7300 
7301   // fold 128B at a time. This section of the code folds 8 xmm registers in parallel
7302   bind(L_fold_128_B_loop);
7303   addl(pos, 128);
7304   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7305   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7306 
7307   subl(len, 128);
7308   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7309 
7310   addl(pos, 128);
7311 
7312   // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
7313   // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
7314   bind(L_fold_128_B_register);
7315   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7316   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7317   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7318   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7319   // save last that has no multiplicand
7320   vextracti64x2(xmm7, xmm4, 3);
7321 
7322   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7323   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7324   // Needed later in reduction loop
7325   movdqu(xmm10, Address(table, 1 * 16));
7326   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7327   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7328 
7329   // Swap 1,0,3,2 - 01 00 11 10
7330   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7331   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7332   vextracti128(xmm5, xmm8, 1);
7333   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7334 
7335   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7336   // instead of a cmp instruction, we use the negative flag with the jl instruction
7337   addl(len, 128 - 16);
7338   jcc(Assembler::less, L_final_reduction_for_128);
7339 
7340   bind(L_16B_reduction_loop);
7341   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7342   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7343   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7344   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7345   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7346   addl(pos, 16);
7347   subl(len, 16);
7348   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7349 
7350   bind(L_final_reduction_for_128);
7351   addl(len, 16);
7352   jcc(Assembler::equal, L_128_done);
7353 
7354   bind(L_get_last_two_xmms);
7355   movdqu(xmm2, xmm7);
7356   addl(pos, len);
7357   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7358   subl(pos, len);
7359 
7360   // get rid of the extra data that was loaded before
7361   // load the shift constant
7362   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7363   movdqu(xmm0, Address(rax, len));
7364   addl(rax, len);
7365 
7366   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7367   //Change mask to 512
7368   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7369   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7370 
7371   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7372   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7373   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7374   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7375   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7376 
7377   bind(L_128_done);
7378   // compute crc of a 128-bit value
7379   movdqu(xmm10, Address(table, 3 * 16));
7380   movdqu(xmm0, xmm7);
7381 
7382   // 64b fold
7383   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7384   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7385   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7386 
7387   // 32b fold
7388   movdqu(xmm0, xmm7);
7389   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7390   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7391   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7392   jmp(L_barrett);
7393 
7394   bind(L_less_than_256);
7395   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7396 
7397   //barrett reduction
7398   bind(L_barrett);
7399   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7400   movdqu(xmm1, xmm7);
7401   movdqu(xmm2, xmm7);
7402   movdqu(xmm10, Address(table, 4 * 16));
7403 
7404   pclmulqdq(xmm7, xmm10, 0x0);
7405   pxor(xmm7, xmm2);
7406   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7407   movdqu(xmm2, xmm7);
7408   pclmulqdq(xmm7, xmm10, 0x10);
7409   pxor(xmm7, xmm2);
7410   pxor(xmm7, xmm1);
7411   pextrd(crc, xmm7, 2);
7412 
7413   bind(L_cleanup);
7414   addptr(rsp, 16 * 2 + 8);
7415   pop(r12);
7416 }
7417 
7418 // S. Gueron / Information Processing Letters 112 (2012) 184
7419 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7420 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7421 // Output: the 64-bit carry-less product of B * CONST
7422 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7423                                      Register tmp1, Register tmp2, Register tmp3) {
7424   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7425   if (n > 0) {
7426     addq(tmp3, n * 256 * 8);
7427   }
7428   //    Q1 = TABLEExt[n][B & 0xFF];
7429   movl(tmp1, in);
7430   andl(tmp1, 0x000000FF);
7431   shll(tmp1, 3);
7432   addq(tmp1, tmp3);
7433   movq(tmp1, Address(tmp1, 0));
7434 
7435   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7436   movl(tmp2, in);
7437   shrl(tmp2, 8);
7438   andl(tmp2, 0x000000FF);
7439   shll(tmp2, 3);
7440   addq(tmp2, tmp3);
7441   movq(tmp2, Address(tmp2, 0));
7442 
7443   shlq(tmp2, 8);
7444   xorq(tmp1, tmp2);
7445 
7446   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7447   movl(tmp2, in);
7448   shrl(tmp2, 16);
7449   andl(tmp2, 0x000000FF);
7450   shll(tmp2, 3);
7451   addq(tmp2, tmp3);
7452   movq(tmp2, Address(tmp2, 0));
7453 
7454   shlq(tmp2, 16);
7455   xorq(tmp1, tmp2);
7456 
7457   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7458   shrl(in, 24);
7459   andl(in, 0x000000FF);
7460   shll(in, 3);
7461   addq(in, tmp3);
7462   movq(in, Address(in, 0));
7463 
7464   shlq(in, 24);
7465   xorq(in, tmp1);
7466   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7467 }
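
// Reference sketch of the table-driven carry-less multiply above (illustrative only);
// 'table' stands for the crc32c table of 256 eight-byte entries per value of n:
//
//   uint64_t clmul_by_const_ref(uint32_t b, uint32_t n, const uint64_t* table) {
//     const uint64_t* t = table + (uint64_t)n * 256;
//     uint64_t q1 = t[ b        & 0xFF];
//     uint64_t q2 = t[(b >>  8) & 0xFF];
//     uint64_t q3 = t[(b >> 16) & 0xFF];
//     uint64_t q4 = t[(b >> 24) & 0xFF];
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }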
7468 
7469 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7470                                       Register in_out,
7471                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7472                                       XMMRegister w_xtmp2,
7473                                       Register tmp1,
7474                                       Register n_tmp2, Register n_tmp3) {
7475   if (is_pclmulqdq_supported) {
7476     movdl(w_xtmp1, in_out); // modified blindly
7477 
7478     movl(tmp1, const_or_pre_comp_const_index);
7479     movdl(w_xtmp2, tmp1);
7480     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7481 
7482     movdq(in_out, w_xtmp1);
7483   } else {
7484     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7485   }
7486 }
7487 
7488 // Recombination Alternative 2: No bit-reflections
7489 // T1 = (CRC_A * U1) << 1
7490 // T2 = (CRC_B * U2) << 1
7491 // C1 = T1 >> 32
7492 // C2 = T2 >> 32
7493 // T1 = T1 & 0xFFFFFFFF
7494 // T2 = T2 & 0xFFFFFFFF
7495 // T1 = CRC32(0, T1)
7496 // T2 = CRC32(0, T2)
7497 // C1 = C1 ^ T1
7498 // C2 = C2 ^ T2
7499 // CRC = C1 ^ C2 ^ CRC_C
7500 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7501                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7502                                      Register tmp1, Register tmp2,
7503                                      Register n_tmp3) {
7504   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7505   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7506   shlq(in_out, 1);
7507   movl(tmp1, in_out);
7508   shrq(in_out, 32);
7509   xorl(tmp2, tmp2);
7510   crc32(tmp2, tmp1, 4);
7511   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7512   shlq(in1, 1);
7513   movl(tmp1, in1);
7514   shrq(in1, 32);
7515   xorl(tmp2, tmp2);
7516   crc32(tmp2, tmp1, 4);
7517   xorl(in1, tmp2);
7518   xorl(in_out, in1);
7519   xorl(in_out, in2);
7520 }
7521 
7522 // Set N to predefined value
7523 // Subtract from the length of the buffer
7524 // execute in a loop:
7525 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7526 // for i = 1 to N do
7527 //  CRC_A = CRC32(CRC_A, A[i])
7528 //  CRC_B = CRC32(CRC_B, B[i])
7529 //  CRC_C = CRC32(CRC_C, C[i])
7530 // end for
7531 // Recombine
7532 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7533                                        Register in_out1, Register in_out2, Register in_out3,
7534                                        Register tmp1, Register tmp2, Register tmp3,
7535                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7536                                        Register tmp4, Register tmp5,
7537                                        Register n_tmp6) {
7538   Label L_processPartitions;
7539   Label L_processPartition;
7540   Label L_exit;
7541 
7542   bind(L_processPartitions);
7543   cmpl(in_out1, 3 * size);
7544   jcc(Assembler::less, L_exit);
7545     xorl(tmp1, tmp1);
7546     xorl(tmp2, tmp2);
7547     movq(tmp3, in_out2);
7548     addq(tmp3, size);
7549 
7550     bind(L_processPartition);
7551       crc32(in_out3, Address(in_out2, 0), 8);
7552       crc32(tmp1, Address(in_out2, size), 8);
7553       crc32(tmp2, Address(in_out2, size * 2), 8);
7554       addq(in_out2, 8);
7555       cmpq(in_out2, tmp3);
7556       jcc(Assembler::less, L_processPartition);
7557     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7558             w_xtmp1, w_xtmp2, w_xtmp3,
7559             tmp4, tmp5,
7560             n_tmp6);
7561     addq(in_out2, 2 * size);
7562     subl(in_out1, 3 * size);
7563     jmp(L_processPartitions);
7564 
7565   bind(L_exit);
7566 }
7567 #else
7568 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7569                                      Register tmp1, Register tmp2, Register tmp3,
7570                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7571   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7572   if (n > 0) {
7573     addl(tmp3, n * 256 * 8);
7574   }
7575   //    Q1 = TABLEExt[n][B & 0xFF];
7576   movl(tmp1, in_out);
7577   andl(tmp1, 0x000000FF);
7578   shll(tmp1, 3);
7579   addl(tmp1, tmp3);
7580   movq(xtmp1, Address(tmp1, 0));
7581 
7582   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7583   movl(tmp2, in_out);
7584   shrl(tmp2, 8);
7585   andl(tmp2, 0x000000FF);
7586   shll(tmp2, 3);
7587   addl(tmp2, tmp3);
7588   movq(xtmp2, Address(tmp2, 0));
7589 
7590   psllq(xtmp2, 8);
7591   pxor(xtmp1, xtmp2);
7592 
7593   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7594   movl(tmp2, in_out);
7595   shrl(tmp2, 16);
7596   andl(tmp2, 0x000000FF);
7597   shll(tmp2, 3);
7598   addl(tmp2, tmp3);
7599   movq(xtmp2, Address(tmp2, 0));
7600 
7601   psllq(xtmp2, 16);
7602   pxor(xtmp1, xtmp2);
7603 
7604   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7605   shrl(in_out, 24);
7606   andl(in_out, 0x000000FF);
7607   shll(in_out, 3);
7608   addl(in_out, tmp3);
7609   movq(xtmp2, Address(in_out, 0));
7610 
7611   psllq(xtmp2, 24);
7612   pxor(xtmp1, xtmp2); // Result in CXMM
7613   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7614 }
7615 
7616 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7617                                       Register in_out,
7618                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7619                                       XMMRegister w_xtmp2,
7620                                       Register tmp1,
7621                                       Register n_tmp2, Register n_tmp3) {
7622   if (is_pclmulqdq_supported) {
7623     movdl(w_xtmp1, in_out);
7624 
7625     movl(tmp1, const_or_pre_comp_const_index);
7626     movdl(w_xtmp2, tmp1);
7627     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7628     // Keep result in XMM since GPR is 32 bit in length
7629   } else {
7630     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7631   }
7632 }
7633 
7634 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7635                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7636                                      Register tmp1, Register tmp2,
7637                                      Register n_tmp3) {
7638   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7639   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7640 
7641   psllq(w_xtmp1, 1);
7642   movdl(tmp1, w_xtmp1);
7643   psrlq(w_xtmp1, 32);
7644   movdl(in_out, w_xtmp1);
7645 
7646   xorl(tmp2, tmp2);
7647   crc32(tmp2, tmp1, 4);
7648   xorl(in_out, tmp2);
7649 
7650   psllq(w_xtmp2, 1);
7651   movdl(tmp1, w_xtmp2);
7652   psrlq(w_xtmp2, 32);
7653   movdl(in1, w_xtmp2);
7654 
7655   xorl(tmp2, tmp2);
7656   crc32(tmp2, tmp1, 4);
7657   xorl(in1, tmp2);
7658   xorl(in_out, in1);
7659   xorl(in_out, in2);
7660 }
7661 
7662 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7663                                        Register in_out1, Register in_out2, Register in_out3,
7664                                        Register tmp1, Register tmp2, Register tmp3,
7665                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7666                                        Register tmp4, Register tmp5,
7667                                        Register n_tmp6) {
7668   Label L_processPartitions;
7669   Label L_processPartition;
7670   Label L_exit;
7671 
7672   bind(L_processPartitions);
7673   cmpl(in_out1, 3 * size);
7674   jcc(Assembler::less, L_exit);
7675     xorl(tmp1, tmp1);
7676     xorl(tmp2, tmp2);
7677     movl(tmp3, in_out2);
7678     addl(tmp3, size);
7679 
7680     bind(L_processPartition);
7681       crc32(in_out3, Address(in_out2, 0), 4);
7682       crc32(tmp1, Address(in_out2, size), 4);
7683       crc32(tmp2, Address(in_out2, size*2), 4);
7684       crc32(in_out3, Address(in_out2, 0+4), 4);
7685       crc32(tmp1, Address(in_out2, size+4), 4);
7686       crc32(tmp2, Address(in_out2, size*2+4), 4);
7687       addl(in_out2, 8);
7688       cmpl(in_out2, tmp3);
7689       jcc(Assembler::less, L_processPartition);
7690 
7691         push(tmp3);
7692         push(in_out1);
7693         push(in_out2);
7694         tmp4 = tmp3;
7695         tmp5 = in_out1;
7696         n_tmp6 = in_out2;
7697 
7698       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7699             w_xtmp1, w_xtmp2, w_xtmp3,
7700             tmp4, tmp5,
7701             n_tmp6);
7702 
7703         pop(in_out2);
7704         pop(in_out1);
7705         pop(tmp3);
7706 
7707     addl(in_out2, 2 * size);
7708     subl(in_out1, 3 * size);
7709     jmp(L_processPartitions);
7710 
7711   bind(L_exit);
7712 }
7713 #endif //LP64
7714 
7715 #ifdef _LP64
7716 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7717 // Input: A buffer I of L bytes.
7718 // Output: the CRC32C value of the buffer.
7719 // Notations:
7720 // Write L = 24N + r, with N = floor (L/24).
7721 // r = L mod 24 (0 <= r < 24).
7722 // Consider I as the concatenation of A|B|C|R, where A, B and C each
7723 // consist of N quadwords, and R consists of r bytes.
7724 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7725 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7726 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7727 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
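     // Illustrative example (not part of the original description): for L = 100,
     // N = floor(100 / 24) = 4 and r = 4, so A, B and C each consist of
     // 4 quadwords (32 bytes) laid out consecutively and R holds the final 4 bytes.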
7728 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7729                                           Register tmp1, Register tmp2, Register tmp3,
7730                                           Register tmp4, Register tmp5, Register tmp6,
7731                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7732                                           bool is_pclmulqdq_supported) {
7733   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7734   Label L_wordByWord;
7735   Label L_byteByByteProlog;
7736   Label L_byteByByte;
7737   Label L_exit;
7738 
7739   if (is_pclmulqdq_supported) {
7740     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7741     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7742 
7743     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7744     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7745 
7746     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7747     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7748     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7749   } else {
7750     const_or_pre_comp_const_index[0] = 1;
7751     const_or_pre_comp_const_index[1] = 0;
7752 
7753     const_or_pre_comp_const_index[2] = 3;
7754     const_or_pre_comp_const_index[3] = 2;
7755 
7756     const_or_pre_comp_const_index[4] = 5;
7757     const_or_pre_comp_const_index[5] = 4;
7758    }
7759   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7760                     in2, in1, in_out,
7761                     tmp1, tmp2, tmp3,
7762                     w_xtmp1, w_xtmp2, w_xtmp3,
7763                     tmp4, tmp5,
7764                     tmp6);
7765   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7766                     in2, in1, in_out,
7767                     tmp1, tmp2, tmp3,
7768                     w_xtmp1, w_xtmp2, w_xtmp3,
7769                     tmp4, tmp5,
7770                     tmp6);
7771   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7772                     in2, in1, in_out,
7773                     tmp1, tmp2, tmp3,
7774                     w_xtmp1, w_xtmp2, w_xtmp3,
7775                     tmp4, tmp5,
7776                     tmp6);
7777   movl(tmp1, in2);
7778   andl(tmp1, 0x00000007);
7779   negl(tmp1);
7780   addl(tmp1, in2);
7781   addq(tmp1, in1);
7782 
7783   BIND(L_wordByWord);
7784   cmpq(in1, tmp1);
7785   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7786     crc32(in_out, Address(in1, 0), 4);
7787     addq(in1, 4);
7788     jmp(L_wordByWord);
7789 
7790   BIND(L_byteByByteProlog);
7791   andl(in2, 0x00000007);
7792   movl(tmp2, 1);
7793 
7794   BIND(L_byteByByte);
7795   cmpl(tmp2, in2);
7796   jccb(Assembler::greater, L_exit);
7797     crc32(in_out, Address(in1, 0), 1);
7798     incq(in1);
7799     incl(tmp2);
7800     jmp(L_byteByByte);
7801 
7802   BIND(L_exit);
7803 }
7804 #else
7805 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7806                                           Register tmp1, Register  tmp2, Register tmp3,
7807                                           Register tmp4, Register  tmp5, Register tmp6,
7808                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7809                                           bool is_pclmulqdq_supported) {
7810   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7811   Label L_wordByWord;
7812   Label L_byteByByteProlog;
7813   Label L_byteByByte;
7814   Label L_exit;
7815 
7816   if (is_pclmulqdq_supported) {
7817     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7818     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7819 
7820     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7821     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7822 
7823     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7824     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7825   } else {
7826     const_or_pre_comp_const_index[0] = 1;
7827     const_or_pre_comp_const_index[1] = 0;
7828 
7829     const_or_pre_comp_const_index[2] = 3;
7830     const_or_pre_comp_const_index[3] = 2;
7831 
7832     const_or_pre_comp_const_index[4] = 5;
7833     const_or_pre_comp_const_index[5] = 4;
7834   }
7835   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7836                     in2, in1, in_out,
7837                     tmp1, tmp2, tmp3,
7838                     w_xtmp1, w_xtmp2, w_xtmp3,
7839                     tmp4, tmp5,
7840                     tmp6);
7841   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7842                     in2, in1, in_out,
7843                     tmp1, tmp2, tmp3,
7844                     w_xtmp1, w_xtmp2, w_xtmp3,
7845                     tmp4, tmp5,
7846                     tmp6);
7847   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7848                     in2, in1, in_out,
7849                     tmp1, tmp2, tmp3,
7850                     w_xtmp1, w_xtmp2, w_xtmp3,
7851                     tmp4, tmp5,
7852                     tmp6);
7853   movl(tmp1, in2);
7854   andl(tmp1, 0x00000007);
7855   negl(tmp1);
7856   addl(tmp1, in2);
7857   addl(tmp1, in1);
7858 
7859   BIND(L_wordByWord);
7860   cmpl(in1, tmp1);
7861   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7862     crc32(in_out, Address(in1,0), 4);
7863     addl(in1, 4);
7864     jmp(L_wordByWord);
7865 
7866   BIND(L_byteByByteProlog);
7867   andl(in2, 0x00000007);
7868   movl(tmp2, 1);
7869 
7870   BIND(L_byteByByte);
7871   cmpl(tmp2, in2);
7872   jccb(Assembler::greater, L_exit);
7873     movb(tmp1, Address(in1, 0));
7874     crc32(in_out, tmp1, 1);
7875     incl(in1);
7876     incl(tmp2);
7877     jmp(L_byteByByte);
7878 
7879   BIND(L_exit);
7880 }
7881 #endif // LP64
7882 #undef BIND
7883 #undef BLOCK_COMMENT
7884 
7885 // Compress char[] array to byte[].
7886 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
7887 //   @IntrinsicCandidate
7888 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
7889 //     for (int i = 0; i < len; i++) {
7890 //       int c = src[srcOff++];
7891 //       if (c >>> 8 != 0) {
7892 //         return 0;
7893 //       }
7894 //       dst[dstOff++] = (byte)c;
7895 //     }
7896 //     return len;
7897 //   }
7898 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
7899   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7900   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7901   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
7902   Label copy_chars_loop, return_length, return_zero, done;
7903 
7904   // rsi: src
7905   // rdi: dst
7906   // rdx: len
7907   // rcx: tmp5
7908   // rax: result
7909 
7910   // rsi holds start addr of source char[] to be compressed
7911   // rdi holds start addr of destination byte[]
7912   // rdx holds length
7913 
7914   assert(len != result, "");
7915 
7916   // save length for return
7917   push(len);
7918 
7919   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
7920     VM_Version::supports_avx512vlbw() &&
7921     VM_Version::supports_bmi2()) {
7922 
7923     Label copy_32_loop, copy_loop_tail, below_threshold;
7924 
7925     // alignment
7926     Label post_alignment;
7927 
7928     // if the length of the string is less than 32, handle it in an old-fashioned way
7929     testl(len, -32);
7930     jcc(Assembler::zero, below_threshold);
7931 
7932     // First check whether a character is compressible (<= 0xFF).
7933     // Create mask to test for Unicode chars inside zmm vector
7934     movl(result, 0x00FF);
7935     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
7936 
7937     testl(len, -64);
7938     jcc(Assembler::zero, post_alignment);
7939 
7940     movl(tmp5, dst);
7941     andl(tmp5, (32 - 1));
7942     negl(tmp5);
7943     andl(tmp5, (32 - 1));
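         // tmp5 = (32 - (dst & 31)) & 31: the number of chars to process so that
         // dst becomes 32-byte aligned (zero if it already is)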
7944 
7945     // bail out when there is nothing to be done
7946     testl(tmp5, 0xFFFFFFFF);
7947     jcc(Assembler::zero, post_alignment);
7948 
7949     // ~(~0 << len), where len is the # of remaining elements to process
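         // e.g. tmp5 == 3: ~(~0 << 3) == 0b111, selecting the low 3 word lanes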
7950     movl(result, 0xFFFFFFFF);
7951     shlxl(result, result, tmp5);
7952     notl(result);
7953     kmovdl(mask2, result);
7954 
7955     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
7956     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
7957     ktestd(mask1, mask2);
7958     jcc(Assembler::carryClear, return_zero);
7959 
7960     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
7961 
7962     addptr(src, tmp5);
7963     addptr(src, tmp5);
7964     addptr(dst, tmp5);
7965     subl(len, tmp5);
7966 
7967     bind(post_alignment);
7968     // end of alignment
7969 
7970     movl(tmp5, len);
7971     andl(tmp5, (32 - 1));    // tail count (in chars)
7972     andl(len, ~(32 - 1));    // vector count (in chars)
7973     jcc(Assembler::zero, copy_loop_tail);
7974 
7975     lea(src, Address(src, len, Address::times_2));
7976     lea(dst, Address(dst, len, Address::times_1));
7977     negptr(len);
7978 
7979     bind(copy_32_loop);
7980     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
7981     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
7982     kortestdl(mask1, mask1);
7983     jcc(Assembler::carryClear, return_zero);
7984 
7985     // All elements in the current chunk are valid candidates for
7986     // compression. Write the truncated byte elements to memory.
7987     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
7988     addptr(len, 32);
7989     jcc(Assembler::notZero, copy_32_loop);
7990 
7991     bind(copy_loop_tail);
7992     // bail out when there is nothing to be done
7993     testl(tmp5, 0xFFFFFFFF);
7994     jcc(Assembler::zero, return_length);
7995 
7996     movl(len, tmp5);
7997 
7998     // ~(~0 << len), where len is the # of remaining elements to process
7999     movl(result, 0xFFFFFFFF);
8000     shlxl(result, result, len);
8001     notl(result);
8002 
8003     kmovdl(mask2, result);
8004 
8005     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8006     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8007     ktestd(mask1, mask2);
8008     jcc(Assembler::carryClear, return_zero);
8009 
8010     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8011     jmp(return_length);
8012 
8013     bind(below_threshold);
8014   }
8015 
8016   if (UseSSE42Intrinsics) {
8017     Label copy_32_loop, copy_16, copy_tail;
8018 
8019     movl(result, len);
8020 
8021     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8022 
8023     // vectored compression
8024     andl(len, 0xfffffff0);    // vector count (in chars)
8025     andl(result, 0x0000000f);    // tail count (in chars)
8026     testl(len, len);
8027     jcc(Assembler::zero, copy_16);
8028 
8029     // compress 16 chars per iter
8030     movdl(tmp1Reg, tmp5);
8031     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8032     pxor(tmp4Reg, tmp4Reg);
8033 
8034     lea(src, Address(src, len, Address::times_2));
8035     lea(dst, Address(dst, len, Address::times_1));
8036     negptr(len);
8037 
8038     bind(copy_32_loop);
8039     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8040     por(tmp4Reg, tmp2Reg);
8041     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8042     por(tmp4Reg, tmp3Reg);
8043     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8044     jcc(Assembler::notZero, return_zero);
8045     packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
8046     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8047     addptr(len, 16);
8048     jcc(Assembler::notZero, copy_32_loop);
8049 
8050     // compress next vector of 8 chars (if any)
8051     bind(copy_16);
8052     movl(len, result);
8053     andl(len, 0xfffffff8);    // vector count (in chars)
8054     andl(result, 0x00000007);    // tail count (in chars)
8055     testl(len, len);
8056     jccb(Assembler::zero, copy_tail);
8057 
8058     movdl(tmp1Reg, tmp5);
8059     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8060     pxor(tmp3Reg, tmp3Reg);
8061 
8062     movdqu(tmp2Reg, Address(src, 0));
8063     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8064     jccb(Assembler::notZero, return_zero);
8065     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8066     movq(Address(dst, 0), tmp2Reg);
8067     addptr(src, 16);
8068     addptr(dst, 8);
8069 
8070     bind(copy_tail);
8071     movl(len, result);
8072   }
8073   // compress 1 char per iter
8074   testl(len, len);
8075   jccb(Assembler::zero, return_length);
8076   lea(src, Address(src, len, Address::times_2));
8077   lea(dst, Address(dst, len, Address::times_1));
8078   negptr(len);
8079 
8080   bind(copy_chars_loop);
8081   load_unsigned_short(result, Address(src, len, Address::times_2));
8082   testl(result, 0xff00);      // check if Unicode char
8083   jccb(Assembler::notZero, return_zero);
8084   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
8085   increment(len);
8086   jcc(Assembler::notZero, copy_chars_loop);
8087 
8088   // if compression succeeded, return length
8089   bind(return_length);
8090   pop(result);
8091   jmpb(done);
8092 
8093   // if compression failed, return 0
8094   bind(return_zero);
8095   xorl(result, result);
8096   addptr(rsp, wordSize);
8097 
8098   bind(done);
8099 }
8100 
8101 // Inflate byte[] array to char[].
8102 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8103 //   @IntrinsicCandidate
8104 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8105 //     for (int i = 0; i < len; i++) {
8106 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8107 //     }
8108 //   }
8109 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8110   XMMRegister tmp1, Register tmp2, KRegister mask) {
8111   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8112   // rsi: src
8113   // rdi: dst
8114   // rdx: len
8115   // rcx: tmp2
8116 
8117   // rsi holds start addr of source byte[] to be inflated
8118   // rdi holds start addr of destination char[]
8119   // rdx holds length
8120   assert_different_registers(src, dst, len, tmp2);
8121   movl(tmp2, len);
8122   if ((UseAVX > 2) && // AVX512
8123     VM_Version::supports_avx512vlbw() &&
8124     VM_Version::supports_bmi2()) {
8125 
8126     Label copy_32_loop, copy_tail;
8127     Register tmp3_aliased = len;
8128 
8129     // if the length of the string is less than 16, handle it in an old-fashioned way
8130     testl(len, -16);
8131     jcc(Assembler::zero, below_threshold);
8132 
8133     testl(len, -1 * AVX3Threshold);
8134     jcc(Assembler::zero, avx3_threshold);
8135 
8136     // In order to use only one arithmetic operation in the main loop, we
8137     // pre-calculate the tail and vector counts here
8138     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8139     andl(len, -32);     // vector count
8140     jccb(Assembler::zero, copy_tail);
8141 
8142     lea(src, Address(src, len, Address::times_1));
8143     lea(dst, Address(dst, len, Address::times_2));
8144     negptr(len);
8145 
8146 
8147     // inflate 32 chars per iter
8148     bind(copy_32_loop);
8149     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8150     evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
8151     addptr(len, 32);
8152     jcc(Assembler::notZero, copy_32_loop);
8153 
8154     bind(copy_tail);
8155     // bail out when there is nothing to be done
8156     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8157     jcc(Assembler::zero, done);
8158 
8159     // ~(~0 << length), where length is the # of remaining elements to process
8160     movl(tmp3_aliased, -1);
8161     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8162     notl(tmp3_aliased);
8163     kmovdl(mask, tmp3_aliased);
8164     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8165     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8166 
8167     jmp(done);
8168     bind(avx3_threshold);
8169   }
8170   if (UseSSE42Intrinsics) {
8171     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8172 
8173     if (UseAVX > 1) {
8174       andl(tmp2, (16 - 1));
8175       andl(len, -16);
8176       jccb(Assembler::zero, copy_new_tail);
8177     } else {
8178       andl(tmp2, 0x00000007);   // tail count (in chars)
8179       andl(len, 0xfffffff8);    // vector count (in chars)
8180       jccb(Assembler::zero, copy_tail);
8181     }
8182 
8183     // vectored inflation
8184     lea(src, Address(src, len, Address::times_1));
8185     lea(dst, Address(dst, len, Address::times_2));
8186     negptr(len);
8187 
8188     if (UseAVX > 1) {
8189       bind(copy_16_loop);
8190       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8191       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8192       addptr(len, 16);
8193       jcc(Assembler::notZero, copy_16_loop);
8194 
8195       bind(below_threshold);
8196       bind(copy_new_tail);
8197       movl(len, tmp2);
8198       andl(tmp2, 0x00000007);
8199       andl(len, 0xFFFFFFF8);
8200       jccb(Assembler::zero, copy_tail);
8201 
8202       pmovzxbw(tmp1, Address(src, 0));
8203       movdqu(Address(dst, 0), tmp1);
8204       addptr(src, 8);
8205       addptr(dst, 2 * 8);
8206 
8207       jmp(copy_tail, true);
8208     }
8209 
8210     // inflate 8 chars per iter
8211     bind(copy_8_loop);
8212     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8213     movdqu(Address(dst, len, Address::times_2), tmp1);
8214     addptr(len, 8);
8215     jcc(Assembler::notZero, copy_8_loop);
8216 
8217     bind(copy_tail);
8218     movl(len, tmp2);
8219 
8220     cmpl(len, 4);
8221     jccb(Assembler::less, copy_bytes);
8222 
8223     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8224     pmovzxbw(tmp1, tmp1);
8225     movq(Address(dst, 0), tmp1);
8226     subptr(len, 4);
8227     addptr(src, 4);
8228     addptr(dst, 8);
8229 
8230     bind(copy_bytes);
8231   } else {
8232     bind(below_threshold);
8233   }
8234 
8235   testl(len, len);
8236   jccb(Assembler::zero, done);
8237   lea(src, Address(src, len, Address::times_1));
8238   lea(dst, Address(dst, len, Address::times_2));
8239   negptr(len);
8240 
8241   // inflate 1 char per iter
8242   bind(copy_chars_loop);
8243   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8244   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8245   increment(len);
8246   jcc(Assembler::notZero, copy_chars_loop);
8247 
8248   bind(done);
8249 }
8250 
8251 
8252 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8253   switch(type) {
8254     case T_BYTE:
8255     case T_BOOLEAN:
8256       evmovdqub(dst, kmask, src, false, vector_len);
8257       break;
8258     case T_CHAR:
8259     case T_SHORT:
8260       evmovdquw(dst, kmask, src, false, vector_len);
8261       break;
8262     case T_INT:
8263     case T_FLOAT:
8264       evmovdqul(dst, kmask, src, false, vector_len);
8265       break;
8266     case T_LONG:
8267     case T_DOUBLE:
8268       evmovdquq(dst, kmask, src, false, vector_len);
8269       break;
8270     default:
8271       fatal("Unexpected type argument %s", type2name(type));
8272       break;
8273   }
8274 }
8275 
8276 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8277   switch(type) {
8278     case T_BYTE:
8279     case T_BOOLEAN:
8280       evmovdqub(dst, kmask, src, true, vector_len);
8281       break;
8282     case T_CHAR:
8283     case T_SHORT:
8284       evmovdquw(dst, kmask, src, true, vector_len);
8285       break;
8286     case T_INT:
8287     case T_FLOAT:
8288       evmovdqul(dst, kmask, src, true, vector_len);
8289       break;
8290     case T_LONG:
8291     case T_DOUBLE:
8292       evmovdquq(dst, kmask, src, true, vector_len);
8293       break;
8294     default:
8295       fatal("Unexpected type argument %s", type2name(type));
8296       break;
8297   }
8298 }
8299 
8300 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
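       // knotbl negates all 8 low bits of the k register, so for masklen 2 and 4 the
       // bits above masklen are cleared afterwards with an explicit AND (0b11 / 0b1111).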
8301   switch(masklen) {
8302     case 2:
8303        knotbl(dst, src);
8304        movl(rtmp, 3);
8305        kmovbl(ktmp, rtmp);
8306        kandbl(dst, ktmp, dst);
8307        break;
8308     case 4:
8309        knotbl(dst, src);
8310        movl(rtmp, 15);
8311        kmovbl(ktmp, rtmp);
8312        kandbl(dst, ktmp, dst);
8313        break;
8314     case 8:
8315        knotbl(dst, src);
8316        break;
8317     case 16:
8318        knotwl(dst, src);
8319        break;
8320     case 32:
8321        knotdl(dst, src);
8322        break;
8323     case 64:
8324        knotql(dst, src);
8325        break;
8326     default:
8327       fatal("Unexpected vector length %d", masklen);
8328       break;
8329   }
8330 }
8331 
8332 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8333   switch(type) {
8334     case T_BOOLEAN:
8335     case T_BYTE:
8336        kandbl(dst, src1, src2);
8337        break;
8338     case T_CHAR:
8339     case T_SHORT:
8340        kandwl(dst, src1, src2);
8341        break;
8342     case T_INT:
8343     case T_FLOAT:
8344        kanddl(dst, src1, src2);
8345        break;
8346     case T_LONG:
8347     case T_DOUBLE:
8348        kandql(dst, src1, src2);
8349        break;
8350     default:
8351       fatal("Unexpected type argument %s", type2name(type));
8352       break;
8353   }
8354 }
8355 
8356 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8357   switch(type) {
8358     case T_BOOLEAN:
8359     case T_BYTE:
8360        korbl(dst, src1, src2);
8361        break;
8362     case T_CHAR:
8363     case T_SHORT:
8364        korwl(dst, src1, src2);
8365        break;
8366     case T_INT:
8367     case T_FLOAT:
8368        kordl(dst, src1, src2);
8369        break;
8370     case T_LONG:
8371     case T_DOUBLE:
8372        korql(dst, src1, src2);
8373        break;
8374     default:
8375       fatal("Unexpected type argument %s", type2name(type));
8376       break;
8377   }
8378 }
8379 
8380 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8381   switch(type) {
8382     case T_BOOLEAN:
8383     case T_BYTE:
8384        kxorbl(dst, src1, src2);
8385        break;
8386     case T_CHAR:
8387     case T_SHORT:
8388        kxorwl(dst, src1, src2);
8389        break;
8390     case T_INT:
8391     case T_FLOAT:
8392        kxordl(dst, src1, src2);
8393        break;
8394     case T_LONG:
8395     case T_DOUBLE:
8396        kxorql(dst, src1, src2);
8397        break;
8398     default:
8399       fatal("Unexpected type argument %s", type2name(type));
8400       break;
8401   }
8402 }
8403 
8404 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8405   switch(type) {
8406     case T_BOOLEAN:
8407     case T_BYTE:
8408       evpermb(dst, mask, nds, src, merge, vector_len); break;
8409     case T_CHAR:
8410     case T_SHORT:
8411       evpermw(dst, mask, nds, src, merge, vector_len); break;
8412     case T_INT:
8413     case T_FLOAT:
8414       evpermd(dst, mask, nds, src, merge, vector_len); break;
8415     case T_LONG:
8416     case T_DOUBLE:
8417       evpermq(dst, mask, nds, src, merge, vector_len); break;
8418     default:
8419       fatal("Unexpected type argument %s", type2name(type)); break;
8420   }
8421 }
8422 
8423 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8424   switch(type) {
8425     case T_BOOLEAN:
8426     case T_BYTE:
8427       evpermb(dst, mask, nds, src, merge, vector_len); break;
8428     case T_CHAR:
8429     case T_SHORT:
8430       evpermw(dst, mask, nds, src, merge, vector_len); break;
8431     case T_INT:
8432     case T_FLOAT:
8433       evpermd(dst, mask, nds, src, merge, vector_len); break;
8434     case T_LONG:
8435     case T_DOUBLE:
8436       evpermq(dst, mask, nds, src, merge, vector_len); break;
8437     default:
8438       fatal("Unexpected type argument %s", type2name(type)); break;
8439   }
8440 }
8441 
8442 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8443   switch(type) {
8444     case T_BYTE:
8445       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8446     case T_SHORT:
8447       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8448     case T_INT:
8449       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8450     case T_LONG:
8451       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8452     default:
8453       fatal("Unexpected type argument %s", type2name(type)); break;
8454   }
8455 }
8456 
8457 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8458   switch(type) {
8459     case T_BYTE:
8460       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8461     case T_SHORT:
8462       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8463     case T_INT:
8464       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8465     case T_LONG:
8466       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8467     default:
8468       fatal("Unexpected type argument %s", type2name(type)); break;
8469   }
8470 }
8471 
8472 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8473   switch(type) {
8474     case T_BYTE:
8475       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8476     case T_SHORT:
8477       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8478     case T_INT:
8479       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8480     case T_LONG:
8481       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8482     default:
8483       fatal("Unexpected type argument %s", type2name(type)); break;
8484   }
8485 }
8486 
8487 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8488   switch(type) {
8489     case T_BYTE:
8490       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8491     case T_SHORT:
8492       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8493     case T_INT:
8494       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8495     case T_LONG:
8496       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8497     default:
8498       fatal("Unexpected type argument %s", type2name(type)); break;
8499   }
8500 }
8501 
8502 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8503   switch(type) {
8504     case T_INT:
8505       evpxord(dst, mask, nds, src, merge, vector_len); break;
8506     case T_LONG:
8507       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8508     default:
8509       fatal("Unexpected type argument %s", type2name(type)); break;
8510   }
8511 }
8512 
8513 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8514   switch(type) {
8515     case T_INT:
8516       evpxord(dst, mask, nds, src, merge, vector_len); break;
8517     case T_LONG:
8518       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8519     default:
8520       fatal("Unexpected type argument %s", type2name(type)); break;
8521   }
8522 }
8523 
8524 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8525   switch(type) {
8526     case T_INT:
8527       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8528     case T_LONG:
8529       evporq(dst, mask, nds, src, merge, vector_len); break;
8530     default:
8531       fatal("Unexpected type argument %s", type2name(type)); break;
8532   }
8533 }
8534 
8535 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8536   switch(type) {
8537     case T_INT:
8538       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8539     case T_LONG:
8540       evporq(dst, mask, nds, src, merge, vector_len); break;
8541     default:
8542       fatal("Unexpected type argument %s", type2name(type)); break;
8543   }
8544 }
8545 
8546 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8547   switch(type) {
8548     case T_INT:
8549       evpandd(dst, mask, nds, src, merge, vector_len); break;
8550     case T_LONG:
8551       evpandq(dst, mask, nds, src, merge, vector_len); break;
8552     default:
8553       fatal("Unexpected type argument %s", type2name(type)); break;
8554   }
8555 }
8556 
8557 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8558   switch(type) {
8559     case T_INT:
8560       evpandd(dst, mask, nds, src, merge, vector_len); break;
8561     case T_LONG:
8562       evpandq(dst, mask, nds, src, merge, vector_len); break;
8563     default:
8564       fatal("Unexpected type argument %s", type2name(type)); break;
8565   }
8566 }
8567 
8568 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) {
8569    masklen = masklen < 8 ? 8 : masklen;
8570    ktest(masklen, src1, src2);
8571    setb(Assembler::notZero, dst);
8572    movzbl(dst, dst);
8573 }
8574 
8575 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) {
8576   if (masklen < 8) {
8577     knotbl(kscratch, src2);
8578     kortestbl(src1, kscratch);
8579     setb(Assembler::carrySet, dst);
8580     movzbl(dst, dst);
8581   } else {
8582     ktest(masklen, src1, src2);
8583     setb(Assembler::carrySet, dst);
8584     movzbl(dst, dst);
8585   }
8586 }
8587 
8588 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
8589   switch(masklen) {
8590     case 8:
8591        kortestbl(src1, src2);
8592        break;
8593     case 16:
8594        kortestwl(src1, src2);
8595        break;
8596     case 32:
8597        kortestdl(src1, src2);
8598        break;
8599     case 64:
8600        kortestql(src1, src2);
8601        break;
8602     default:
8603       fatal("Unexpected mask length %d", masklen);
8604       break;
8605   }
8606 }
8607 
8608 
8609 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
8610   switch(masklen)  {
8611     case 8:
8612        ktestbl(src1, src2);
8613        break;
8614     case 16:
8615        ktestwl(src1, src2);
8616        break;
8617     case 32:
8618        ktestdl(src1, src2);
8619        break;
8620     case 64:
8621        ktestql(src1, src2);
8622        break;
8623     default:
8624       fatal("Unexpected mask length %d", masklen);
8625       break;
8626   }
8627 }
8628 
8629 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8630   switch(type) {
8631     case T_INT:
8632       evprold(dst, mask, src, shift, merge, vlen_enc); break;
8633     case T_LONG:
8634       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
8635     default:
8636       fatal("Unexpected type argument %s", type2name(type)); break;
8638   }
8639 }
8640 
8641 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8642   switch(type) {
8643     case T_INT:
8644       evprord(dst, mask, src, shift, merge, vlen_enc); break;
8645     case T_LONG:
8646       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
8647     default:
8648       fatal("Unexpected type argument %s", type2name(type)); break;
8649   }
8650 }
8651 
8652 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8653   switch(type) {
8654     case T_INT:
8655       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
8656     case T_LONG:
8657       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
8658     default:
8659       fatal("Unexpected type argument %s", type2name(type)); break;
8660   }
8661 }
8662 
8663 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8664   switch(type) {
8665     case T_INT:
8666       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
8667     case T_LONG:
8668       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
8669     default:
8670       fatal("Unexpected type argument %s", type2name(type)); break;
8671   }
8672 }
8673 #if COMPILER2_OR_JVMCI
8674 
8675 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
8676                                  Register length, Register temp, int vec_enc) {
8677   // Computing mask for predicated vector store.
8678   movptr(temp, -1);
8679   bzhiq(temp, temp, length);
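       // temp now holds (1 << length) - 1 (e.g. length == 5 -> 0x1F), so the mask
       // selects only the first 'length' elements of the store.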
8680   kmov(mask, temp);
8681   evmovdqu(bt, mask, dst, xmm, vec_enc);
8682 }
8683 
8684 // Set memory operation for length "less than" 64 bytes.
8685 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
8686                                        XMMRegister xmm, KRegister mask, Register length,
8687                                        Register temp, bool use64byteVector) {
8688   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8689   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
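       // 'shift' is log2 of the element size and indexes the type table
       // (0 -> T_BYTE, 1 -> T_SHORT, 2 -> T_INT, 3 -> T_LONG)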
8690   if (!use64byteVector) {
8691     fill32(dst, disp, xmm);
8692     subptr(length, 32 >> shift);
8693     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
8694   } else {
8695     assert(MaxVectorSize == 64, "vector length != 64");
8696     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
8697   }
8698 }
8699 
8700 
8701 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
8702                                        XMMRegister xmm, KRegister mask, Register length,
8703                                        Register temp) {
8704   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8705   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8706   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
8707 }
8708 
8709 
8710 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
8711   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8712   vmovdqu(Address(dst, disp), xmm);
8713 }
8714 
8715 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8716   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8717   BasicType type[] = {T_BYTE,  T_SHORT,  T_INT,   T_LONG};
8718   if (!use64byteVector) {
8719     fill32(dst, disp, xmm);
8720     fill32(dst, disp + 32, xmm);
8721   } else {
8722     evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
8723   }
8724 }
8725 
8726 #ifdef _LP64
8727 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
8728                                         Register count, Register rtmp, XMMRegister xtmp) {
8729   Label L_exit;
8730   Label L_fill_start;
8731   Label L_fill_64_bytes;
8732   Label L_fill_96_bytes;
8733   Label L_fill_128_bytes;
8734   Label L_fill_128_bytes_loop;
8735   Label L_fill_128_loop_header;
8736   Label L_fill_128_bytes_loop_header;
8737   Label L_fill_128_bytes_loop_pre_header;
8738   Label L_fill_zmm_sequence;
8739 
8740   int shift = -1;
8741   int avx3threshold = VM_Version::avx3_threshold();
8742   switch(type) {
8743     case T_BYTE:  shift = 0;
8744       break;
8745     case T_SHORT: shift = 1;
8746       break;
8747     case T_INT:   shift = 2;
8748       break;
8749     /* Uncomment when LONG fill stubs are supported.
8750     case T_LONG:  shift = 3;
8751       break;
8752     */
8753     default:
8754       fatal("Unhandled type: %s\n", type2name(type));
8755   }
8756 
8757   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
8758 
8759     if (MaxVectorSize == 64) {
8760       cmpq(count, avx3threshold >> shift);
8761       jcc(Assembler::greater, L_fill_zmm_sequence);
8762     }
8763 
8764     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
8765 
8766     bind(L_fill_start);
8767 
8768     cmpq(count, 32 >> shift);
8769     jccb(Assembler::greater, L_fill_64_bytes);
8770     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
8771     jmp(L_exit);
8772 
8773     bind(L_fill_64_bytes);
8774     cmpq(count, 64 >> shift);
8775     jccb(Assembler::greater, L_fill_96_bytes);
8776     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
8777     jmp(L_exit);
8778 
8779     bind(L_fill_96_bytes);
8780     cmpq(count, 96 >> shift);
8781     jccb(Assembler::greater, L_fill_128_bytes);
8782     fill64(to, 0, xtmp);
8783     subq(count, 64 >> shift);
8784     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
8785     jmp(L_exit);
8786 
8787     bind(L_fill_128_bytes);
8788     cmpq(count, 128 >> shift);
8789     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
8790     fill64(to, 0, xtmp);
8791     fill32(to, 64, xtmp);
8792     subq(count, 96 >> shift);
8793     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
8794     jmp(L_exit);
8795 
8796     bind(L_fill_128_bytes_loop_pre_header);
8797     {
8798       mov(rtmp, to);
8799       andq(rtmp, 31);
8800       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
8801       negq(rtmp);
8802       addq(rtmp, 32);
8803       mov64(r8, -1L);
8804       bzhiq(r8, r8, rtmp);
8805       kmovql(k2, r8);
8806       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit);
8807       addq(to, rtmp);
8808       shrq(rtmp, shift);
8809       subq(count, rtmp);
8810     }
8811 
8812     cmpq(count, 128 >> shift);
8813     jcc(Assembler::less, L_fill_start);
8814 
8815     bind(L_fill_128_bytes_loop_header);
8816     subq(count, 128 >> shift);
8817 
8818     align32();
8819     bind(L_fill_128_bytes_loop);
8820       fill64(to, 0, xtmp);
8821       fill64(to, 64, xtmp);
8822       addq(to, 128);
8823       subq(count, 128 >> shift);
8824       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
8825 
8826     addq(count, 128 >> shift);
8827     jcc(Assembler::zero, L_exit);
8828     jmp(L_fill_start);
8829   }
8830 
8831   if (MaxVectorSize == 64) {
8832     // Sequence using 64 byte ZMM register.
8833     Label L_fill_128_bytes_zmm;
8834     Label L_fill_192_bytes_zmm;
8835     Label L_fill_192_bytes_loop_zmm;
8836     Label L_fill_192_bytes_loop_header_zmm;
8837     Label L_fill_192_bytes_loop_pre_header_zmm;
8838     Label L_fill_start_zmm_sequence;
8839 
8840     bind(L_fill_zmm_sequence);
8841     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
8842 
8843     bind(L_fill_start_zmm_sequence);
8844     cmpq(count, 64 >> shift);
8845     jccb(Assembler::greater, L_fill_128_bytes_zmm);
8846     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
8847     jmp(L_exit);
8848 
8849     bind(L_fill_128_bytes_zmm);
8850     cmpq(count, 128 >> shift);
8851     jccb(Assembler::greater, L_fill_192_bytes_zmm);
8852     fill64(to, 0, xtmp, true);
8853     subq(count, 64 >> shift);
8854     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
8855     jmp(L_exit);
8856 
8857     bind(L_fill_192_bytes_zmm);
8858     cmpq(count, 192 >> shift);
8859     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
8860     fill64(to, 0, xtmp, true);
8861     fill64(to, 64, xtmp, true);
8862     subq(count, 128 >> shift);
8863     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
8864     jmp(L_exit);
8865 
8866     bind(L_fill_192_bytes_loop_pre_header_zmm);
8867     {
8868       movq(rtmp, to);
8869       andq(rtmp, 63);
8870       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
8871       negq(rtmp);
8872       addq(rtmp, 64);
8873       mov64(r8, -1L);
8874       bzhiq(r8, r8, rtmp);
8875       kmovql(k2, r8);
8876       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit);
8877       addq(to, rtmp);
8878       shrq(rtmp, shift);
8879       subq(count, rtmp);
8880     }
8881 
8882     cmpq(count, 192 >> shift);
8883     jcc(Assembler::less, L_fill_start_zmm_sequence);
8884 
8885     bind(L_fill_192_bytes_loop_header_zmm);
8886     subq(count, 192 >> shift);
8887 
8888     align32();
8889     bind(L_fill_192_bytes_loop_zmm);
8890       fill64(to, 0, xtmp, true);
8891       fill64(to, 64, xtmp, true);
8892       fill64(to, 128, xtmp, true);
8893       addq(to, 192);
8894       subq(count, 192 >> shift);
8895       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
8896 
8897     addq(count, 192 >> shift);
8898     jcc(Assembler::zero, L_exit);
8899     jmp(L_fill_start_zmm_sequence);
8900   }
8901   bind(L_exit);
8902 }
8903 #endif
8904 #endif //COMPILER2_OR_JVMCI
8905 
8906 
8907 #ifdef _LP64
8908 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
8909   Label done;
8910   cvttss2sil(dst, src);
8911   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
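       // cvttss2si produces 0x80000000 (the integer indefinite value) for NaN and
       // out-of-range inputs, so only that result needs the slow-path fixup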
8912   cmpl(dst, 0x80000000); // float_sign_flip
8913   jccb(Assembler::notEqual, done);
8914   subptr(rsp, 8);
8915   movflt(Address(rsp, 0), src);
8916   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
8917   pop(dst);
8918   bind(done);
8919 }
8920 
8921 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
8922   Label done;
8923   cvttsd2sil(dst, src);
8924   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
8925   cmpl(dst, 0x80000000); // float_sign_flip
8926   jccb(Assembler::notEqual, done);
8927   subptr(rsp, 8);
8928   movdbl(Address(rsp, 0), src);
8929   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
8930   pop(dst);
8931   bind(done);
8932 }
8933 
8934 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
8935   Label done;
8936   cvttss2siq(dst, src);
8937   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8938   jccb(Assembler::notEqual, done);
8939   subptr(rsp, 8);
8940   movflt(Address(rsp, 0), src);
8941   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
8942   pop(dst);
8943   bind(done);
8944 }
8945 
8946 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
8947   Label done;
8948   cvttsd2siq(dst, src);
8949   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8950   jccb(Assembler::notEqual, done);
8951   subptr(rsp, 8);
8952   movdbl(Address(rsp, 0), src);
8953   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
8954   pop(dst);
8955   bind(done);
8956 }
8957 
8958 void MacroAssembler::cache_wb(Address line)
8959 {
8960   // 64-bit CPUs always support clflush
8961   assert(VM_Version::supports_clflush(), "clflush should be available");
8962   bool optimized = VM_Version::supports_clflushopt();
8963   bool no_evict = VM_Version::supports_clwb();
8964 
8965   // prefer clwb (writeback without evict), otherwise
8966   // prefer clflushopt (potentially parallel writeback with evict),
8967   // otherwise fall back on clflush (serial writeback with evict)
8968 
8969   if (optimized) {
8970     if (no_evict) {
8971       clwb(line);
8972     } else {
8973       clflushopt(line);
8974     }
8975   } else {
8976     // no need for fence when using CLFLUSH
8977     clflush(line);
8978   }
8979 }
8980 
8981 void MacroAssembler::cache_wbsync(bool is_pre)
8982 {
8983   assert(VM_Version::supports_clflush(), "clflush should be available");
8984   bool optimized = VM_Version::supports_clflushopt();
8985   bool no_evict = VM_Version::supports_clwb();
8986 
8987   // pick the correct implementation
8988 
8989   if (!is_pre && (optimized || no_evict)) {
8990     // need an sfence for post flush when using clflushopt or clwb,
8991     // otherwise no need for any synchronization
8992 
8993     sfence();
8994   }
8995 }
8996 
8997 #endif // _LP64
8998 
8999 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9000   switch (cond) {
9001     // Note some conditions are synonyms for others
9002     case Assembler::zero:         return Assembler::notZero;
9003     case Assembler::notZero:      return Assembler::zero;
9004     case Assembler::less:         return Assembler::greaterEqual;
9005     case Assembler::lessEqual:    return Assembler::greater;
9006     case Assembler::greater:      return Assembler::lessEqual;
9007     case Assembler::greaterEqual: return Assembler::less;
9008     case Assembler::below:        return Assembler::aboveEqual;
9009     case Assembler::belowEqual:   return Assembler::above;
9010     case Assembler::above:        return Assembler::belowEqual;
9011     case Assembler::aboveEqual:   return Assembler::below;
9012     case Assembler::overflow:     return Assembler::noOverflow;
9013     case Assembler::noOverflow:   return Assembler::overflow;
9014     case Assembler::negative:     return Assembler::positive;
9015     case Assembler::positive:     return Assembler::negative;
9016     case Assembler::parity:       return Assembler::noParity;
9017     case Assembler::noParity:     return Assembler::parity;
9018   }
9019   ShouldNotReachHere(); return Assembler::overflow;
9020 }
9021 
9022 SkipIfEqual::SkipIfEqual(
9023     MacroAssembler* masm, const bool* flag_addr, bool value) {
9024   _masm = masm;
9025   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9026   _masm->jcc(Assembler::equal, _label);
9027 }
9028 
9029 SkipIfEqual::~SkipIfEqual() {
9030   _masm->bind(_label);
9031 }
9032 
9033 // 32-bit Windows has its own fast-path implementation
9034 // of get_thread
9035 #if !defined(WIN32) || defined(_LP64)
9036 
9037 // This is simply a call to Thread::current()
9038 void MacroAssembler::get_thread(Register thread) {
9039   if (thread != rax) {
9040     push(rax);
9041   }
9042   LP64_ONLY(push(rdi);)
9043   LP64_ONLY(push(rsi);)
9044   push(rdx);
9045   push(rcx);
9046 #ifdef _LP64
9047   push(r8);
9048   push(r9);
9049   push(r10);
9050   push(r11);
9051 #endif
9052 
9053   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9054 
9055 #ifdef _LP64
9056   pop(r11);
9057   pop(r10);
9058   pop(r9);
9059   pop(r8);
9060 #endif
9061   pop(rcx);
9062   pop(rdx);
9063   LP64_ONLY(pop(rsi);)
9064   LP64_ONLY(pop(rdi);)
9065   if (thread != rax) {
9066     mov(thread, rax);
9067     pop(rax);
9068   }
9069 }
9070 
9071 
9072 #endif // !WIN32 || _LP64
--- EOF ---