1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/flags/flagSetting.hpp"
  44 #include "runtime/interfaceSupport.inline.hpp"
  45 #include "runtime/jniHandles.hpp"
  46 #include "runtime/objectMonitor.hpp"
  47 #include "runtime/os.hpp"
  48 #include "runtime/safepoint.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/thread.hpp"
  53 #include "utilities/macros.hpp"
  54 #include "crc32c.h"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
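     // Table of negated conditions: reverse[cc] is the condition that holds exactly
     // when cc does not, indexed by the Assembler::Condition encodings (0x0..0xf)
     // noted in the comments below (e.g. reverse[equal] == notEqual).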
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that differ between 32 and 64 bit,
  94 // unless the difference is trivial (a line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 
 119 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 120   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 121 }
 122 
 123 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 124   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 125 }
 126 
 127 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 128   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 129 }
 130 
 131 void MacroAssembler::extend_sign(Register hi, Register lo) {
 132   // According to Intel Doc. AP-526, "Integer Divide", p.18.
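       // cdq sign-extends eax into edx in a single instruction; otherwise the
       // sign extension is emulated with a copy and an arithmetic shift by 31.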
 133   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 134     cdql();
 135   } else {
 136     movl(hi, lo);
 137     sarl(hi, 31);
 138   }
 139 }
 140 
 141 void MacroAssembler::jC2(Register tmp, Label& L) {
 142   // set parity bit if FPU flag C2 is set (via rax)
 143   save_rax(tmp);
 144   fwait(); fnstsw_ax();
 145   sahf();
 146   restore_rax(tmp);
 147   // branch
 148   jcc(Assembler::parity, L);
 149 }
 150 
 151 void MacroAssembler::jnC2(Register tmp, Label& L) {
 152   // set parity bit if FPU flag C2 is set (via rax)
 153   save_rax(tmp);
 154   fwait(); fnstsw_ax();
 155   sahf();
 156   restore_rax(tmp);
 157   // branch
 158   jcc(Assembler::noParity, L);
 159 }
 160 
 161 // 32bit can do a case table jump in one instruction but we no longer allow the base
 162 // to be installed in the Address class
 163 void MacroAssembler::jump(ArrayAddress entry) {
 164   jmp(as_Address(entry));
 165 }
 166 
 167 // Note: y_lo will be destroyed
 168 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 169   // Long compare for Java (semantics as described in JVM spec.)
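       // The result (-1, 0 or 1 for x < y, x == y, x > y) is left in x_hi.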
 170   Label high, low, done;
 171 
 172   cmpl(x_hi, y_hi);
 173   jcc(Assembler::less, low);
 174   jcc(Assembler::greater, high);
 175   // x_hi is the return register
 176   xorl(x_hi, x_hi);
 177   cmpl(x_lo, y_lo);
 178   jcc(Assembler::below, low);
 179   jcc(Assembler::equal, done);
 180 
 181   bind(high);
 182   xorl(x_hi, x_hi);
 183   increment(x_hi);
 184   jmp(done);
 185 
 186   bind(low);
 187   xorl(x_hi, x_hi);
 188   decrementl(x_hi);
 189 
 190   bind(done);
 191 }
 192 
 193 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 194     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 195 }
 196 
 197 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 198   // leal(dst, as_Address(adr));
 199   // see note in movl as to why we must use a move
 200   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 201 }
 202 
 203 void MacroAssembler::leave() {
 204   mov(rsp, rbp);
 205   pop(rbp);
 206 }
 207 
 208 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 209   // Multiplication of two Java long values stored on the stack
 210   // as illustrated below. Result is in rdx:rax.
 211   //
 212   // rsp ---> [  ??  ] \               \
 213   //            ....    | y_rsp_offset  |
 214   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 215   //          [ y_hi ]                  | (in bytes)
 216   //            ....                    |
 217   //          [ x_lo ]                 /
 218   //          [ x_hi ]
 219   //            ....
 220   //
 221   // Basic idea: lo(result) = lo(x_lo * y_lo)
 222   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
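       // This follows from expanding the product modulo 2^64:
       //   (x_hi*2^32 + x_lo) * (y_hi*2^32 + y_lo)
       //     = x_lo*y_lo + (x_hi*y_lo + x_lo*y_hi)*2^32          (mod 2^64)
       // i.e. the x_hi*y_hi term and the high halves of the cross products fall
       // outside the low 64 bits of the result.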
 223   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 224   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 225   Label quick;
 226   // load x_hi, y_hi and check if quick
 227   // multiplication is possible
 228   movl(rbx, x_hi);
 229   movl(rcx, y_hi);
 230   movl(rax, rbx);
 231   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 232   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 233   // do full multiplication
 234   // 1st step
 235   mull(y_lo);                                    // x_hi * y_lo
 236   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 237   // 2nd step
 238   movl(rax, x_lo);
 239   mull(rcx);                                     // x_lo * y_hi
 240   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 241   // 3rd step
 242   bind(quick);                                   // note: rbx = 0 if quick multiply!
 243   movl(rax, x_lo);
 244   mull(y_lo);                                    // x_lo * y_lo
 245   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 246 }
 247 
 248 void MacroAssembler::lneg(Register hi, Register lo) {
 249   negl(lo);
 250   adcl(hi, 0);
 251   negl(hi);
 252 }
 253 
 254 void MacroAssembler::lshl(Register hi, Register lo) {
 255   // Java shift left long support (semantics as described in JVM spec., p.305)
 256   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 257   // shift value is in rcx !
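       // For s >= 32 the result is hi = lo << (s - 32), lo = 0: the code moves lo
       // into hi and clears lo, then shld/shl apply the remaining s mod 32 bits.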
 258   assert(hi != rcx, "must not use rcx");
 259   assert(lo != rcx, "must not use rcx");
 260   const Register s = rcx;                        // shift count
 261   const int      n = BitsPerWord;
 262   Label L;
 263   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 264   cmpl(s, n);                                    // if (s < n)
 265   jcc(Assembler::less, L);                       // else (s >= n)
 266   movl(hi, lo);                                  // x := x << n
 267   xorl(lo, lo);
 268   // Note: subl(s, n) is not needed since the Intel shift instructions use rcx mod n!
 269   bind(L);                                       // s (mod n) < n
 270   shldl(hi, lo);                                 // x := x << s
 271   shll(lo);
 272 }
 273 
 274 
 275 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 276   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 277   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 278   assert(hi != rcx, "must not use rcx");
 279   assert(lo != rcx, "must not use rcx");
 280   const Register s = rcx;                        // shift count
 281   const int      n = BitsPerWord;
 282   Label L;
 283   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 284   cmpl(s, n);                                    // if (s < n)
 285   jcc(Assembler::less, L);                       // else (s >= n)
 286   movl(lo, hi);                                  // x := x >> n
 287   if (sign_extension) sarl(hi, 31);
 288   else                xorl(hi, hi);
 289   // Note: subl(s, n) is not needed since the Intel shift instructions use rcx mod n!
 290   bind(L);                                       // s (mod n) < n
 291   shrdl(lo, hi);                                 // x := x >> s
 292   if (sign_extension) sarl(hi);
 293   else                shrl(hi);
 294 }
 295 
 296 void MacroAssembler::movoop(Register dst, jobject obj) {
 297   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 298 }
 299 
 300 void MacroAssembler::movoop(Address dst, jobject obj) {
 301   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 302 }
 303 
 304 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 305   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 309   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 313   // scratch register is not used,
 314   // it is defined to match parameters of 64-bit version of this method.
 315   if (src.is_lval()) {
 316     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 317   } else {
 318     movl(dst, as_Address(src));
 319   }
 320 }
 321 
 322 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 323   movl(as_Address(dst), src);
 324 }
 325 
 326 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 327   movl(dst, as_Address(src));
 328 }
 329 
 330 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 331 void MacroAssembler::movptr(Address dst, intptr_t src) {
 332   movl(dst, src);
 333 }
 334 
 335 void MacroAssembler::pushoop(jobject obj) {
 336   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 337 }
 338 
 339 void MacroAssembler::pushklass(Metadata* obj) {
 340   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 341 }
 342 
 343 void MacroAssembler::pushptr(AddressLiteral src) {
 344   if (src.is_lval()) {
 345     push_literal32((int32_t)src.target(), src.rspec());
 346   } else {
 347     pushl(as_Address(src));
 348   }
 349 }
 350 
 351 static void pass_arg0(MacroAssembler* masm, Register arg) {
 352   masm->push(arg);
 353 }
 354 
 355 static void pass_arg1(MacroAssembler* masm, Register arg) {
 356   masm->push(arg);
 357 }
 358 
 359 static void pass_arg2(MacroAssembler* masm, Register arg) {
 360   masm->push(arg);
 361 }
 362 
 363 static void pass_arg3(MacroAssembler* masm, Register arg) {
 364   masm->push(arg);
 365 }
 366 
 367 #ifndef PRODUCT
 368 extern "C" void findpc(intptr_t x);
 369 #endif
 370 
 371 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 372   // In order to get locks to work, we need to fake an in_VM state
 373   JavaThread* thread = JavaThread::current();
 374   JavaThreadState saved_state = thread->thread_state();
 375   thread->set_thread_state(_thread_in_vm);
 376   if (ShowMessageBoxOnError) {
 377     JavaThread* thread = JavaThread::current();
 378     JavaThreadState saved_state = thread->thread_state();
 379     thread->set_thread_state(_thread_in_vm);
 380     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 381       ttyLocker ttyl;
 382       BytecodeCounter::print();
 383     }
 384     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 385     // This is the value of eip which points to where verify_oop will return.
 386     if (os::message_box(msg, "Execution stopped, print registers?")) {
 387       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 388       BREAKPOINT;
 389     }
 390   }
 391   fatal("DEBUG MESSAGE: %s", msg);
 392 }
 393 
 394 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 395   ttyLocker ttyl;
 396   FlagSetting fs(Debugging, true);
 397   tty->print_cr("eip = 0x%08x", eip);
 398 #ifndef PRODUCT
 399   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 400     tty->cr();
 401     findpc(eip);
 402     tty->cr();
 403   }
 404 #endif
 405 #define PRINT_REG(rax) \
 406   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 407   PRINT_REG(rax);
 408   PRINT_REG(rbx);
 409   PRINT_REG(rcx);
 410   PRINT_REG(rdx);
 411   PRINT_REG(rdi);
 412   PRINT_REG(rsi);
 413   PRINT_REG(rbp);
 414   PRINT_REG(rsp);
 415 #undef PRINT_REG
 416   // Print some words near the top of the stack.
 417   int* dump_sp = (int*) rsp;
 418   for (int col1 = 0; col1 < 8; col1++) {
 419     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 420     os::print_location(tty, *dump_sp++);
 421   }
 422   for (int row = 0; row < 16; row++) {
 423     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 424     for (int col = 0; col < 8; col++) {
 425       tty->print(" 0x%08x", *dump_sp++);
 426     }
 427     tty->cr();
 428   }
 429   // Print some instructions around pc:
 430   Disassembler::decode((address)eip-64, (address)eip);
 431   tty->print_cr("--------");
 432   Disassembler::decode((address)eip, (address)eip+32);
 433 }
 434 
 435 void MacroAssembler::stop(const char* msg) {
 436   ExternalAddress message((address)msg);
 437   // push address of message
 438   pushptr(message.addr());
 439   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 440   pusha();                                            // push registers
 441   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 442   hlt();
 443 }
 444 
 445 void MacroAssembler::warn(const char* msg) {
 446   push_CPU_state();
 447 
 448   ExternalAddress message((address) msg);
 449   // push address of message
 450   pushptr(message.addr());
 451 
 452   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 453   addl(rsp, wordSize);       // discard argument
 454   pop_CPU_state();
 455 }
 456 
 457 void MacroAssembler::print_state() {
 458   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 459   pusha();                                            // push registers
 460 
 461   push_CPU_state();
 462   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 463   pop_CPU_state();
 464 
 465   popa();
 466   addl(rsp, wordSize);
 467 }
 468 
 469 #else // _LP64
 470 
 471 // 64 bit versions
 472 
 473 Address MacroAssembler::as_Address(AddressLiteral adr) {
 474   // amd64 always does this as a pc-rel
 475   // we can be absolute or disp based on the instruction type
 476   // jmp/call are displacements others are absolute
 477   assert(!adr.is_lval(), "must be rval");
 478   assert(reachable(adr), "must be");
 479   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 480 
 481 }
 482 
 483 Address MacroAssembler::as_Address(ArrayAddress adr) {
 484   AddressLiteral base = adr.base();
 485   lea(rscratch1, base);
 486   Address index = adr.index();
 487   assert(index._disp == 0, "must not have disp"); // maybe it can?
 488   Address array(rscratch1, index._index, index._scale, index._disp);
 489   return array;
 490 }
 491 
 492 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 493   Label L, E;
 494 
 495 #ifdef _WIN64
 496   // Windows always allocates space for its register args
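       // (This is the 32-byte "home"/shadow area that the Windows x64 calling
       //  convention requires the caller to reserve for the register arguments.)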
 497   assert(num_args <= 4, "only register arguments supported");
 498   subq(rsp,  frame::arg_reg_save_area_bytes);
 499 #endif
 500 
 501   // Align stack if necessary
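       // The C ABI requires rsp to be 16-byte aligned at the call instruction;
       // if it is currently off by 8, insert 8 bytes of padding around the call.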
 502   testl(rsp, 15);
 503   jcc(Assembler::zero, L);
 504 
 505   subq(rsp, 8);
 506   {
 507     call(RuntimeAddress(entry_point));
 508   }
 509   addq(rsp, 8);
 510   jmp(E);
 511 
 512   bind(L);
 513   {
 514     call(RuntimeAddress(entry_point));
 515   }
 516 
 517   bind(E);
 518 
 519 #ifdef _WIN64
 520   // restore stack pointer
 521   addq(rsp, frame::arg_reg_save_area_bytes);
 522 #endif
 523 
 524 }
 525 
 526 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 527   assert(!src2.is_lval(), "should use cmpptr");
 528 
 529   if (reachable(src2)) {
 530     cmpq(src1, as_Address(src2));
 531   } else {
 532     lea(rscratch1, src2);
 533     Assembler::cmpq(src1, Address(rscratch1, 0));
 534   }
 535 }
 536 
 537 int MacroAssembler::corrected_idivq(Register reg) {
 538   // Full implementation of Java ldiv and lrem; checks for special
 539   // case as described in JVM spec., p.243 & p.271.  The function
 540   // returns the (pc) offset of the idivl instruction - may be needed
 541   // for implicit exceptions.
 542   //
 543   //         normal case                           special case
 544   //
 545   // input : rax: dividend                         min_long
 546   //         reg: divisor   (may not be eax/edx)   -1
 547   //
 548   // output: rax: quotient  (= rax idiv reg)       min_long
 549   //         rdx: remainder (= rax irem reg)       0
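       // (The special case exists because idiv of min_long by -1 overflows the
       //  quotient and would raise #DE; the JVM spec requires min_long instead.)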
 550   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 551   static const int64_t min_long = 0x8000000000000000;
 552   Label normal_case, special_case;
 553 
 554   // check for special case
 555   cmp64(rax, ExternalAddress((address) &min_long));
 556   jcc(Assembler::notEqual, normal_case);
 557   xorl(rdx, rdx); // prepare rdx for possible special case (where
 558                   // remainder = 0)
 559   cmpq(reg, -1);
 560   jcc(Assembler::equal, special_case);
 561 
 562   // handle normal case
 563   bind(normal_case);
 564   cdqq();
 565   int idivq_offset = offset();
 566   idivq(reg);
 567 
 568   // normal and special case exit
 569   bind(special_case);
 570 
 571   return idivq_offset;
 572 }
 573 
 574 void MacroAssembler::decrementq(Register reg, int value) {
 575   if (value == min_jint) { subq(reg, value); return; }
 576   if (value <  0) { incrementq(reg, -value); return; }
 577   if (value == 0) {                        ; return; }
 578   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 579   /* else */      { subq(reg, value)       ; return; }
 580 }
 581 
 582 void MacroAssembler::decrementq(Address dst, int value) {
 583   if (value == min_jint) { subq(dst, value); return; }
 584   if (value <  0) { incrementq(dst, -value); return; }
 585   if (value == 0) {                        ; return; }
 586   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 587   /* else */      { subq(dst, value)       ; return; }
 588 }
 589 
 590 void MacroAssembler::incrementq(AddressLiteral dst) {
 591   if (reachable(dst)) {
 592     incrementq(as_Address(dst));
 593   } else {
 594     lea(rscratch1, dst);
 595     incrementq(Address(rscratch1, 0));
 596   }
 597 }
 598 
 599 void MacroAssembler::incrementq(Register reg, int value) {
 600   if (value == min_jint) { addq(reg, value); return; }
 601   if (value <  0) { decrementq(reg, -value); return; }
 602   if (value == 0) {                        ; return; }
 603   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 604   /* else */      { addq(reg, value)       ; return; }
 605 }
 606 
 607 void MacroAssembler::incrementq(Address dst, int value) {
 608   if (value == min_jint) { addq(dst, value); return; }
 609   if (value <  0) { decrementq(dst, -value); return; }
 610   if (value == 0) {                        ; return; }
 611   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 612   /* else */      { addq(dst, value)       ; return; }
 613 }
 614 
 615 // 32bit can do a case table jump in one instruction but we no longer allow the base
 616 // to be installed in the Address class
 617 void MacroAssembler::jump(ArrayAddress entry) {
 618   lea(rscratch1, entry.base());
 619   Address dispatch = entry.index();
 620   assert(dispatch._base == noreg, "must be");
 621   dispatch._base = rscratch1;
 622   jmp(dispatch);
 623 }
 624 
 625 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 626   ShouldNotReachHere(); // 64bit doesn't use two regs
 627   cmpq(x_lo, y_lo);
 628 }
 629 
 630 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 631     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 632 }
 633 
 634 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 635   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 636   movptr(dst, rscratch1);
 637 }
 638 
 639 void MacroAssembler::leave() {
 640   // %%% is this really better? Why not on 32bit too?
 641   emit_int8((unsigned char)0xC9); // LEAVE
 642 }
 643 
 644 void MacroAssembler::lneg(Register hi, Register lo) {
 645   ShouldNotReachHere(); // 64bit doesn't use two regs
 646   negq(lo);
 647 }
 648 
 649 void MacroAssembler::movoop(Register dst, jobject obj) {
 650   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 651 }
 652 
 653 void MacroAssembler::movoop(Address dst, jobject obj) {
 654   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 655   movq(dst, rscratch1);
 656 }
 657 
 658 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 659   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 660 }
 661 
 662 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 663   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 664   movq(dst, rscratch1);
 665 }
 666 
 667 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 668   if (src.is_lval()) {
 669     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 670   } else {
 671     if (reachable(src)) {
 672       movq(dst, as_Address(src));
 673     } else {
 674       lea(scratch, src);
 675       movq(dst, Address(scratch, 0));
 676     }
 677   }
 678 }
 679 
 680 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 681   movq(as_Address(dst), src);
 682 }
 683 
 684 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 685   movq(dst, as_Address(src));
 686 }
 687 
 688 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 689 void MacroAssembler::movptr(Address dst, intptr_t src) {
 690   if (is_simm32(src)) {
 691     movptr(dst, checked_cast<int32_t>(src));
 692   } else {
 693     mov64(rscratch1, src);
 694     movq(dst, rscratch1);
 695   }
 696 }
 697 
 698 // These are mostly for initializing NULL
 699 void MacroAssembler::movptr(Address dst, int32_t src) {
 700   movslq(dst, src);
 701 }
 702 
 703 void MacroAssembler::movptr(Register dst, int32_t src) {
 704   mov64(dst, (intptr_t)src);
 705 }
 706 
 707 void MacroAssembler::pushoop(jobject obj) {
 708   movoop(rscratch1, obj);
 709   push(rscratch1);
 710 }
 711 
 712 void MacroAssembler::pushklass(Metadata* obj) {
 713   mov_metadata(rscratch1, obj);
 714   push(rscratch1);
 715 }
 716 
 717 void MacroAssembler::pushptr(AddressLiteral src) {
 718   lea(rscratch1, src);
 719   if (src.is_lval()) {
 720     push(rscratch1);
 721   } else {
 722     pushq(Address(rscratch1, 0));
 723   }
 724 }
 725 
 726 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 727   reset_last_Java_frame(r15_thread, clear_fp);
 728 }
 729 
 730 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 731                                          Register last_java_fp,
 732                                          address  last_java_pc) {
 733   vzeroupper();
 734   // determine last_java_sp register
 735   if (!last_java_sp->is_valid()) {
 736     last_java_sp = rsp;
 737   }
 738 
 739   // last_java_fp is optional
 740   if (last_java_fp->is_valid()) {
 741     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 742            last_java_fp);
 743   }
 744 
 745   // last_java_pc is optional
 746   if (last_java_pc != NULL) {
 747     Address java_pc(r15_thread,
 748                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 749     lea(rscratch1, InternalAddress(last_java_pc));
 750     movptr(java_pc, rscratch1);
 751   }
 752 
 753   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 754 }
 755 
 756 static void pass_arg0(MacroAssembler* masm, Register arg) {
 757   if (c_rarg0 != arg ) {
 758     masm->mov(c_rarg0, arg);
 759   }
 760 }
 761 
 762 static void pass_arg1(MacroAssembler* masm, Register arg) {
 763   if (c_rarg1 != arg ) {
 764     masm->mov(c_rarg1, arg);
 765   }
 766 }
 767 
 768 static void pass_arg2(MacroAssembler* masm, Register arg) {
 769   if (c_rarg2 != arg ) {
 770     masm->mov(c_rarg2, arg);
 771   }
 772 }
 773 
 774 static void pass_arg3(MacroAssembler* masm, Register arg) {
 775   if (c_rarg3 != arg ) {
 776     masm->mov(c_rarg3, arg);
 777   }
 778 }
 779 
 780 void MacroAssembler::stop(const char* msg) {
 781   if (ShowMessageBoxOnError) {
 782     address rip = pc();
 783     pusha(); // get regs on stack
 784     lea(c_rarg1, InternalAddress(rip));
 785     movq(c_rarg2, rsp); // pass pointer to regs array
 786   }
 787   lea(c_rarg0, ExternalAddress((address) msg));
 788   andq(rsp, -16); // align stack as required by ABI
 789   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 790   hlt();
 791 }
 792 
 793 void MacroAssembler::warn(const char* msg) {
 794   push(rbp);
 795   movq(rbp, rsp);
 796   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 797   push_CPU_state();   // keeps alignment at 16 bytes
 798   lea(c_rarg0, ExternalAddress((address) msg));
 799   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 800   call(rax);
 801   pop_CPU_state();
 802   mov(rsp, rbp);
 803   pop(rbp);
 804 }
 805 
 806 void MacroAssembler::print_state() {
 807   address rip = pc();
 808   pusha();            // get regs on stack
 809   push(rbp);
 810   movq(rbp, rsp);
 811   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 812   push_CPU_state();   // keeps alignment at 16 bytes
 813 
 814   lea(c_rarg0, InternalAddress(rip));
 815   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 816   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 817 
 818   pop_CPU_state();
 819   mov(rsp, rbp);
 820   pop(rbp);
 821   popa();
 822 }
 823 
 824 #ifndef PRODUCT
 825 extern "C" void findpc(intptr_t x);
 826 #endif
 827 
 828 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 829   // In order to get locks to work, we need to fake an in_VM state
 830   if (ShowMessageBoxOnError) {
 831     JavaThread* thread = JavaThread::current();
 832     JavaThreadState saved_state = thread->thread_state();
 833     thread->set_thread_state(_thread_in_vm);
 834 #ifndef PRODUCT
 835     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 836       ttyLocker ttyl;
 837       BytecodeCounter::print();
 838     }
 839 #endif
 840     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 841     // XXX correct this offset for amd64
 842     // This is the value of eip which points to where verify_oop will return.
 843     if (os::message_box(msg, "Execution stopped, print registers?")) {
 844       print_state64(pc, regs);
 845       BREAKPOINT;
 846     }
 847   }
 848   fatal("DEBUG MESSAGE: %s", msg);
 849 }
 850 
 851 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 852   ttyLocker ttyl;
 853   FlagSetting fs(Debugging, true);
 854   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 855 #ifndef PRODUCT
 856   tty->cr();
 857   findpc(pc);
 858   tty->cr();
 859 #endif
 860 #define PRINT_REG(rax, value) \
 861   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 862   PRINT_REG(rax, regs[15]);
 863   PRINT_REG(rbx, regs[12]);
 864   PRINT_REG(rcx, regs[14]);
 865   PRINT_REG(rdx, regs[13]);
 866   PRINT_REG(rdi, regs[8]);
 867   PRINT_REG(rsi, regs[9]);
 868   PRINT_REG(rbp, regs[10]);
 869   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 870   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 871   PRINT_REG(r8 , regs[7]);
 872   PRINT_REG(r9 , regs[6]);
 873   PRINT_REG(r10, regs[5]);
 874   PRINT_REG(r11, regs[4]);
 875   PRINT_REG(r12, regs[3]);
 876   PRINT_REG(r13, regs[2]);
 877   PRINT_REG(r14, regs[1]);
 878   PRINT_REG(r15, regs[0]);
 879 #undef PRINT_REG
 880   // Print some words near the top of the stack.
 881   int64_t* rsp = &regs[16];
 882   int64_t* dump_sp = rsp;
 883   for (int col1 = 0; col1 < 8; col1++) {
 884     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 885     os::print_location(tty, *dump_sp++);
 886   }
 887   for (int row = 0; row < 25; row++) {
 888     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 889     for (int col = 0; col < 4; col++) {
 890       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 891     }
 892     tty->cr();
 893   }
 894   // Print some instructions around pc:
 895   Disassembler::decode((address)pc-64, (address)pc);
 896   tty->print_cr("--------");
 897   Disassembler::decode((address)pc, (address)pc+32);
 898 }
 899 
 900 // The java_calling_convention describes stack locations as ideal slots on
 901 // a frame with no abi restrictions. Since we must observe abi restrictions
 902 // (like the placement of the register window) the slots must be biased by
 903 // the following value.
 904 static int reg2offset_in(VMReg r) {
 905   // Account for saved rbp and return address
 906   // This should really be in_preserve_stack_slots
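       // (+4 slots = saved rbp plus return address: 2 words of 8 bytes each, at
       //  4 bytes per VMRegImpl stack slot)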
 907   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 908 }
 909 
 910 static int reg2offset_out(VMReg r) {
 911   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 912 }
 913 
 914 // A long move
 915 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {
 916 
 917   // The calling convention assures us that each VMRegPair is either
 918   // really one physical register or adjacent stack slots.
 919 
 920   if (src.is_single_phys_reg() ) {
 921     if (dst.is_single_phys_reg()) {
 922       if (dst.first() != src.first()) {
 923         mov(dst.first()->as_Register(), src.first()->as_Register());
 924       }
 925     } else {
 926       assert(dst.is_single_reg(), "not a stack pair");
 927       movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
 928     }
 929   } else if (dst.is_single_phys_reg()) {
 930     assert(src.is_single_reg(),  "not a stack pair");
 931     movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
 932   } else {
 933     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 934     movq(rax, Address(rbp, reg2offset_in(src.first())));
 935     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 936   }
 937 }
 938 
 939 // A double move
 940 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {
 941 
 942   // The calling convention assures us that each VMRegPair is either
 943   // really one physical register or adjacent stack slots.
 944 
 945   if (src.is_single_phys_reg() ) {
 946     if (dst.is_single_phys_reg()) {
 947       // In theory these overlap but the ordering is such that this is likely a nop
 948       if ( src.first() != dst.first()) {
 949         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 950       }
 951     } else {
 952       assert(dst.is_single_reg(), "not a stack pair");
 953       movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
 954     }
 955   } else if (dst.is_single_phys_reg()) {
 956     assert(src.is_single_reg(),  "not a stack pair");
 957     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
 958   } else {
 959     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 960     movq(rax, Address(rbp, reg2offset_in(src.first())));
 961     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 962   }
 963 }
 964 
 965 
 966 // A float arg may have to do float reg int reg conversion
 967 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
 968   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 969 
 970   // The calling convention assures us that each VMRegPair is either
 971   // really one physical register or adjacent stack slots.
 972 
 973   if (src.first()->is_stack()) {
 974     if (dst.first()->is_stack()) {
 975       movl(rax, Address(rbp, reg2offset_in(src.first())));
 976       movptr(Address(rsp, reg2offset_out(dst.first())), rax);
 977     } else {
 978       // stack to reg
 979       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 980       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
 981     }
 982   } else if (dst.first()->is_stack()) {
 983     // reg to stack
 984     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 985     movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
 986   } else {
 987     // reg to reg
 988     // In theory these overlap but the ordering is such that this is likely a nop
 989     if ( src.first() != dst.first()) {
 990       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
 991     }
 992   }
 993 }
 994 
 995 // On 64 bit we will store integer like items to the stack as
 996 // 64 bits items (x86_32/64 abi) even though java would only store
 997 // 32bits for a parameter. On 32bit it will simply be 32 bits
 998 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
 999 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1000   if (src.first()->is_stack()) {
1001     if (dst.first()->is_stack()) {
1002       // stack to stack
1003       movslq(rax, Address(rbp, reg2offset_in(src.first())));
1004       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1005     } else {
1006       // stack to reg
1007       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1008     }
1009   } else if (dst.first()->is_stack()) {
1010     // reg to stack
1011     // Do we really have to sign extend???
1012     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1013     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1014   } else {
1015     // Do we really have to sign extend???
1016     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1017     if (dst.first() != src.first()) {
1018       movq(dst.first()->as_Register(), src.first()->as_Register());
1019     }
1020   }
1021 }
1022 
1023 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1024   if (src.first()->is_stack()) {
1025     if (dst.first()->is_stack()) {
1026       // stack to stack
1027       movq(rax, Address(rbp, reg2offset_in(src.first())));
1028       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1029     } else {
1030       // stack to reg
1031       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1032     }
1033   } else if (dst.first()->is_stack()) {
1034     // reg to stack
1035     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1036   } else {
1037     if (dst.first() != src.first()) {
1038       movq(dst.first()->as_Register(), src.first()->as_Register());
1039     }
1040   }
1041 }
1042 
1043 // An oop arg. Must pass a handle not the oop itself
1044 void MacroAssembler::object_move(OopMap* map,
1045                         int oop_handle_offset,
1046                         int framesize_in_slots,
1047                         VMRegPair src,
1048                         VMRegPair dst,
1049                         bool is_receiver,
1050                         int* receiver_offset) {
1051 
1052   // must pass a handle. First figure out the location we use as a handle
1053 
1054   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1055 
1056   // See if oop is NULL; if it is we need no handle
1057 
1058   if (src.first()->is_stack()) {
1059 
1060     // Oop is already on the stack as an argument
1061     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1062     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1063     if (is_receiver) {
1064       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1065     }
1066 
1067     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1068     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1069     // conditionally move a NULL
1070     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1071   } else {
1072 
1073     // Oop is in a register; we must store it to the space we reserve
1074     // on the stack for oop_handles and pass a handle if oop is non-NULL
1075 
1076     const Register rOop = src.first()->as_Register();
1077     int oop_slot;
1078     if (rOop == j_rarg0)
1079       oop_slot = 0;
1080     else if (rOop == j_rarg1)
1081       oop_slot = 1;
1082     else if (rOop == j_rarg2)
1083       oop_slot = 2;
1084     else if (rOop == j_rarg3)
1085       oop_slot = 3;
1086     else if (rOop == j_rarg4)
1087       oop_slot = 4;
1088     else {
1089       assert(rOop == j_rarg5, "wrong register");
1090       oop_slot = 5;
1091     }
1092 
1093     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1094     int offset = oop_slot*VMRegImpl::stack_slot_size;
1095 
1096     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1097     // Store oop in handle area, may be NULL
1098     movptr(Address(rsp, offset), rOop);
1099     if (is_receiver) {
1100       *receiver_offset = offset;
1101     }
1102 
1103     cmpptr(rOop, (int32_t)NULL_WORD);
1104     lea(rHandle, Address(rsp, offset));
1105     // conditionally move a NULL from the handle area where it was just stored
1106     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1107   }
1108 
1109   // If arg is on the stack then place it, otherwise it is already in the correct reg.
1110   if (dst.first()->is_stack()) {
1111     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1112   }
1113 }
1114 
1115 #endif // _LP64
1116 
1117 // Now versions that are common to 32/64 bit
1118 
1119 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1120   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1121 }
1122 
1123 void MacroAssembler::addptr(Register dst, Register src) {
1124   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1125 }
1126 
1127 void MacroAssembler::addptr(Address dst, Register src) {
1128   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1129 }
1130 
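     // Many of the AddressLiteral variants below share one pattern: if the target
     // is reachable with a 32-bit (rip-relative) displacement it is used directly,
     // otherwise the address is materialized in a scratch register first.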
1131 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1132   if (reachable(src)) {
1133     Assembler::addsd(dst, as_Address(src));
1134   } else {
1135     lea(rscratch1, src);
1136     Assembler::addsd(dst, Address(rscratch1, 0));
1137   }
1138 }
1139 
1140 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1141   if (reachable(src)) {
1142     addss(dst, as_Address(src));
1143   } else {
1144     lea(rscratch1, src);
1145     addss(dst, Address(rscratch1, 0));
1146   }
1147 }
1148 
1149 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1150   if (reachable(src)) {
1151     Assembler::addpd(dst, as_Address(src));
1152   } else {
1153     lea(rscratch1, src);
1154     Assembler::addpd(dst, Address(rscratch1, 0));
1155   }
1156 }
1157 
1158 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1159 // Stub code is generated once and never copied.
1160 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1161 void MacroAssembler::align64() {
1162   align(64, (unsigned long long) pc());
1163 }
1164 
1165 void MacroAssembler::align32() {
1166   align(32, (unsigned long long) pc());
1167 }
1168 
1169 void MacroAssembler::align(int modulus) {
1170   // 8273459: Ensure alignment is possible with current segment alignment
1171   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1172   align(modulus, offset());
1173 }
1174 
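     // Emit nops until 'target' (the current code offset or pc) reaches the next
     // multiple of 'modulus'.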
1175 void MacroAssembler::align(int modulus, int target) {
1176   if (target % modulus != 0) {
1177     nop(modulus - (target % modulus));
1178   }
1179 }
1180 
1181 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1182   // Used in sign-masking with aligned address.
1183   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1184   if (reachable(src)) {
1185     Assembler::andpd(dst, as_Address(src));
1186   } else {
1187     lea(scratch_reg, src);
1188     Assembler::andpd(dst, Address(scratch_reg, 0));
1189   }
1190 }
1191 
1192 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1193   // Used in sign-masking with aligned address.
1194   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1195   if (reachable(src)) {
1196     Assembler::andps(dst, as_Address(src));
1197   } else {
1198     lea(scratch_reg, src);
1199     Assembler::andps(dst, Address(scratch_reg, 0));
1200   }
1201 }
1202 
1203 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1204   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1205 }
1206 
1207 void MacroAssembler::atomic_incl(Address counter_addr) {
1208   lock();
1209   incrementl(counter_addr);
1210 }
1211 
1212 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1213   if (reachable(counter_addr)) {
1214     atomic_incl(as_Address(counter_addr));
1215   } else {
1216     lea(scr, counter_addr);
1217     atomic_incl(Address(scr, 0));
1218   }
1219 }
1220 
1221 #ifdef _LP64
1222 void MacroAssembler::atomic_incq(Address counter_addr) {
1223   lock();
1224   incrementq(counter_addr);
1225 }
1226 
1227 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1228   if (reachable(counter_addr)) {
1229     atomic_incq(as_Address(counter_addr));
1230   } else {
1231     lea(scr, counter_addr);
1232     atomic_incq(Address(scr, 0));
1233   }
1234 }
1235 #endif
1236 
1237 // Writes to successive stack pages until the given offset is reached, to check
1238 // for stack overflow + shadow pages.  This clobbers tmp.
1239 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1240   movptr(tmp, rsp);
1241   // Bang stack for total size given plus shadow page size.
1242   // Bang one page at a time because large size can bang beyond yellow and
1243   // red zones.
1244   Label loop;
1245   bind(loop);
1246   movl(Address(tmp, (-os::vm_page_size())), size );
1247   subptr(tmp, os::vm_page_size());
1248   subl(size, os::vm_page_size());
1249   jcc(Assembler::greater, loop);
1250 
1251   // Bang down shadow pages too.
1252   // At this point, (tmp-0) is the last address touched, so don't
1253   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1254   // was post-decremented.)  Skip this address by starting at i=1, and
1255   // touch a few more pages below.  N.B.  It is important to touch all
1256   // the way down including all pages in the shadow zone.
1257   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1258     // this could be any sized move but since it can serve as a debugging crumb
1259     // the bigger the better.
1260     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1261   }
1262 }
1263 
1264 void MacroAssembler::reserved_stack_check() {
1265     // testing if reserved zone needs to be enabled
1266     Label no_reserved_zone_enabling;
1267     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1268     NOT_LP64(get_thread(rsi);)
1269 
1270     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1271     jcc(Assembler::below, no_reserved_zone_enabling);
1272 
1273     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1274     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1275     should_not_reach_here();
1276 
1277     bind(no_reserved_zone_enabling);
1278 }
1279 
1280 void MacroAssembler::c2bool(Register x) {
1281   // implements x == 0 ? 0 : 1
1282   // note: must only look at least-significant byte of x
1283   //       since C-style booleans are stored in one byte
1284   //       only! (was bug)
1285   andl(x, 0xFF);
1286   setb(Assembler::notZero, x);
1287 }
1288 
1289 // Wouldn't need if AddressLiteral version had new name
1290 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1291   Assembler::call(L, rtype);
1292 }
1293 
1294 void MacroAssembler::call(Register entry) {
1295   Assembler::call(entry);
1296 }
1297 
1298 void MacroAssembler::call(AddressLiteral entry) {
1299   if (reachable(entry)) {
1300     Assembler::call_literal(entry.target(), entry.rspec());
1301   } else {
1302     lea(rscratch1, entry);
1303     Assembler::call(rscratch1);
1304   }
1305 }
1306 
1307 void MacroAssembler::ic_call(address entry, jint method_index) {
1308   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
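       // rax is the inline cache register on x86; preloading it with non_oop_word
       // marks the call site as not yet resolved.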
1309   movptr(rax, (intptr_t)Universe::non_oop_word());
1310   call(AddressLiteral(entry, rh));
1311 }
1312 
1313 // Implementation of call_VM versions
1314 
1315 void MacroAssembler::call_VM(Register oop_result,
1316                              address entry_point,
1317                              bool check_exceptions) {
1318   Label C, E;
1319   call(C, relocInfo::none);
1320   jmp(E);
1321 
1322   bind(C);
1323   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1324   ret(0);
1325 
1326   bind(E);
1327 }
1328 
1329 void MacroAssembler::call_VM(Register oop_result,
1330                              address entry_point,
1331                              Register arg_1,
1332                              bool check_exceptions) {
1333   Label C, E;
1334   call(C, relocInfo::none);
1335   jmp(E);
1336 
1337   bind(C);
1338   pass_arg1(this, arg_1);
1339   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1340   ret(0);
1341 
1342   bind(E);
1343 }
1344 
1345 void MacroAssembler::call_VM(Register oop_result,
1346                              address entry_point,
1347                              Register arg_1,
1348                              Register arg_2,
1349                              bool check_exceptions) {
1350   Label C, E;
1351   call(C, relocInfo::none);
1352   jmp(E);
1353 
1354   bind(C);
1355 
1356   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1357 
1358   pass_arg2(this, arg_2);
1359   pass_arg1(this, arg_1);
1360   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1361   ret(0);
1362 
1363   bind(E);
1364 }
1365 
1366 void MacroAssembler::call_VM(Register oop_result,
1367                              address entry_point,
1368                              Register arg_1,
1369                              Register arg_2,
1370                              Register arg_3,
1371                              bool check_exceptions) {
1372   Label C, E;
1373   call(C, relocInfo::none);
1374   jmp(E);
1375 
1376   bind(C);
1377 
1378   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1379   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1380   pass_arg3(this, arg_3);
1381 
1382   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1383   pass_arg2(this, arg_2);
1384 
1385   pass_arg1(this, arg_1);
1386   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1387   ret(0);
1388 
1389   bind(E);
1390 }
1391 
1392 void MacroAssembler::call_VM(Register oop_result,
1393                              Register last_java_sp,
1394                              address entry_point,
1395                              int number_of_arguments,
1396                              bool check_exceptions) {
1397   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1398   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1399 }
1400 
1401 void MacroAssembler::call_VM(Register oop_result,
1402                              Register last_java_sp,
1403                              address entry_point,
1404                              Register arg_1,
1405                              bool check_exceptions) {
1406   pass_arg1(this, arg_1);
1407   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1408 }
1409 
1410 void MacroAssembler::call_VM(Register oop_result,
1411                              Register last_java_sp,
1412                              address entry_point,
1413                              Register arg_1,
1414                              Register arg_2,
1415                              bool check_exceptions) {
1416 
1417   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1418   pass_arg2(this, arg_2);
1419   pass_arg1(this, arg_1);
1420   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1421 }
1422 
1423 void MacroAssembler::call_VM(Register oop_result,
1424                              Register last_java_sp,
1425                              address entry_point,
1426                              Register arg_1,
1427                              Register arg_2,
1428                              Register arg_3,
1429                              bool check_exceptions) {
1430   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1431   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1432   pass_arg3(this, arg_3);
1433   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1434   pass_arg2(this, arg_2);
1435   pass_arg1(this, arg_1);
1436   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1437 }
1438 
1439 void MacroAssembler::super_call_VM(Register oop_result,
1440                                    Register last_java_sp,
1441                                    address entry_point,
1442                                    int number_of_arguments,
1443                                    bool check_exceptions) {
1444   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1445   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1446 }
1447 
1448 void MacroAssembler::super_call_VM(Register oop_result,
1449                                    Register last_java_sp,
1450                                    address entry_point,
1451                                    Register arg_1,
1452                                    bool check_exceptions) {
1453   pass_arg1(this, arg_1);
1454   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1455 }
1456 
1457 void MacroAssembler::super_call_VM(Register oop_result,
1458                                    Register last_java_sp,
1459                                    address entry_point,
1460                                    Register arg_1,
1461                                    Register arg_2,
1462                                    bool check_exceptions) {
1463 
1464   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1465   pass_arg2(this, arg_2);
1466   pass_arg1(this, arg_1);
1467   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1468 }
1469 
1470 void MacroAssembler::super_call_VM(Register oop_result,
1471                                    Register last_java_sp,
1472                                    address entry_point,
1473                                    Register arg_1,
1474                                    Register arg_2,
1475                                    Register arg_3,
1476                                    bool check_exceptions) {
1477   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1478   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1479   pass_arg3(this, arg_3);
1480   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1481   pass_arg2(this, arg_2);
1482   pass_arg1(this, arg_1);
1483   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1484 }
1485 
1486 void MacroAssembler::call_VM_base(Register oop_result,
1487                                   Register java_thread,
1488                                   Register last_java_sp,
1489                                   address  entry_point,
1490                                   int      number_of_arguments,
1491                                   bool     check_exceptions) {
1492   // determine java_thread register
1493   if (!java_thread->is_valid()) {
1494 #ifdef _LP64
1495     java_thread = r15_thread;
1496 #else
1497     java_thread = rdi;
1498     get_thread(java_thread);
1499 #endif // LP64
1500   }
1501   // determine last_java_sp register
1502   if (!last_java_sp->is_valid()) {
1503     last_java_sp = rsp;
1504   }
1505   // debugging support
1506   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1507   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1508 #ifdef ASSERT
1509   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1510   // r12 is the heapbase.
1511   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1512 #endif // ASSERT
1513 
1514   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1515   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1516 
1517   // push java thread (becomes first argument of C function)
1518 
1519   NOT_LP64(push(java_thread); number_of_arguments++);
1520   LP64_ONLY(mov(c_rarg0, r15_thread));
1521 
1522   // set last Java frame before call
1523   assert(last_java_sp != rbp, "can't use ebp/rbp");
1524 
1525   // Only interpreter should have to set fp
1526   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1527 
1528   // do the call, remove parameters
1529   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1530 
1531   // restore the thread (cannot use the pushed argument since arguments
1532   // may be overwritten by C code generated by an optimizing compiler);
1533   // however can use the register value directly if it is callee saved.
1534   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1535     // rdi & rsi (also r15) are callee saved -> nothing to do
1536 #ifdef ASSERT
1537     guarantee(java_thread != rax, "change this code");
1538     push(rax);
1539     { Label L;
1540       get_thread(rax);
1541       cmpptr(java_thread, rax);
1542       jcc(Assembler::equal, L);
1543       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1544       bind(L);
1545     }
1546     pop(rax);
1547 #endif
1548   } else {
1549     get_thread(java_thread);
1550   }
1551   // reset last Java frame
1552   // Only interpreter should have to clear fp
1553   reset_last_Java_frame(java_thread, true);
1554 
  // The C++ interpreter handles popframe/earlyret in the interpreter itself
1556   check_and_handle_popframe(java_thread);
1557   check_and_handle_earlyret(java_thread);
1558 
1559   if (check_exceptions) {
1560     // check for pending exceptions (java_thread is set upon return)
1561     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1562 #ifndef _LP64
1563     jump_cc(Assembler::notEqual,
1564             RuntimeAddress(StubRoutines::forward_exception_entry()));
1565 #else
    // This used to be a conditional jump to forward_exception, but after
    // relocation the conditional branch might not reach the target. So we
    // jump around an unconditional jump, which can always reach it.
1569 
1570     Label ok;
1571     jcc(Assembler::equal, ok);
1572     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1573     bind(ok);
1574 #endif // LP64
1575   }
1576 
1577   // get oop result if there is one and reset the value in the thread
1578   if (oop_result->is_valid()) {
1579     get_vm_result(oop_result, java_thread);
1580   }
1581 }
1582 
1583 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1584 
  // Calculating the value for last_Java_sp is somewhat subtle. call_VM does
  // an intermediate call which places a return address on the stack just
  // under the stack pointer as the user finished with it. This allows us to
  // retrieve last_Java_pc from last_Java_sp[-1].
  // On 32-bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64-bit call_VM can only use register args,
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
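  //
  // Illustration (64-bit case; the 32-bit case just adds the pushed arguments):
  //
  //   rsp            -> return address pushed by the intermediate call
  //   rsp + wordSize -> value rsp had before that call; this is what we
  //                     record as last_Java_sp
  //
  // hence the lea(rax, Address(rsp, wordSize)) below, and the
  // (1 + number_of_arguments) * wordSize displacement on 32-bit.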
1594 
1595 #ifdef _LP64
1596   // We've pushed one address, correct last_Java_sp
1597   lea(rax, Address(rsp, wordSize));
1598 #else
1599   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1600 #endif // LP64
1601 
1602   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1603 
1604 }
1605 
1606 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1607 void MacroAssembler::call_VM_leaf0(address entry_point) {
1608   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1609 }
1610 
1611 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1612   call_VM_leaf_base(entry_point, number_of_arguments);
1613 }
1614 
1615 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1616   pass_arg0(this, arg_0);
1617   call_VM_leaf(entry_point, 1);
1618 }
1619 
1620 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1621 
1622   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1623   pass_arg1(this, arg_1);
1624   pass_arg0(this, arg_0);
1625   call_VM_leaf(entry_point, 2);
1626 }
1627 
1628 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1629   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1630   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1631   pass_arg2(this, arg_2);
1632   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1633   pass_arg1(this, arg_1);
1634   pass_arg0(this, arg_0);
1635   call_VM_leaf(entry_point, 3);
1636 }
1637 
1638 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1639   pass_arg0(this, arg_0);
1640   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1641 }
1642 
1643 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1644 
1645   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1646   pass_arg1(this, arg_1);
1647   pass_arg0(this, arg_0);
1648   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1649 }
1650 
1651 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1652   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1653   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1654   pass_arg2(this, arg_2);
1655   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1656   pass_arg1(this, arg_1);
1657   pass_arg0(this, arg_0);
1658   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1659 }
1660 
1661 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1662   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1663   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1664   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1665   pass_arg3(this, arg_3);
1666   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1667   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1668   pass_arg2(this, arg_2);
1669   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1670   pass_arg1(this, arg_1);
1671   pass_arg0(this, arg_0);
1672   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1673 }
1674 
1675 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1676   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1677   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1678   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1679 }
1680 
1681 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1682   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1683   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1684 }
1685 
1686 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1687 }
1688 
1689 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1690 }
1691 
1692 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1693   if (reachable(src1)) {
1694     cmpl(as_Address(src1), imm);
1695   } else {
1696     lea(rscratch1, src1);
1697     cmpl(Address(rscratch1, 0), imm);
1698   }
1699 }
1700 
1701 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1702   assert(!src2.is_lval(), "use cmpptr");
1703   if (reachable(src2)) {
1704     cmpl(src1, as_Address(src2));
1705   } else {
1706     lea(rscratch1, src2);
1707     cmpl(src1, Address(rscratch1, 0));
1708   }
1709 }
1710 
1711 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1712   Assembler::cmpl(src1, imm);
1713 }
1714 
1715 void MacroAssembler::cmp32(Register src1, Address src2) {
1716   Assembler::cmpl(src1, src2);
1717 }
1718 
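// Three-way floating point compare: sets dst to -1, 0 or +1 for "below",
// "equal" and "above" respectively, with the unordered (NaN) result folded to
// -1 or +1 according to unordered_is_less. This mirrors the semantics of the
// Java fcmpl/fcmpg and dcmpl/dcmpg bytecodes ('l' maps NaN to -1, 'g' to +1).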
1719 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1720   ucomisd(opr1, opr2);
1721 
1722   Label L;
1723   if (unordered_is_less) {
1724     movl(dst, -1);
1725     jcc(Assembler::parity, L);
1726     jcc(Assembler::below , L);
1727     movl(dst, 0);
1728     jcc(Assembler::equal , L);
1729     increment(dst);
1730   } else { // unordered is greater
1731     movl(dst, 1);
1732     jcc(Assembler::parity, L);
1733     jcc(Assembler::above , L);
1734     movl(dst, 0);
1735     jcc(Assembler::equal , L);
1736     decrementl(dst);
1737   }
1738   bind(L);
1739 }
1740 
1741 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1742   ucomiss(opr1, opr2);
1743 
1744   Label L;
1745   if (unordered_is_less) {
1746     movl(dst, -1);
1747     jcc(Assembler::parity, L);
1748     jcc(Assembler::below , L);
1749     movl(dst, 0);
1750     jcc(Assembler::equal , L);
1751     increment(dst);
1752   } else { // unordered is greater
1753     movl(dst, 1);
1754     jcc(Assembler::parity, L);
1755     jcc(Assembler::above , L);
1756     movl(dst, 0);
1757     jcc(Assembler::equal , L);
1758     decrementl(dst);
1759   }
1760   bind(L);
1761 }
1762 
1763 
1764 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1765   if (reachable(src1)) {
1766     cmpb(as_Address(src1), imm);
1767   } else {
1768     lea(rscratch1, src1);
1769     cmpb(Address(rscratch1, 0), imm);
1770   }
1771 }
1772 
1773 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1774 #ifdef _LP64
1775   if (src2.is_lval()) {
1776     movptr(rscratch1, src2);
1777     Assembler::cmpq(src1, rscratch1);
1778   } else if (reachable(src2)) {
1779     cmpq(src1, as_Address(src2));
1780   } else {
1781     lea(rscratch1, src2);
1782     Assembler::cmpq(src1, Address(rscratch1, 0));
1783   }
1784 #else
1785   if (src2.is_lval()) {
1786     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1787   } else {
1788     cmpl(src1, as_Address(src2));
1789   }
1790 #endif // _LP64
1791 }
1792 
1793 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1794   assert(src2.is_lval(), "not a mem-mem compare");
1795 #ifdef _LP64
1796   // moves src2's literal address
1797   movptr(rscratch1, src2);
1798   Assembler::cmpq(src1, rscratch1);
1799 #else
1800   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1801 #endif // _LP64
1802 }
1803 
1804 void MacroAssembler::cmpoop(Register src1, Register src2) {
1805   cmpptr(src1, src2);
1806 }
1807 
1808 void MacroAssembler::cmpoop(Register src1, Address src2) {
1809   cmpptr(src1, src2);
1810 }
1811 
1812 #ifdef _LP64
1813 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1814   movoop(rscratch1, src2);
1815   cmpptr(src1, rscratch1);
1816 }
1817 #endif
1818 
1819 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1820   if (reachable(adr)) {
1821     lock();
1822     cmpxchgptr(reg, as_Address(adr));
1823   } else {
1824     lea(rscratch1, adr);
1825     lock();
1826     cmpxchgptr(reg, Address(rscratch1, 0));
1827   }
1828 }
1829 
1830 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1831   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1832 }
1833 
1834 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1835   if (reachable(src)) {
1836     Assembler::comisd(dst, as_Address(src));
1837   } else {
1838     lea(rscratch1, src);
1839     Assembler::comisd(dst, Address(rscratch1, 0));
1840   }
1841 }
1842 
1843 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1844   if (reachable(src)) {
1845     Assembler::comiss(dst, as_Address(src));
1846   } else {
1847     lea(rscratch1, src);
1848     Assembler::comiss(dst, Address(rscratch1, 0));
1849   }
1850 }
1851 
1852 
1853 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1854   Condition negated_cond = negate_condition(cond);
1855   Label L;
1856   jcc(negated_cond, L);
1857   pushf(); // Preserve flags
1858   atomic_incl(counter_addr);
1859   popf();
1860   bind(L);
1861 }
1862 
1863 int MacroAssembler::corrected_idivl(Register reg) {
1864   // Full implementation of Java idiv and irem; checks for
1865   // special case as described in JVM spec., p.243 & p.271.
1866   // The function returns the (pc) offset of the idivl
1867   // instruction - may be needed for implicit exceptions.
1868   //
1869   //         normal case                           special case
1870   //
  // input : rax: dividend                          min_int
  //         reg: divisor   (may not be rax/rdx)    -1
  //
  // output: rax: quotient  (= rax idiv reg)        min_int
  //         rdx: remainder (= rax irem reg)        0
1876   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
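  // Worked example of the special case: rax = min_int, reg = -1. Executing
  // idivl here would fault (the quotient 2^31 is not representable), so we
  // skip it: rax keeps min_int as the quotient, and rdx is zeroed just before
  // the branch, giving a remainder of 0, as the JVM spec requires.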
1877   const int min_int = 0x80000000;
1878   Label normal_case, special_case;
1879 
1880   // check for special case
1881   cmpl(rax, min_int);
1882   jcc(Assembler::notEqual, normal_case);
1883   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1884   cmpl(reg, -1);
1885   jcc(Assembler::equal, special_case);
1886 
1887   // handle normal case
1888   bind(normal_case);
1889   cdql();
1890   int idivl_offset = offset();
1891   idivl(reg);
1892 
1893   // normal and special case exit
1894   bind(special_case);
1895 
1896   return idivl_offset;
1897 }
1898 
1899 
1900 
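// Note for the increment/decrement helpers below: value == min_jint is
// special-cased first because -min_jint overflows back to min_jint, so the
// usual "negate and dispatch to the other helper" path would be incorrect.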
1901 void MacroAssembler::decrementl(Register reg, int value) {
1902   if (value == min_jint) {subl(reg, value) ; return; }
1903   if (value <  0) { incrementl(reg, -value); return; }
1904   if (value == 0) {                        ; return; }
1905   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1906   /* else */      { subl(reg, value)       ; return; }
1907 }
1908 
1909 void MacroAssembler::decrementl(Address dst, int value) {
1910   if (value == min_jint) {subl(dst, value) ; return; }
1911   if (value <  0) { incrementl(dst, -value); return; }
1912   if (value == 0) {                        ; return; }
1913   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1914   /* else */      { subl(dst, value)       ; return; }
1915 }
1916 
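// Signed division by 2^shift_value using an arithmetic shift. A plain sar
// rounds toward negative infinity, so for a negative dividend we first add
// (2^shift_value - 1) to make the result round toward zero, matching Java's
// idiv semantics. E.g. shift_value = 3: -1 >> 3 == -1, but
// (-1 + 7) >> 3 == 0 == -1 / 8.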
void MacroAssembler::division_with_shift(Register reg, int shift_value) {
  assert(shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl(reg, reg);
  jcc(Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind(_is_positive);
  sarl(reg, shift_value);
}
1933 
1934 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1935   if (reachable(src)) {
1936     Assembler::divsd(dst, as_Address(src));
1937   } else {
1938     lea(rscratch1, src);
1939     Assembler::divsd(dst, Address(rscratch1, 0));
1940   }
1941 }
1942 
1943 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1944   if (reachable(src)) {
1945     Assembler::divss(dst, as_Address(src));
1946   } else {
1947     lea(rscratch1, src);
1948     Assembler::divss(dst, Address(rscratch1, 0));
1949   }
1950 }
1951 
1952 void MacroAssembler::enter() {
1953   push(rbp);
1954   mov(rbp, rsp);
1955 }
1956 
1957 // A 5 byte nop that is safe for patching (see patch_verified_entry)
1958 void MacroAssembler::fat_nop() {
1959   if (UseAddressNop) {
1960     addr_nop_5();
1961   } else {
1962     emit_int8(0x26); // es:
1963     emit_int8(0x2e); // cs:
1964     emit_int8(0x64); // fs:
1965     emit_int8(0x65); // gs:
1966     emit_int8((unsigned char)0x90);
1967   }
1968 }
1969 
1970 #ifndef _LP64
1971 void MacroAssembler::fcmp(Register tmp) {
1972   fcmp(tmp, 1, true, true);
1973 }
1974 
1975 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
1976   assert(!pop_right || pop_left, "usage error");
1977   if (VM_Version::supports_cmov()) {
1978     assert(tmp == noreg, "unneeded temp");
1979     if (pop_left) {
1980       fucomip(index);
1981     } else {
1982       fucomi(index);
1983     }
1984     if (pop_right) {
1985       fpop();
1986     }
1987   } else {
1988     assert(tmp != noreg, "need temp");
1989     if (pop_left) {
1990       if (pop_right) {
1991         fcompp();
1992       } else {
1993         fcomp(index);
1994       }
1995     } else {
1996       fcom(index);
1997     }
1998     // convert FPU condition into eflags condition via rax,
1999     save_rax(tmp);
2000     fwait(); fnstsw_ax();
2001     sahf();
2002     restore_rax(tmp);
2003   }
2004   // condition codes set as follows:
2005   //
2006   // CF (corresponds to C0) if x < y
2007   // PF (corresponds to C2) if unordered
2008   // ZF (corresponds to C3) if x = y
2009 }
2010 
2011 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2012   fcmp2int(dst, unordered_is_less, 1, true, true);
2013 }
2014 
2015 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2016   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2017   Label L;
2018   if (unordered_is_less) {
2019     movl(dst, -1);
2020     jcc(Assembler::parity, L);
2021     jcc(Assembler::below , L);
2022     movl(dst, 0);
2023     jcc(Assembler::equal , L);
2024     increment(dst);
2025   } else { // unordered is greater
2026     movl(dst, 1);
2027     jcc(Assembler::parity, L);
2028     jcc(Assembler::above , L);
2029     movl(dst, 0);
2030     jcc(Assembler::equal , L);
2031     decrementl(dst);
2032   }
2033   bind(L);
2034 }
2035 
2036 void MacroAssembler::fld_d(AddressLiteral src) {
2037   fld_d(as_Address(src));
2038 }
2039 
2040 void MacroAssembler::fld_s(AddressLiteral src) {
2041   fld_s(as_Address(src));
2042 }
2043 
2044 void MacroAssembler::fldcw(AddressLiteral src) {
2045   Assembler::fldcw(as_Address(src));
2046 }
2047 
2048 void MacroAssembler::fpop() {
2049   ffree();
2050   fincstp();
2051 }
2052 
2053 void MacroAssembler::fremr(Register tmp) {
2054   save_rax(tmp);
2055   { Label L;
2056     bind(L);
2057     fprem();
2058     fwait(); fnstsw_ax();
2059     sahf();
2060     jcc(Assembler::parity, L);
2061   }
2062   restore_rax(tmp);
2063   // Result is in ST0.
2064   // Note: fxch & fpop to get rid of ST1
2065   // (otherwise FPU stack could overflow eventually)
2066   fxch(1);
2067   fpop();
2068 }
2069 
2070 void MacroAssembler::empty_FPU_stack() {
2071   if (VM_Version::supports_mmx()) {
2072     emms();
2073   } else {
2074     for (int i = 8; i-- > 0; ) ffree(i);
2075   }
2076 }
2077 #endif // !LP64
2078 
2079 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2080   if (reachable(src)) {
2081     Assembler::mulpd(dst, as_Address(src));
2082   } else {
2083     lea(rscratch1, src);
2084     Assembler::mulpd(dst, Address(rscratch1, 0));
2085   }
2086 }
2087 
2088 void MacroAssembler::load_float(Address src) {
2089 #ifdef _LP64
2090   movflt(xmm0, src);
2091 #else
2092   if (UseSSE >= 1) {
2093     movflt(xmm0, src);
2094   } else {
2095     fld_s(src);
2096   }
2097 #endif // LP64
2098 }
2099 
2100 void MacroAssembler::store_float(Address dst) {
2101 #ifdef _LP64
2102   movflt(dst, xmm0);
2103 #else
2104   if (UseSSE >= 1) {
2105     movflt(dst, xmm0);
2106   } else {
2107     fstp_s(dst);
2108   }
2109 #endif // LP64
2110 }
2111 
2112 void MacroAssembler::load_double(Address src) {
2113 #ifdef _LP64
2114   movdbl(xmm0, src);
2115 #else
2116   if (UseSSE >= 2) {
2117     movdbl(xmm0, src);
2118   } else {
2119     fld_d(src);
2120   }
2121 #endif // LP64
2122 }
2123 
2124 void MacroAssembler::store_double(Address dst) {
2125 #ifdef _LP64
2126   movdbl(dst, xmm0);
2127 #else
2128   if (UseSSE >= 2) {
2129     movdbl(dst, xmm0);
2130   } else {
2131     fstp_d(dst);
2132   }
2133 #endif // LP64
2134 }
2135 
2136 // dst = c = a * b + c
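// (vfmadd231 accumulates into its first operand, c, so when the caller wants
// the result in a different register we copy c into dst afterwards; the
// vector variants below follow the same pattern.)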
2137 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2138   Assembler::vfmadd231sd(c, a, b);
2139   if (dst != c) {
2140     movdbl(dst, c);
2141   }
2142 }
2143 
2144 // dst = c = a * b + c
2145 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2146   Assembler::vfmadd231ss(c, a, b);
2147   if (dst != c) {
2148     movflt(dst, c);
2149   }
2150 }
2151 
2152 // dst = c = a * b + c
2153 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2154   Assembler::vfmadd231pd(c, a, b, vector_len);
2155   if (dst != c) {
2156     vmovdqu(dst, c);
2157   }
2158 }
2159 
2160 // dst = c = a * b + c
2161 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2162   Assembler::vfmadd231ps(c, a, b, vector_len);
2163   if (dst != c) {
2164     vmovdqu(dst, c);
2165   }
2166 }
2167 
2168 // dst = c = a * b + c
2169 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2170   Assembler::vfmadd231pd(c, a, b, vector_len);
2171   if (dst != c) {
2172     vmovdqu(dst, c);
2173   }
2174 }
2175 
2176 // dst = c = a * b + c
2177 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2178   Assembler::vfmadd231ps(c, a, b, vector_len);
2179   if (dst != c) {
2180     vmovdqu(dst, c);
2181   }
2182 }
2183 
2184 void MacroAssembler::incrementl(AddressLiteral dst) {
2185   if (reachable(dst)) {
2186     incrementl(as_Address(dst));
2187   } else {
2188     lea(rscratch1, dst);
2189     incrementl(Address(rscratch1, 0));
2190   }
2191 }
2192 
2193 void MacroAssembler::incrementl(ArrayAddress dst) {
2194   incrementl(as_Address(dst));
2195 }
2196 
2197 void MacroAssembler::incrementl(Register reg, int value) {
2198   if (value == min_jint) {addl(reg, value) ; return; }
2199   if (value <  0) { decrementl(reg, -value); return; }
2200   if (value == 0) {                        ; return; }
2201   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2202   /* else */      { addl(reg, value)       ; return; }
2203 }
2204 
2205 void MacroAssembler::incrementl(Address dst, int value) {
2206   if (value == min_jint) {addl(dst, value) ; return; }
2207   if (value <  0) { decrementl(dst, -value); return; }
2208   if (value == 0) {                        ; return; }
2209   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2210   /* else */      { addl(dst, value)       ; return; }
2211 }
2212 
2213 void MacroAssembler::jump(AddressLiteral dst) {
2214   if (reachable(dst)) {
2215     jmp_literal(dst.target(), dst.rspec());
2216   } else {
2217     lea(rscratch1, dst);
2218     jmp(rscratch1);
2219   }
2220 }
2221 
2222 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2223   if (reachable(dst)) {
2224     InstructionMark im(this);
2225     relocate(dst.reloc());
2226     const int short_size = 2;
2227     const int long_size = 6;
2228     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2229     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2230       // 0111 tttn #8-bit disp
2231       emit_int8(0x70 | cc);
2232       emit_int8((offs - short_size) & 0xFF);
2233     } else {
2234       // 0000 1111 1000 tttn #32-bit disp
2235       emit_int8(0x0F);
2236       emit_int8((unsigned char)(0x80 | cc));
2237       emit_int32(offs - long_size);
2238     }
2239   } else {
2240 #ifdef ASSERT
2241     warning("reversing conditional branch");
2242 #endif /* ASSERT */
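    // The target may be out of range of a 32-bit relative branch, so invert
    // the condition and skip over an indirect jump through rscratch1, which
    // can always reach the destination.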
2243     Label skip;
2244     jccb(reverse[cc], skip);
2245     lea(rscratch1, dst);
2246     Assembler::jmp(rscratch1);
2247     bind(skip);
2248   }
2249 }
2250 
2251 void MacroAssembler::fld_x(AddressLiteral src) {
2252   Assembler::fld_x(as_Address(src));
2253 }
2254 
2255 void MacroAssembler::ldmxcsr(AddressLiteral src, Register scratchReg) {
2256   if (reachable(src)) {
2257     Assembler::ldmxcsr(as_Address(src));
2258   } else {
2259     lea(scratchReg, src);
2260     Assembler::ldmxcsr(Address(scratchReg, 0));
2261   }
2262 }
2263 
2264 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2265   int off;
2266   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2267     off = offset();
2268     movsbl(dst, src); // movsxb
2269   } else {
2270     off = load_unsigned_byte(dst, src);
2271     shll(dst, 24);
2272     sarl(dst, 24);
2273   }
2274   return off;
2275 }
2276 
2277 // Note: load_signed_short used to be called load_signed_word.
2278 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2279 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2280 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2281 int MacroAssembler::load_signed_short(Register dst, Address src) {
2282   int off;
2283   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious, since it seems safe to do a signed 16 => 64 bit
    // sign extension here, but this is what 64-bit has always done; it
    // implies that callers only rely on the low 32 bits.
2287     off = offset();
2288     movswl(dst, src); // movsxw
2289   } else {
2290     off = load_unsigned_short(dst, src);
2291     shll(dst, 16);
2292     sarl(dst, 16);
2293   }
2294   return off;
2295 }
2296 
2297 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
  // and "3.9 Partial Register Penalties" (p. 22).
2300   int off;
2301   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2302     off = offset();
2303     movzbl(dst, src); // movzxb
2304   } else {
2305     xorl(dst, dst);
2306     off = offset();
2307     movb(dst, src);
2308   }
2309   return off;
2310 }
2311 
2312 // Note: load_unsigned_short used to be called load_unsigned_word.
2313 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
  // and "3.9 Partial Register Penalties" (p. 22).
2316   int off;
2317   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2318     off = offset();
2319     movzwl(dst, src); // movzxw
2320   } else {
2321     xorl(dst, dst);
2322     off = offset();
2323     movw(dst, src);
2324   }
2325   return off;
2326 }
2327 
2328 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2329   switch (size_in_bytes) {
2330 #ifndef _LP64
2331   case  8:
2332     assert(dst2 != noreg, "second dest register required");
2333     movl(dst,  src);
2334     movl(dst2, src.plus_disp(BytesPerInt));
2335     break;
2336 #else
2337   case  8:  movq(dst, src); break;
2338 #endif
2339   case  4:  movl(dst, src); break;
2340   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2341   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2342   default:  ShouldNotReachHere();
2343   }
2344 }
2345 
2346 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2347   switch (size_in_bytes) {
2348 #ifndef _LP64
2349   case  8:
2350     assert(src2 != noreg, "second source register required");
2351     movl(dst,                        src);
2352     movl(dst.plus_disp(BytesPerInt), src2);
2353     break;
2354 #else
2355   case  8:  movq(dst, src); break;
2356 #endif
2357   case  4:  movl(dst, src); break;
2358   case  2:  movw(dst, src); break;
2359   case  1:  movb(dst, src); break;
2360   default:  ShouldNotReachHere();
2361   }
2362 }
2363 
2364 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2365   if (reachable(dst)) {
2366     movl(as_Address(dst), src);
2367   } else {
2368     lea(rscratch1, dst);
2369     movl(Address(rscratch1, 0), src);
2370   }
2371 }
2372 
2373 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2374   if (reachable(src)) {
2375     movl(dst, as_Address(src));
2376   } else {
2377     lea(rscratch1, src);
2378     movl(dst, Address(rscratch1, 0));
2379   }
2380 }
2381 
2382 // C++ bool manipulation
2383 
void MacroAssembler::movbool(Register dst, Address src) {
  if (sizeof(bool) == 1)
    movb(dst, src);
  else if (sizeof(bool) == 2)
    movw(dst, src);
  else if (sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbool(Address dst, bool boolconst) {
  if (sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if (sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if (sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbool(Address dst, Register src) {
  if (sizeof(bool) == 1)
    movb(dst, src);
  else if (sizeof(bool) == 2)
    movw(dst, src);
  else if (sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
2419 
2420 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2421   movb(as_Address(dst), src);
2422 }
2423 
2424 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2425   if (reachable(src)) {
2426     movdl(dst, as_Address(src));
2427   } else {
2428     lea(rscratch1, src);
2429     movdl(dst, Address(rscratch1, 0));
2430   }
2431 }
2432 
2433 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2434   if (reachable(src)) {
2435     movq(dst, as_Address(src));
2436   } else {
2437     lea(rscratch1, src);
2438     movq(dst, Address(rscratch1, 0));
2439   }
2440 }
2441 
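// Load a double constant: movsd zeroes the upper half of the destination
// (breaking any dependency on its previous contents), while movlpd merges
// into it, hence the UseXmmLoadAndClearUpper switch below.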
2442 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2443   if (reachable(src)) {
2444     if (UseXmmLoadAndClearUpper) {
2445       movsd (dst, as_Address(src));
2446     } else {
2447       movlpd(dst, as_Address(src));
2448     }
2449   } else {
2450     lea(rscratch1, src);
2451     if (UseXmmLoadAndClearUpper) {
2452       movsd (dst, Address(rscratch1, 0));
2453     } else {
2454       movlpd(dst, Address(rscratch1, 0));
2455     }
2456   }
2457 }
2458 
2459 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2460   if (reachable(src)) {
2461     movss(dst, as_Address(src));
2462   } else {
2463     lea(rscratch1, src);
2464     movss(dst, Address(rscratch1, 0));
2465   }
2466 }
2467 
2468 void MacroAssembler::movptr(Register dst, Register src) {
2469   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2470 }
2471 
2472 void MacroAssembler::movptr(Register dst, Address src) {
2473   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2474 }
2475 
2476 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2477 void MacroAssembler::movptr(Register dst, intptr_t src) {
2478   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2479 }
2480 
2481 void MacroAssembler::movptr(Address dst, Register src) {
2482   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2483 }
2484 
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

void MacroAssembler::movdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2499 
2500 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2501   if (reachable(src)) {
2502     movdqu(dst, as_Address(src));
2503   } else {
2504     lea(scratchReg, src);
2505     movdqu(dst, Address(scratchReg, 0));
2506   }
2507 }
2508 
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2523 
2524 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2525   if (reachable(src)) {
2526     vmovdqu(dst, as_Address(src));
2527   }
2528   else {
2529     lea(scratch_reg, src);
2530     vmovdqu(dst, Address(scratch_reg, 0));
2531   }
2532 }
2533 
2534 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) {
2535   assert(vector_len <= AVX_256bit, "AVX2 vector length");
2536   if (vector_len == AVX_256bit) {
2537     vmovdqu(dst, src, scratch_reg);
2538   } else {
2539     movdqu(dst, src, scratch_reg);
2540   }
2541 }
2542 
2543 void MacroAssembler::kmov(KRegister dst, Address src) {
2544   if (VM_Version::supports_avx512bw()) {
2545     kmovql(dst, src);
2546   } else {
2547     assert(VM_Version::supports_evex(), "");
2548     kmovwl(dst, src);
2549   }
2550 }
2551 
2552 void MacroAssembler::kmov(Address dst, KRegister src) {
2553   if (VM_Version::supports_avx512bw()) {
2554     kmovql(dst, src);
2555   } else {
2556     assert(VM_Version::supports_evex(), "");
2557     kmovwl(dst, src);
2558   }
2559 }
2560 
2561 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2562   if (VM_Version::supports_avx512bw()) {
2563     kmovql(dst, src);
2564   } else {
2565     assert(VM_Version::supports_evex(), "");
2566     kmovwl(dst, src);
2567   }
2568 }
2569 
2570 void MacroAssembler::kmov(Register dst, KRegister src) {
2571   if (VM_Version::supports_avx512bw()) {
2572     kmovql(dst, src);
2573   } else {
2574     assert(VM_Version::supports_evex(), "");
2575     kmovwl(dst, src);
2576   }
2577 }
2578 
2579 void MacroAssembler::kmov(KRegister dst, Register src) {
2580   if (VM_Version::supports_avx512bw()) {
2581     kmovql(dst, src);
2582   } else {
2583     assert(VM_Version::supports_evex(), "");
2584     kmovwl(dst, src);
2585   }
2586 }
2587 
2588 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2589   if (reachable(src)) {
2590     kmovql(dst, as_Address(src));
2591   } else {
2592     lea(scratch_reg, src);
2593     kmovql(dst, Address(scratch_reg, 0));
2594   }
2595 }
2596 
2597 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2598   if (reachable(src)) {
2599     kmovwl(dst, as_Address(src));
2600   } else {
2601     lea(scratch_reg, src);
2602     kmovwl(dst, Address(scratch_reg, 0));
2603   }
2604 }
2605 
2606 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2607                                int vector_len, Register scratch_reg) {
2608   if (reachable(src)) {
2609     Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2610   } else {
2611     lea(scratch_reg, src);
2612     Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2613   }
2614 }
2615 
2616 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2617                                int vector_len, Register scratch_reg) {
2618   if (reachable(src)) {
2619     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2620   } else {
2621     lea(scratch_reg, src);
2622     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2623   }
2624 }
2625 
2626 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2627                                int vector_len, Register scratch_reg) {
2628   if (reachable(src)) {
2629     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2630   } else {
2631     lea(scratch_reg, src);
2632     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2633   }
2634 }
2635 
2636 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2637                                int vector_len, Register scratch_reg) {
2638   if (reachable(src)) {
2639     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2640   } else {
2641     lea(scratch_reg, src);
2642     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2643   }
2644 }
2645 
2646 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2647   if (reachable(src)) {
2648     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2649   } else {
2650     lea(rscratch, src);
2651     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2652   }
2653 }
2654 
2655 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2656   if (reachable(src)) {
2657     Assembler::movdqa(dst, as_Address(src));
2658   } else {
2659     lea(rscratch1, src);
2660     Assembler::movdqa(dst, Address(rscratch1, 0));
2661   }
2662 }
2663 
2664 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2665   if (reachable(src)) {
2666     Assembler::movsd(dst, as_Address(src));
2667   } else {
2668     lea(rscratch1, src);
2669     Assembler::movsd(dst, Address(rscratch1, 0));
2670   }
2671 }
2672 
2673 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2674   if (reachable(src)) {
2675     Assembler::movss(dst, as_Address(src));
2676   } else {
2677     lea(rscratch1, src);
2678     Assembler::movss(dst, Address(rscratch1, 0));
2679   }
2680 }
2681 
2682 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2683   if (reachable(src)) {
2684     Assembler::vmovddup(dst, as_Address(src), vector_len);
2685   } else {
2686     lea(rscratch, src);
2687     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2688   }
2689 }
2690 
2691 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2692   if (reachable(src)) {
2693     Assembler::mulsd(dst, as_Address(src));
2694   } else {
2695     lea(rscratch1, src);
2696     Assembler::mulsd(dst, Address(rscratch1, 0));
2697   }
2698 }
2699 
2700 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2701   if (reachable(src)) {
2702     Assembler::mulss(dst, as_Address(src));
2703   } else {
2704     lea(rscratch1, src);
2705     Assembler::mulss(dst, Address(rscratch1, 0));
2706   }
2707 }
2708 
2709 void MacroAssembler::null_check(Register reg, int offset) {
2710   if (needs_explicit_null_check(offset)) {
2711     // provoke OS NULL exception if reg = NULL by
2712     // accessing M[reg] w/o changing any (non-CC) registers
2713     // NOTE: cmpl is plenty here to provoke a segv
2714     cmpptr(rax, Address(reg, 0));
2715     // Note: should probably use testl(rax, Address(reg, 0));
2716     //       may be shorter code (however, this version of
2717     //       testl needs to be implemented first)
2718   } else {
2719     // nothing to do, (later) access of M[reg + offset]
2720     // will provoke OS NULL exception if reg = NULL
2721   }
2722 }
2723 
2724 void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2726   // (e.g., MSVC can't call ps() otherwise)
2727   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2728 }
2729 
2730 void MacroAssembler::unimplemented(const char* what) {
2731   const char* buf = NULL;
2732   {
2733     ResourceMark rm;
2734     stringStream ss;
2735     ss.print("unimplemented: %s", what);
2736     buf = code_string(ss.as_string());
2737   }
2738   stop(buf);
2739 }
2740 
2741 #ifdef _LP64
2742 #define XSTATE_BV 0x200
2743 #endif
2744 
2745 void MacroAssembler::pop_CPU_state() {
2746   pop_FPU_state();
2747   pop_IU_state();
2748 }
2749 
2750 void MacroAssembler::pop_FPU_state() {
2751 #ifndef _LP64
2752   frstor(Address(rsp, 0));
2753 #else
2754   fxrstor(Address(rsp, 0));
2755 #endif
2756   addptr(rsp, FPUStateSizeInWords * wordSize);
2757 }
2758 
2759 void MacroAssembler::pop_IU_state() {
2760   popa();
2761   LP64_ONLY(addq(rsp, 8));
2762   popf();
2763 }
2764 
2765 // Save Integer and Float state
2766 // Warning: Stack must be 16 byte aligned (64bit)
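// (fxsave, used by push_FPU_state on 64-bit, requires a 16-byte aligned
//  memory operand; push_IU_state adjusts rsp to keep that alignment.)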
2767 void MacroAssembler::push_CPU_state() {
2768   push_IU_state();
2769   push_FPU_state();
2770 }
2771 
2772 void MacroAssembler::push_FPU_state() {
2773   subptr(rsp, FPUStateSizeInWords * wordSize);
2774 #ifndef _LP64
2775   fnsave(Address(rsp, 0));
2776   fwait();
2777 #else
2778   fxsave(Address(rsp, 0));
2779 #endif // LP64
2780 }
2781 
2782 void MacroAssembler::push_IU_state() {
2783   // Push flags first because pusha kills them
2784   pushf();
2785   // Make sure rsp stays 16-byte aligned
2786   LP64_ONLY(subq(rsp, 8));
2787   pusha();
2788 }
2789 
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
2791   if (!java_thread->is_valid()) {
2792     java_thread = rdi;
2793     get_thread(java_thread);
2794   }
2795   // we must set sp to zero to clear frame
2796   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2797   // must clear fp, so that compiled frames are not confused; it is
2798   // possible that we need it only for debugging
2799   if (clear_fp) {
2800     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2801   }
2802   // Always clear the pc because it could have been set by make_walkable()
2803   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2804   vzeroupper();
2805 }
2806 
2807 void MacroAssembler::restore_rax(Register tmp) {
2808   if (tmp == noreg) pop(rax);
2809   else if (tmp != rax) mov(rax, tmp);
2810 }
2811 
2812 void MacroAssembler::round_to(Register reg, int modulus) {
2813   addptr(reg, modulus - 1);
2814   andptr(reg, -modulus);
2815 }
2816 
2817 void MacroAssembler::save_rax(Register tmp) {
2818   if (tmp == noreg) push(rax);
2819   else if (tmp != rax) mov(tmp, rax);
2820 }
2821 
2822 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
2823   if (at_return) {
2824     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2825     // we may safely use rsp instead to perform the stack watermark check.
2826     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
2827     jcc(Assembler::above, slow_path);
2828     return;
2829   }
2830   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2831   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2832 }
2833 
2834 // Calls to C land
2835 //
// When entering C land, the rbp & rsp of the last Java frame have to be recorded
2837 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2838 // has to be reset to 0. This is required to allow proper stack traversal.
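//
// A typical sequence, as in call_VM_base above:
//
//   set_last_Java_frame(thread, last_java_sp, rbp, NULL);  // record anchor
//   call_VM_leaf_base(entry_point, number_of_arguments);   // call into C land
//   reset_last_Java_frame(thread, true);                   // clear sp/fp/pc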
2839 void MacroAssembler::set_last_Java_frame(Register java_thread,
2840                                          Register last_java_sp,
2841                                          Register last_java_fp,
2842                                          address  last_java_pc) {
2843   vzeroupper();
2844   // determine java_thread register
2845   if (!java_thread->is_valid()) {
2846     java_thread = rdi;
2847     get_thread(java_thread);
2848   }
2849   // determine last_java_sp register
2850   if (!last_java_sp->is_valid()) {
2851     last_java_sp = rsp;
2852   }
2853 
2854   // last_java_fp is optional
2855 
2856   if (last_java_fp->is_valid()) {
2857     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2858   }
2859 
2860   // last_java_pc is optional
2861 
2862   if (last_java_pc != NULL) {
2863     lea(Address(java_thread,
2864                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
2865         InternalAddress(last_java_pc));
2866 
2867   }
2868   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2869 }
2870 
2871 void MacroAssembler::shlptr(Register dst, int imm8) {
2872   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
2873 }
2874 
2875 void MacroAssembler::shrptr(Register dst, int imm8) {
2876   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
2877 }
2878 
2879 void MacroAssembler::sign_extend_byte(Register reg) {
2880   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
2881     movsbl(reg, reg); // movsxb
2882   } else {
2883     shll(reg, 24);
2884     sarl(reg, 24);
2885   }
2886 }
2887 
2888 void MacroAssembler::sign_extend_short(Register reg) {
2889   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2890     movswl(reg, reg); // movsxw
2891   } else {
2892     shll(reg, 16);
2893     sarl(reg, 16);
2894   }
2895 }
2896 
2897 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2898   assert(reachable(src), "Address should be reachable");
2899   testl(dst, as_Address(src));
2900 }
2901 
2902 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2903   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2904   Assembler::pcmpeqb(dst, src);
2905 }
2906 
2907 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2908   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2909   Assembler::pcmpeqw(dst, src);
2910 }
2911 
2912 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2913   assert((dst->encoding() < 16),"XMM register should be 0-15");
2914   Assembler::pcmpestri(dst, src, imm8);
2915 }
2916 
2917 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2918   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2919   Assembler::pcmpestri(dst, src, imm8);
2920 }
2921 
2922 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2923   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2924   Assembler::pmovzxbw(dst, src);
2925 }
2926 
2927 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2928   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2929   Assembler::pmovzxbw(dst, src);
2930 }
2931 
2932 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2933   assert((src->encoding() < 16),"XMM register should be 0-15");
2934   Assembler::pmovmskb(dst, src);
2935 }
2936 
2937 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2938   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2939   Assembler::ptest(dst, src);
2940 }
2941 
2942 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
2943   if (reachable(src)) {
2944     Assembler::sqrtsd(dst, as_Address(src));
2945   } else {
2946     lea(rscratch1, src);
2947     Assembler::sqrtsd(dst, Address(rscratch1, 0));
2948   }
2949 }
2950 
2951 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
2952   if (reachable(src)) {
2953     Assembler::sqrtss(dst, as_Address(src));
2954   } else {
2955     lea(rscratch1, src);
2956     Assembler::sqrtss(dst, Address(rscratch1, 0));
2957   }
2958 }
2959 
2960 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
2961   if (reachable(src)) {
2962     Assembler::subsd(dst, as_Address(src));
2963   } else {
2964     lea(rscratch1, src);
2965     Assembler::subsd(dst, Address(rscratch1, 0));
2966   }
2967 }
2968 
2969 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
2970   if (reachable(src)) {
2971     Assembler::roundsd(dst, as_Address(src), rmode);
2972   } else {
2973     lea(scratch_reg, src);
2974     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
2975   }
2976 }
2977 
2978 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
2979   if (reachable(src)) {
2980     Assembler::subss(dst, as_Address(src));
2981   } else {
2982     lea(rscratch1, src);
2983     Assembler::subss(dst, Address(rscratch1, 0));
2984   }
2985 }
2986 
2987 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
2988   if (reachable(src)) {
2989     Assembler::ucomisd(dst, as_Address(src));
2990   } else {
2991     lea(rscratch1, src);
2992     Assembler::ucomisd(dst, Address(rscratch1, 0));
2993   }
2994 }
2995 
2996 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
2997   if (reachable(src)) {
2998     Assembler::ucomiss(dst, as_Address(src));
2999   } else {
3000     lea(rscratch1, src);
3001     Assembler::ucomiss(dst, Address(rscratch1, 0));
3002   }
3003 }
3004 
3005 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3006   // Used in sign-bit flipping with aligned address.
3007   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3008   if (reachable(src)) {
3009     Assembler::xorpd(dst, as_Address(src));
3010   } else {
3011     lea(scratch_reg, src);
3012     Assembler::xorpd(dst, Address(scratch_reg, 0));
3013   }
3014 }
3015 
3016 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3017   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3018     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3019   }
3020   else {
3021     Assembler::xorpd(dst, src);
3022   }
3023 }
3024 
3025 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3026   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3027     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3028   } else {
3029     Assembler::xorps(dst, src);
3030   }
3031 }
3032 
3033 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3034   // Used in sign-bit flipping with aligned address.
3035   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3036   if (reachable(src)) {
3037     Assembler::xorps(dst, as_Address(src));
3038   } else {
3039     lea(scratch_reg, src);
3040     Assembler::xorps(dst, Address(scratch_reg, 0));
3041   }
3042 }
3043 
3044 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
  // Used with constant shuffle masks at aligned addresses; the SSE form requires 16-byte alignment.
3046   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3047   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3048   if (reachable(src)) {
3049     Assembler::pshufb(dst, as_Address(src));
3050   } else {
3051     lea(rscratch1, src);
3052     Assembler::pshufb(dst, Address(rscratch1, 0));
3053   }
3054 }
3055 
3056 // AVX 3-operands instructions
3057 
3058 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3059   if (reachable(src)) {
3060     vaddsd(dst, nds, as_Address(src));
3061   } else {
3062     lea(rscratch1, src);
3063     vaddsd(dst, nds, Address(rscratch1, 0));
3064   }
3065 }
3066 
3067 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3068   if (reachable(src)) {
3069     vaddss(dst, nds, as_Address(src));
3070   } else {
3071     lea(rscratch1, src);
3072     vaddss(dst, nds, Address(rscratch1, 0));
3073   }
3074 }
3075 
3076 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3077   assert(UseAVX > 0, "requires some form of AVX");
3078   if (reachable(src)) {
3079     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3080   } else {
3081     lea(rscratch, src);
3082     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3083   }
3084 }
3085 
3086 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3087   assert(UseAVX > 0, "requires some form of AVX");
3088   if (reachable(src)) {
3089     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3090   } else {
3091     lea(rscratch, src);
3092     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3093   }
3094 }
3095 
3096 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3097   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3098   vandps(dst, nds, negate_field, vector_len);
3099 }
3100 
3101 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3102   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3103   vandpd(dst, nds, negate_field, vector_len);
3104 }
3105 
3106 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3107   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3108   Assembler::vpaddb(dst, nds, src, vector_len);
3109 }
3110 
3111 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3112   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3113   Assembler::vpaddb(dst, nds, src, vector_len);
3114 }
3115 
3116 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3117   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3118   Assembler::vpaddw(dst, nds, src, vector_len);
3119 }
3120 
3121 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3122   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3123   Assembler::vpaddw(dst, nds, src, vector_len);
3124 }
3125 
3126 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3127   if (reachable(src)) {
3128     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3129   } else {
3130     lea(scratch_reg, src);
3131     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3132   }
3133 }
3134 
3135 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3136   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3137   Assembler::vpbroadcastw(dst, src, vector_len);
3138 }
3139 
3140 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3141   if (reachable(src)) {
3142     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3143   } else {
3144     lea(rscratch, src);
3145     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3146   }
3147 }
3148 
3149 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3150   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3151   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3152 }
3153 
3154 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3155   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3156   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3157 }
3158 
3159 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3160                                AddressLiteral src, int vector_len, Register scratch_reg) {
3161   if (reachable(src)) {
3162     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3163   } else {
3164     lea(scratch_reg, src);
3165     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3166   }
3167 }
3168 
3169 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3170                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3171   if (reachable(src)) {
3172     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3173   } else {
3174     lea(scratch_reg, src);
3175     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3176   }
3177 }
3178 
3179 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3180                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3181   if (reachable(src)) {
3182     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3183   } else {
3184     lea(scratch_reg, src);
3185     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3186   }
3187 }
3188 
3189 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3190                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3191   if (reachable(src)) {
3192     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3193   } else {
3194     lea(scratch_reg, src);
3195     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3196   }
3197 }
3198 
3199 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3200                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3201   if (reachable(src)) {
3202     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3203   } else {
3204     lea(scratch_reg, src);
3205     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3206   }
3207 }
3208 
3209 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3210   if (width == Assembler::Q) {
3211     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3212   } else {
3213     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3214   }
3215 }
3216 
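// Emulate the full set of packed integer compare predicates on top of the
// pcmpeq/pcmpgt encodings: predicates that AVX lacks directly (neq, le, nlt) are
// built by comparing with the operands swapped where needed and then inverting the
// result via XOR with an all-ones vector; xtmp is clobbered in those cases.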
3217 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3218   int eq_cond_enc = 0x29;
3219   int gt_cond_enc = 0x37;
3220   if (width != Assembler::Q) {
3221     eq_cond_enc = 0x74 + width;
3222     gt_cond_enc = 0x64 + width;
3223   }
3224   switch (cond) {
3225   case eq:
3226     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3227     break;
3228   case neq:
3229     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3230     vallones(xtmp, vector_len);
3231     vpxor(dst, xtmp, dst, vector_len);
3232     break;
3233   case le:
3234     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3235     vallones(xtmp, vector_len);
3236     vpxor(dst, xtmp, dst, vector_len);
3237     break;
3238   case nlt:
3239     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3240     vallones(xtmp, vector_len);
3241     vpxor(dst, xtmp, dst, vector_len);
3242     break;
3243   case lt:
3244     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3245     break;
3246   case nle:
3247     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3248     break;
3249   default:
3250     assert(false, "Should not reach here");
3251   }
3252 }
3253 
3254 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3255   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3256   Assembler::vpmovzxbw(dst, src, vector_len);
3257 }
3258 
3259 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3260   assert((src->encoding() < 16),"XMM register should be 0-15");
3261   Assembler::vpmovmskb(dst, src, vector_len);
3262 }
3263 
3264 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3265   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3266   Assembler::vpmullw(dst, nds, src, vector_len);
3267 }
3268 
3269 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3270   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3271   Assembler::vpmullw(dst, nds, src, vector_len);
3272 }
3273 
3274 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3275   assert((UseAVX > 0), "AVX support is needed");
3276   if (reachable(src)) {
3277     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3278   } else {
3279     lea(scratch_reg, src);
3280     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3281   }
3282 }
3283 
3284 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3285   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3286   Assembler::vpsubb(dst, nds, src, vector_len);
3287 }
3288 
3289 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3290   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3291   Assembler::vpsubb(dst, nds, src, vector_len);
3292 }
3293 
3294 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3295   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3296   Assembler::vpsubw(dst, nds, src, vector_len);
3297 }
3298 
3299 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3300   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3301   Assembler::vpsubw(dst, nds, src, vector_len);
3302 }
3303 
3304 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3305   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3306   Assembler::vpsraw(dst, nds, shift, vector_len);
3307 }
3308 
3309 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3310   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3311   Assembler::vpsraw(dst, nds, shift, vector_len);
3312 }
3313 
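// Arithmetic right shift of packed quadwords is EVEX-only; without AVX512VL the
// 128/256-bit forms cannot be encoded, so the operation is widened to 512 bits
// (the upper lanes are simply shifted as well).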
3314 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3315   assert(UseAVX > 2, "requires AVX-512");
3316   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3317      vector_len = 2;
3318   }
3319   Assembler::evpsraq(dst, nds, shift, vector_len);
3320 }
3321 
3322 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3323   assert(UseAVX > 2, "requires AVX-512");
3324   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3325      vector_len = 2;
3326   }
3327   Assembler::evpsraq(dst, nds, shift, vector_len);
3328 }
3329 
3330 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3331   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3332   Assembler::vpsrlw(dst, nds, shift, vector_len);
3333 }
3334 
3335 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3336   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3337   Assembler::vpsrlw(dst, nds, shift, vector_len);
3338 }
3339 
3340 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3341   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3342   Assembler::vpsllw(dst, nds, shift, vector_len);
3343 }
3344 
3345 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3346   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3347   Assembler::vpsllw(dst, nds, shift, vector_len);
3348 }
3349 
3350 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3351   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3352   Assembler::vptest(dst, src);
3353 }
3354 
3355 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3356   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3357   Assembler::punpcklbw(dst, src);
3358 }
3359 
3360 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3361   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3362   Assembler::pshufd(dst, src, mode);
3363 }
3364 
3365 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3366   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3367   Assembler::pshuflw(dst, src, mode);
3368 }
3369 
3370 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3371   if (reachable(src)) {
3372     vandpd(dst, nds, as_Address(src), vector_len);
3373   } else {
3374     lea(scratch_reg, src);
3375     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3376   }
3377 }
3378 
3379 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3380   if (reachable(src)) {
3381     vandps(dst, nds, as_Address(src), vector_len);
3382   } else {
3383     lea(scratch_reg, src);
3384     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3385   }
3386 }
3387 
3388 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3389                             bool merge, int vector_len, Register scratch_reg) {
3390   if (reachable(src)) {
3391     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3392   } else {
3393     lea(scratch_reg, src);
3394     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3395   }
3396 }
3397 
3398 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3399   if (reachable(src)) {
3400     vdivsd(dst, nds, as_Address(src));
3401   } else {
3402     lea(rscratch1, src);
3403     vdivsd(dst, nds, Address(rscratch1, 0));
3404   }
3405 }
3406 
3407 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3408   if (reachable(src)) {
3409     vdivss(dst, nds, as_Address(src));
3410   } else {
3411     lea(rscratch1, src);
3412     vdivss(dst, nds, Address(rscratch1, 0));
3413   }
3414 }
3415 
3416 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3417   if (reachable(src)) {
3418     vmulsd(dst, nds, as_Address(src));
3419   } else {
3420     lea(rscratch1, src);
3421     vmulsd(dst, nds, Address(rscratch1, 0));
3422   }
3423 }
3424 
3425 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3426   if (reachable(src)) {
3427     vmulss(dst, nds, as_Address(src));
3428   } else {
3429     lea(rscratch1, src);
3430     vmulss(dst, nds, Address(rscratch1, 0));
3431   }
3432 }
3433 
3434 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3435   if (reachable(src)) {
3436     vsubsd(dst, nds, as_Address(src));
3437   } else {
3438     lea(rscratch1, src);
3439     vsubsd(dst, nds, Address(rscratch1, 0));
3440   }
3441 }
3442 
3443 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3444   if (reachable(src)) {
3445     vsubss(dst, nds, as_Address(src));
3446   } else {
3447     lea(rscratch1, src);
3448     vsubss(dst, nds, Address(rscratch1, 0));
3449   }
3450 }
3451 
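// vnegatess/vnegatesd: scalar negation by XOR with a mask constant passed in via
// 'src' (the caller supplies the appropriate float/double sign-bit mask).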
3452 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3453   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3454   vxorps(dst, nds, src, Assembler::AVX_128bit);
3455 }
3456 
3457 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3458   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3459   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3460 }
3461 
3462 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3463   if (reachable(src)) {
3464     vxorpd(dst, nds, as_Address(src), vector_len);
3465   } else {
3466     lea(scratch_reg, src);
3467     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3468   }
3469 }
3470 
3471 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3472   if (reachable(src)) {
3473     vxorps(dst, nds, as_Address(src), vector_len);
3474   } else {
3475     lea(scratch_reg, src);
3476     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3477   }
3478 }
3479 
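// AVX1 has no 256-bit integer vpxor, so for 256-bit vectors on AVX1-only hardware
// fall back to the bitwise-equivalent floating-point vxorpd encoding.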
3480 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3481   if (UseAVX > 1 || (vector_len < 1)) {
3482     if (reachable(src)) {
3483       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3484     } else {
3485       lea(scratch_reg, src);
3486       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3487     }
3488   }
3489   else {
3490     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3491   }
3492 }
3493 
3494 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3495   if (reachable(src)) {
3496     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3497   } else {
3498     lea(scratch_reg, src);
3499     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3500   }
3501 }
3502 
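// Strip the jweak tag bit from a JNI handle so it can be used as a plain jobject.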
3503 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3504   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3505   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3506   // The inverted mask is sign-extended
3507   andptr(possibly_jweak, inverted_jweak_mask);
3508 }
3509 
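// Resolve a jobject/jweak handle in 'value' to the oop it refers to: NULL passes
// through unchanged, weak handles (tag bit set) are loaded with a phantom-reference
// barrier, and regular handles with a plain IN_NATIVE load.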
3510 void MacroAssembler::resolve_jobject(Register value,
3511                                      Register thread,
3512                                      Register tmp) {
3513   assert_different_registers(value, thread, tmp);
3514   Label done, not_weak;
3515   testptr(value, value);
3516   jcc(Assembler::zero, done);                // Use NULL as-is.
3517   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3518   jcc(Assembler::zero, not_weak);
3519   // Resolve jweak.
3520   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3521                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3522   verify_oop(value);
3523   jmp(done);
3524   bind(not_weak);
3525   // Resolve (untagged) jobject.
3526   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3527   verify_oop(value);
3528   bind(done);
3529 }
3530 
3531 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3532   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3533 }
3534 
3535 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3536 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3537   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3538 }
3539 
3540 void MacroAssembler::subptr(Register dst, Register src) {
3541   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3542 }
3543 
3544 // C++ bool manipulation
3545 void MacroAssembler::testbool(Register dst) {
3546   if (sizeof(bool) == 1)
3547     testb(dst, 0xff);
3548   else if (sizeof(bool) == 2) {
3549     // testw implementation needed for two byte bools
3550     ShouldNotReachHere();
3551   } else if (sizeof(bool) == 4)
3552     testl(dst, dst);
3553   else
3554     // unsupported
3555     ShouldNotReachHere();
3556 }
3557 
3558 void MacroAssembler::testptr(Register dst, Register src) {
3559   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3560 }
3561 
3562 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3563 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3564                                    Register var_size_in_bytes,
3565                                    int con_size_in_bytes,
3566                                    Register t1,
3567                                    Register t2,
3568                                    Label& slow_case) {
3569   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3570   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3571 }
3572 
3573 RegSet MacroAssembler::call_clobbered_gp_registers() {
3574   RegSet regs;
3575 #ifdef _LP64
3576   regs += RegSet::of(rax, rcx, rdx);
3577 #ifndef WINDOWS
3578   regs += RegSet::of(rsi, rdi);
3579 #endif
3580   regs += RegSet::range(r8, r11);
3581 #else
3582   regs += RegSet::of(rax, rcx, rdx);
3583 #endif
3584   return regs;
3585 }
3586 
3587 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3588   int num_xmm_registers = XMMRegisterImpl::available_xmm_registers();
3589 #if defined(WINDOWS) && defined(_LP64)
3590   XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3591   if (num_xmm_registers > 16) {
3592      result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3593   }
3594   return result;
3595 #else
3596   return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3597 #endif
3598 }
3599 
3600 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
3601 
3602 #ifndef _LP64
3603 static bool use_x87_registers() { return UseSSE < 2; }
3604 #endif
3605 static bool use_xmm_registers() { return UseSSE >= 1; }
3606 
3607 // C1 only ever uses the first double/float of the XMM register.
3608 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }
3609 
3610 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3611   if (UseSSE == 1) {
3612     masm->movflt(Address(rsp, offset), reg);
3613   } else {
3614     masm->movdbl(Address(rsp, offset), reg);
3615   }
3616 }
3617 
3618 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3619   if (UseSSE == 1) {
3620     masm->movflt(reg, Address(rsp, offset));
3621   } else {
3622     masm->movdbl(reg, Address(rsp, offset));
3623   }
3624 }
3625 
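// Compute the sizes of the three save areas used by push/pop_call_clobbered_registers:
// general-purpose registers first, then (32-bit only) the x87 FPU state saved by
// fnsave, then one xmm_save_size() slot per XMM register; returns their total.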
3626 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu,
3627                            int& gp_area_size, int& fp_area_size, int& xmm_area_size) {
3628 
3629   gp_area_size = align_up(gp_registers.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size,
3630                          StackAlignmentInBytes);
3631 #ifdef _LP64
3632   fp_area_size = 0;
3633 #else
3634   fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
3635 #endif
3636   xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;
3637 
3638   return gp_area_size + fp_area_size + xmm_area_size;
3639 }
3640 
3641 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3642   block_comment("push_call_clobbered_registers start");
3643   // Regular registers
3644   RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3645 
3646   int gp_area_size;
3647   int fp_area_size;
3648   int xmm_area_size;
3649   int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3650                                                gp_area_size, fp_area_size, xmm_area_size);
3651   subptr(rsp, total_save_size);
3652 
3653   push_set(gp_registers_to_push, 0);
3654 
3655 #ifndef _LP64
3656   if (save_fpu && use_x87_registers()) {
3657     fnsave(Address(rsp, gp_area_size));
3658     fwait();
3659   }
3660 #endif
3661   if (save_fpu && use_xmm_registers()) {
3662     push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
3663   }
3664 
3665   block_comment("push_call_clobbered_registers end");
3666 }
3667 
3668 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3669   block_comment("pop_call_clobbered_registers start");
3670 
3671   RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3672 
3673   int gp_area_size;
3674   int fp_area_size;
3675   int xmm_area_size;
3676   int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3677                                                gp_area_size, fp_area_size, xmm_area_size);
3678 
3679   if (restore_fpu && use_xmm_registers()) {
3680     pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
3681   }
3682 #ifndef _LP64
3683   if (restore_fpu && use_x87_registers()) {
3684     frstor(Address(rsp, gp_area_size));
3685   }
3686 #endif
3687 
3688   pop_set(gp_registers_to_pop, 0);
3689 
3690   addptr(rsp, total_save_size);
3691 
3692   vzeroupper();
3693 
3694   block_comment("pop_call_clobbered_registers end");
3695 }
3696 
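// Spill each XMM register in 'set' to pre-reserved stack space starting at
// [rsp + offset], one xmm_save_size() slot per register.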
3697 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3698   assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3699   int spill_offset = offset;
3700 
3701   for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3702     save_xmm_register(this, spill_offset, *it);
3703     spill_offset += xmm_save_size();
3704   }
3705 }
3706 
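// Reload the XMM registers spilled by push_set(XMMRegSet), walking the set in
// reverse so each register is restored from the slot it was saved to.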
3707 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3708   int restore_size = set.size() * xmm_save_size();
3709   assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3710 
3711   int restore_offset = offset + restore_size - xmm_save_size();
3712 
3713   for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3714     restore_xmm_register(this, restore_offset, *it);
3715     restore_offset -= xmm_save_size();
3716   }
3717 }
3718 
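// Spill the general-purpose registers in 'set'. With offset == -1 this reserves
// (and stack-aligns) its own space; otherwise it stores into caller-reserved space
// starting at [rsp + offset].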
3719 void MacroAssembler::push_set(RegSet set, int offset) {
3720   int spill_offset;
3721   if (offset == -1) {
3722     int register_push_size = set.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3723     int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3724     subptr(rsp, aligned_size);
3725     spill_offset = 0;
3726   } else {
3727     spill_offset = offset;
3728   }
3729 
3730   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3731     movptr(Address(rsp, spill_offset), *it);
3732     spill_offset += RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3733   }
3734 }
3735 
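// Counterpart of push_set(RegSet): restore in reverse order and release the stack
// space only if push_set reserved it itself (offset == -1).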
3736 void MacroAssembler::pop_set(RegSet set, int offset) {
3737 
3738   int gp_reg_size = RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3739   int restore_size = set.size() * gp_reg_size;
3740   int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3741 
3742   int restore_offset;
3743   if (offset == -1) {
3744     restore_offset = restore_size - gp_reg_size;
3745   } else {
3746     restore_offset = offset + restore_size - gp_reg_size;
3747   }
3748   for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3749     movptr(*it, Address(rsp, restore_offset));
3750     restore_offset -= gp_reg_size;
3751   }
3752 
3753   if (offset == -1) {
3754     addptr(rsp, aligned_size);
3755   }
3756 }
3757 
3758 // Defines obj, preserves var_size_in_bytes
3759 void MacroAssembler::eden_allocate(Register thread, Register obj,
3760                                    Register var_size_in_bytes,
3761                                    int con_size_in_bytes,
3762                                    Register t1,
3763                                    Label& slow_case) {
3764   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3765   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3766 }
3767 
3768 // Preserves the contents of address; destroys the contents of length_in_bytes and temp.
3769 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3770   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3771   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3772   Label done;
3773 
3774   testptr(length_in_bytes, length_in_bytes);
3775   jcc(Assembler::zero, done);
3776 
3777   // initialize topmost word, divide index by 2, check if odd and test if zero
3778   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3779 #ifdef ASSERT
3780   {
3781     Label L;
3782     testptr(length_in_bytes, BytesPerWord - 1);
3783     jcc(Assembler::zero, L);
3784     stop("length must be a multiple of BytesPerWord");
3785     bind(L);
3786   }
3787 #endif
3788   Register index = length_in_bytes;
3789   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3790   if (UseIncDec) {
3791     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
3792   } else {
3793     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3794     shrptr(index, 1);
3795   }
3796 #ifndef _LP64
3797   // index may not have been a multiple of 8 (i.e., bit 2 may have been set)
3798   {
3799     Label even;
3800     // note: if index was a multiple of 8, then it cannot be 0 now,
3801     //       because it would already have been 0 before the shift (tested above)
3802     //       => if it is even, we don't need to check for 0 again
3803     jcc(Assembler::carryClear, even);
3804     // clear topmost word (no jump would be needed if conditional assignment worked here)
3805     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3806     // index could be 0 now, must check again
3807     jcc(Assembler::zero, done);
3808     bind(even);
3809   }
3810 #endif // !_LP64
3811   // initialize remaining object fields: index is a multiple of 2 now
3812   {
3813     Label loop;
3814     bind(loop);
3815     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3816     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3817     decrement(index);
3818     jcc(Assembler::notZero, loop);
3819   }
3820 
3821   bind(done);
3822 }
3823 
3824 // Look up the method for a megamorphic invokeinterface call.
3825 // The target method is determined by <intf_klass, itable_index>.
3826 // The receiver klass is in recv_klass.
3827 // On success, the result will be in method_result, and execution falls through.
3828 // On failure, execution transfers to the given label.
3829 void MacroAssembler::lookup_interface_method(Register recv_klass,
3830                                              Register intf_klass,
3831                                              RegisterOrConstant itable_index,
3832                                              Register method_result,
3833                                              Register scan_temp,
3834                                              Label& L_no_such_interface,
3835                                              bool return_method) {
3836   assert_different_registers(recv_klass, intf_klass, scan_temp);
3837   assert_different_registers(method_result, intf_klass, scan_temp);
3838   assert(recv_klass != method_result || !return_method,
3839          "recv_klass can be destroyed when method isn't needed");
3840 
3841   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3842          "caller must use same register for non-constant itable index as for method");
3843 
3844   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3845   int vtable_base = in_bytes(Klass::vtable_start_offset());
3846   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3847   int scan_step   = itableOffsetEntry::size() * wordSize;
3848   int vte_size    = vtableEntry::size_in_bytes();
3849   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3850   assert(vte_size == wordSize, "else adjust times_vte_scale");
3851 
3852   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3853 
3854   // %%% Could store the aligned, prescaled offset in the klassoop.
3855   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3856 
3857   if (return_method) {
3858     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3859     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3860     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3861   }
3862 
3863   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3864   //   if (scan->interface() == intf) {
3865   //     result = (klass + scan->offset() + itable_index);
3866   //   }
3867   // }
3868   Label search, found_method;
3869 
3870   for (int peel = 1; peel >= 0; peel--) {
3871     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3872     cmpptr(intf_klass, method_result);
3873 
3874     if (peel) {
3875       jccb(Assembler::equal, found_method);
3876     } else {
3877       jccb(Assembler::notEqual, search);
3878       // (invert the test to fall through to found_method...)
3879     }
3880 
3881     if (!peel)  break;
3882 
3883     bind(search);
3884 
3885     // Check that the previous entry is non-null.  A null entry means that
3886     // the receiver class doesn't implement the interface, and wasn't the
3887     // same as when the caller was compiled.
3888     testptr(method_result, method_result);
3889     jcc(Assembler::zero, L_no_such_interface);
3890     addptr(scan_temp, scan_step);
3891   }
3892 
3893   bind(found_method);
3894 
3895   if (return_method) {
3896     // Got a hit.
3897     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3898     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3899   }
3900 }
3901 
3902 
3903 // virtual method calling
3904 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3905                                            RegisterOrConstant vtable_index,
3906                                            Register method_result) {
3907   const int base = in_bytes(Klass::vtable_start_offset());
3908   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3909   Address vtable_entry_addr(recv_klass,
3910                             vtable_index, Address::times_ptr,
3911                             base + vtableEntry::method_offset_in_bytes());
3912   movptr(method_result, vtable_entry_addr);
3913 }
3914 
3915 
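// Convenience form of the subtype check: jumps to L_success if sub_klass is a
// subtype of super_klass and falls through otherwise (L_failure is bound at the end).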
3916 void MacroAssembler::check_klass_subtype(Register sub_klass,
3917                            Register super_klass,
3918                            Register temp_reg,
3919                            Label& L_success) {
3920   Label L_failure;
3921   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3922   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3923   bind(L_failure);
3924 }
3925 
3926 
3927 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3928                                                    Register super_klass,
3929                                                    Register temp_reg,
3930                                                    Label* L_success,
3931                                                    Label* L_failure,
3932                                                    Label* L_slow_path,
3933                                         RegisterOrConstant super_check_offset) {
3934   assert_different_registers(sub_klass, super_klass, temp_reg);
3935   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3936   if (super_check_offset.is_register()) {
3937     assert_different_registers(sub_klass, super_klass,
3938                                super_check_offset.as_register());
3939   } else if (must_load_sco) {
3940     assert(temp_reg != noreg, "supply either a temp or a register offset");
3941   }
3942 
3943   Label L_fallthrough;
3944   int label_nulls = 0;
3945   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3946   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3947   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3948   assert(label_nulls <= 1, "at most one NULL in the batch");
3949 
3950   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3951   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3952   Address super_check_offset_addr(super_klass, sco_offset);
3953 
3954   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3955   // range of a jccb.  If this routine grows larger, reconsider at
3956   // least some of these.
3957 #define local_jcc(assembler_cond, label)                                \
3958   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3959   else                             jcc( assembler_cond, label) /*omit semi*/
3960 
3961   // Hacked jmp, which may only be used just before L_fallthrough.
3962 #define final_jmp(label)                                                \
3963   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3964   else                            jmp(label)                /*omit semi*/
3965 
3966   // If the pointers are equal, we are done (e.g., String[] elements).
3967   // This self-check enables sharing of secondary supertype arrays among
3968   // non-primary types such as array-of-interface.  Otherwise, each such
3969   // type would need its own customized SSA.
3970   // We move this check to the front of the fast path because many
3971   // type checks are in fact trivially successful in this manner,
3972   // so we get a nicely predicted branch right at the start of the check.
3973   cmpptr(sub_klass, super_klass);
3974   local_jcc(Assembler::equal, *L_success);
3975 
3976   // Check the supertype display:
3977   if (must_load_sco) {
3978     // Positive movl does the right thing on LP64.
3979     movl(temp_reg, super_check_offset_addr);
3980     super_check_offset = RegisterOrConstant(temp_reg);
3981   }
3982   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3983   cmpptr(super_klass, super_check_addr); // load displayed supertype
3984 
3985   // This check has worked decisively for primary supers.
3986   // Secondary supers are sought in the super_cache ('super_cache_addr').
3987   // (Secondary supers are interfaces and very deeply nested subtypes.)
3988   // This works in the same check above because of a tricky aliasing
3989   // between the super_cache and the primary super display elements.
3990   // (The 'super_check_addr' can address either, as the case requires.)
3991   // Note that the cache is updated below if it does not help us find
3992   // what we need immediately.
3993   // So if it was a primary super, we can just fail immediately.
3994   // Otherwise, it's the slow path for us (no success at this point).
3995 
3996   if (super_check_offset.is_register()) {
3997     local_jcc(Assembler::equal, *L_success);
3998     cmpl(super_check_offset.as_register(), sc_offset);
3999     if (L_failure == &L_fallthrough) {
4000       local_jcc(Assembler::equal, *L_slow_path);
4001     } else {
4002       local_jcc(Assembler::notEqual, *L_failure);
4003       final_jmp(*L_slow_path);
4004     }
4005   } else if (super_check_offset.as_constant() == sc_offset) {
4006     // Need a slow path; fast failure is impossible.
4007     if (L_slow_path == &L_fallthrough) {
4008       local_jcc(Assembler::equal, *L_success);
4009     } else {
4010       local_jcc(Assembler::notEqual, *L_slow_path);
4011       final_jmp(*L_success);
4012     }
4013   } else {
4014     // No slow path; it's a fast decision.
4015     if (L_failure == &L_fallthrough) {
4016       local_jcc(Assembler::equal, *L_success);
4017     } else {
4018       local_jcc(Assembler::notEqual, *L_failure);
4019       final_jmp(*L_success);
4020     }
4021   }
4022 
4023   bind(L_fallthrough);
4024 
4025 #undef local_jcc
4026 #undef final_jmp
4027 }
4028 
4029 
4030 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4031                                                    Register super_klass,
4032                                                    Register temp_reg,
4033                                                    Register temp2_reg,
4034                                                    Label* L_success,
4035                                                    Label* L_failure,
4036                                                    bool set_cond_codes) {
4037   assert_different_registers(sub_klass, super_klass, temp_reg);
4038   if (temp2_reg != noreg)
4039     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4040 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4041 
4042   Label L_fallthrough;
4043   int label_nulls = 0;
4044   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4045   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4046   assert(label_nulls <= 1, "at most one NULL in the batch");
4047 
4048   // a couple of useful fields in sub_klass:
4049   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4050   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4051   Address secondary_supers_addr(sub_klass, ss_offset);
4052   Address super_cache_addr(     sub_klass, sc_offset);
4053 
4054   // Do a linear scan of the secondary super-klass chain.
4055   // This code is rarely used, so simplicity is a virtue here.
4056   // The repne_scan instruction uses fixed registers, which we must spill.
4057   // Don't worry too much about pre-existing connections with the input regs.
4058 
4059   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4060   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4061 
4062   // Get super_klass value into rax (even if it was in rdi or rcx).
4063   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4064   if (super_klass != rax || UseCompressedOops) {
4065     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4066     mov(rax, super_klass);
4067   }
4068   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4069   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4070 
4071 #ifndef PRODUCT
4072   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4073   ExternalAddress pst_counter_addr((address) pst_counter);
4074   NOT_LP64(  incrementl(pst_counter_addr) );
4075   LP64_ONLY( lea(rcx, pst_counter_addr) );
4076   LP64_ONLY( incrementl(Address(rcx, 0)) );
4077 #endif //PRODUCT
4078 
4079   // We will consult the secondary-super array.
4080   movptr(rdi, secondary_supers_addr);
4081   // Load the array length.  (Positive movl does the right thing on LP64.)
4082   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4083   // Skip to start of data.
4084   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4085 
4086   // Scan RCX words at [RDI] for an occurrence of RAX.
4087   // Set NZ/Z based on last compare.
4088   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
4089   // not change flags (only scas instruction which is repeated sets flags).
4090   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4091 
4092   testptr(rax, rax); // Set Z = 0
4093   repne_scan();
4094 
4095   // Unspill the temp. registers:
4096   if (pushed_rdi)  pop(rdi);
4097   if (pushed_rcx)  pop(rcx);
4098   if (pushed_rax)  pop(rax);
4099 
4100   if (set_cond_codes) {
4101     // Special hack for the AD files:  rdi is guaranteed non-zero.
4102     assert(!pushed_rdi, "rdi must be left non-NULL");
4103     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4104   }
4105 
4106   if (L_failure == &L_fallthrough)
4107         jccb(Assembler::notEqual, *L_failure);
4108   else  jcc(Assembler::notEqual, *L_failure);
4109 
4110   // Success.  Cache the super we found and proceed in triumph.
4111   movptr(super_cache_addr, super_klass);
4112 
4113   if (L_success != &L_fallthrough) {
4114     jmp(*L_success);
4115   }
4116 
4117 #undef IS_A_TEMP
4118 
4119   bind(L_fallthrough);
4120 }
4121 
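// Class-initialization barrier: the fast path is taken when the klass is fully
// initialized, or when the current thread is the one running <clinit> (so the
// initializer may use its own class); everything else goes to the slow path.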
4122 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4123   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4124 
4125   Label L_fallthrough;
4126   if (L_fast_path == NULL) {
4127     L_fast_path = &L_fallthrough;
4128   } else if (L_slow_path == NULL) {
4129     L_slow_path = &L_fallthrough;
4130   }
4131 
4132   // Fast path check: class is fully initialized
4133   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4134   jcc(Assembler::equal, *L_fast_path);
4135 
4136   // Fast path check: current thread is initializer thread
4137   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4138   if (L_slow_path == &L_fallthrough) {
4139     jcc(Assembler::equal, *L_fast_path);
4140     bind(*L_slow_path);
4141   } else if (L_fast_path == &L_fallthrough) {
4142     jcc(Assembler::notEqual, *L_slow_path);
4143     bind(*L_fast_path);
4144   } else {
4145     Unimplemented();
4146   }
4147 }
4148 
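// cmov32: use the CMOV instruction when available, otherwise emulate it with a
// short branch around a plain movl.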
4149 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4150   if (VM_Version::supports_cmov()) {
4151     cmovl(cc, dst, src);
4152   } else {
4153     Label L;
4154     jccb(negate_condition(cc), L);
4155     movl(dst, src);
4156     bind(L);
4157   }
4158 }
4159 
4160 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4161   if (VM_Version::supports_cmov()) {
4162     cmovl(cc, dst, src);
4163   } else {
4164     Label L;
4165     jccb(negate_condition(cc), L);
4166     movl(dst, src);
4167     bind(L);
4168   }
4169 }
4170 
4171 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4172   if (!VerifyOops) return;
4173 
4174   // Pass register number to verify_oop_subroutine
4175   const char* b = NULL;
4176   {
4177     ResourceMark rm;
4178     stringStream ss;
4179     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4180     b = code_string(ss.as_string());
4181   }
4182   BLOCK_COMMENT("verify_oop {");
4183 #ifdef _LP64
4184   push(rscratch1);                    // save r10, trashed by movptr()
4185 #endif
4186   push(rax);                          // save rax,
4187   push(reg);                          // pass register argument
4188   ExternalAddress buffer((address) b);
4189   // avoid using pushptr, as it modifies scratch registers
4190   // and our contract is not to modify anything
4191   movptr(rax, buffer.addr());
4192   push(rax);
4193   // call indirectly to solve generation ordering problem
4194   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4195   call(rax);
4196   // Caller pops the arguments (oop, message) and restores rax, r10
4197   BLOCK_COMMENT("} verify_oop");
4198 }
4199 
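// Fill dst with all-one bits: vpternlogd with immediate 0xFF writes 1 to every bit
// on EVEX-capable targets; elsewhere, vpcmpeqb of a register with itself yields the
// same pattern.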
4200 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4201   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4202     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4203   } else {
4204     assert(UseAVX > 0, "");
4205     vpcmpeqb(dst, dst, dst, vector_len);
4206   }
4207 }
4208 
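// Address of an interpreter expression-stack argument: rsp plus the scaled arg_slot
// (and extra_slot_offset), each slot being Interpreter::stackElementSize bytes, plus
// one word to step over the return PC that is already on the stack.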
4209 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4210                                          int extra_slot_offset) {
4211   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4212   int stackElementSize = Interpreter::stackElementSize;
4213   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4214 #ifdef ASSERT
4215   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4216   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4217 #endif
4218   Register             scale_reg    = noreg;
4219   Address::ScaleFactor scale_factor = Address::no_scale;
4220   if (arg_slot.is_constant()) {
4221     offset += arg_slot.as_constant() * stackElementSize;
4222   } else {
4223     scale_reg    = arg_slot.as_register();
4224     scale_factor = Address::times(stackElementSize);
4225   }
4226   offset += wordSize;           // return PC is on stack
4227   return Address(rsp, scale_reg, scale_factor, offset);
4228 }
4229 
4230 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4231   if (!VerifyOops) return;
4232 
4233   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4234   // Pass register number to verify_oop_subroutine
4235   const char* b = NULL;
4236   {
4237     ResourceMark rm;
4238     stringStream ss;
4239     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4240     b = code_string(ss.as_string());
4241   }
4242 #ifdef _LP64
4243   push(rscratch1);                    // save r10, trashed by movptr()
4244 #endif
4245   push(rax);                          // save rax,
4246   // addr may contain rsp so we will have to adjust it based on the push
4247   // we just did (and on 64 bit we do two pushes)
4248   // NOTE: the 64-bit code appears to have had a bug here: it did movq(addr, rax),
4249   // which stores rax into addr, the reverse of what was intended.
4250   if (addr.uses(rsp)) {
4251     lea(rax, addr);
4252     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4253   } else {
4254     pushptr(addr);
4255   }
4256 
4257   ExternalAddress buffer((address) b);
4258   // pass msg argument
4259   // avoid using pushptr, as it modifies scratch registers
4260   // and our contract is not to modify anything
4261   movptr(rax, buffer.addr());
4262   push(rax);
4263 
4264   // call indirectly to solve generation ordering problem
4265   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4266   call(rax);
4267   // Caller pops the arguments (addr, message) and restores rax, r10.
4268 }
4269 
4270 void MacroAssembler::verify_tlab() {
4271 #ifdef ASSERT
4272   if (UseTLAB && VerifyOops) {
4273     Label next, ok;
4274     Register t1 = rsi;
4275     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4276 
4277     push(t1);
4278     NOT_LP64(push(thread_reg));
4279     NOT_LP64(get_thread(thread_reg));
4280 
4281     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4282     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4283     jcc(Assembler::aboveEqual, next);
4284     STOP("assert(top >= start)");
4285     should_not_reach_here();
4286 
4287     bind(next);
4288     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4289     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4290     jcc(Assembler::aboveEqual, ok);
4291     STOP("assert(top <= end)");
4292     should_not_reach_here();
4293 
4294     bind(ok);
4295     NOT_LP64(pop(thread_reg));
4296     pop(t1);
4297   }
4298 #endif
4299 }
4300 
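// The classes below mirror the x87 control/status/tag words and the register file as
// laid out by fsave/fnsave, together with the integer state as saved by push_CPU_state;
// they exist only to support the debug printing done by print_CPU_state() and
// _verify_FPU() further down.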
4301 class ControlWord {
4302  public:
4303   int32_t _value;
4304 
4305   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4306   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4307   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4308   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4309   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4310   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4311   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4312   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4313 
4314   void print() const {
4315     // rounding control
4316     const char* rc;
4317     switch (rounding_control()) {
4318       case 0: rc = "round near"; break;
4319       case 1: rc = "round down"; break;
4320       case 2: rc = "round up  "; break;
4321       case 3: rc = "chop      "; break;
4322       default:
4323         rc = NULL; // silence compiler warnings
4324         fatal("Unknown rounding control: %d", rounding_control());
4325     };
4326     // precision control
4327     const char* pc;
4328     switch (precision_control()) {
4329       case 0: pc = "24 bits "; break;
4330       case 1: pc = "reserved"; break;
4331       case 2: pc = "53 bits "; break;
4332       case 3: pc = "64 bits "; break;
4333       default:
4334         pc = NULL; // silence compiler warnings
4335         fatal("Unknown precision control: %d", precision_control());
4336     };
4337     // flags
4338     char f[9];
4339     f[0] = ' ';
4340     f[1] = ' ';
4341     f[2] = (precision   ()) ? 'P' : 'p';
4342     f[3] = (underflow   ()) ? 'U' : 'u';
4343     f[4] = (overflow    ()) ? 'O' : 'o';
4344     f[5] = (zero_divide ()) ? 'Z' : 'z';
4345     f[6] = (denormalized()) ? 'D' : 'd';
4346     f[7] = (invalid     ()) ? 'I' : 'i';
4347     f[8] = '\x0';
4348     // output
4349     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4350   }
4351 
4352 };
4353 
4354 class StatusWord {
4355  public:
4356   int32_t _value;
4357 
4358   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4359   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4360   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4361   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4362   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4363   int  top() const                     { return  (_value >> 11) & 7      ; }
4364   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4365   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4366   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4367   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4368   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4369   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4370   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4371   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4372 
4373   void print() const {
4374     // condition codes
4375     char c[5];
4376     c[0] = (C3()) ? '3' : '-';
4377     c[1] = (C2()) ? '2' : '-';
4378     c[2] = (C1()) ? '1' : '-';
4379     c[3] = (C0()) ? '0' : '-';
4380     c[4] = '\x0';
4381     // flags
4382     char f[9];
4383     f[0] = (error_status()) ? 'E' : '-';
4384     f[1] = (stack_fault ()) ? 'S' : '-';
4385     f[2] = (precision   ()) ? 'P' : '-';
4386     f[3] = (underflow   ()) ? 'U' : '-';
4387     f[4] = (overflow    ()) ? 'O' : '-';
4388     f[5] = (zero_divide ()) ? 'Z' : '-';
4389     f[6] = (denormalized()) ? 'D' : '-';
4390     f[7] = (invalid     ()) ? 'I' : '-';
4391     f[8] = '\x0';
4392     // output
4393     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4394   }
4395 
4396 };
4397 
4398 class TagWord {
4399  public:
4400   int32_t _value;
4401 
4402   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4403 
4404   void print() const {
4405     printf("%04x", _value & 0xFFFF);
4406   }
4407 
4408 };
4409 
4410 class FPU_Register {
4411  public:
4412   int32_t _m0;
4413   int32_t _m1;
4414   int16_t _ex;
4415 
4416   bool is_indefinite() const           {
4417     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4418   }
4419 
4420   void print() const {
4421     char  sign = (_ex < 0) ? '-' : '+';
4422     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4423     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4424   };
4425 
4426 };
4427 
4428 class FPU_State {
4429  public:
4430   enum {
4431     register_size       = 10,
4432     number_of_registers =  8,
4433     register_mask       =  7
4434   };
4435 
4436   ControlWord  _control_word;
4437   StatusWord   _status_word;
4438   TagWord      _tag_word;
4439   int32_t      _error_offset;
4440   int32_t      _error_selector;
4441   int32_t      _data_offset;
4442   int32_t      _data_selector;
4443   int8_t       _register[register_size * number_of_registers];
4444 
4445   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4446   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4447 
4448   const char* tag_as_string(int tag) const {
4449     switch (tag) {
4450       case 0: return "valid";
4451       case 1: return "zero";
4452       case 2: return "special";
4453       case 3: return "empty";
4454     }
4455     ShouldNotReachHere();
4456     return NULL;
4457   }
4458 
4459   void print() const {
4460     // print computation registers
4461     { int t = _status_word.top();
4462       for (int i = 0; i < number_of_registers; i++) {
4463         int j = (i - t) & register_mask;
4464         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4465         st(j)->print();
4466         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4467       }
4468     }
4469     printf("\n");
4470     // print control registers
4471     printf("ctrl = "); _control_word.print(); printf("\n");
4472     printf("stat = "); _status_word .print(); printf("\n");
4473     printf("tags = "); _tag_word    .print(); printf("\n");
4474   }
4475 
4476 };
4477 
4478 class Flag_Register {
4479  public:
4480   int32_t _value;
4481 
4482   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4483   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4484   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4485   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4486   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4487   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4488   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4489 
4490   void print() const {
4491     // flags
4492     char f[8];
4493     f[0] = (overflow       ()) ? 'O' : '-';
4494     f[1] = (direction      ()) ? 'D' : '-';
4495     f[2] = (sign           ()) ? 'S' : '-';
4496     f[3] = (zero           ()) ? 'Z' : '-';
4497     f[4] = (auxiliary_carry()) ? 'A' : '-';
4498     f[5] = (parity         ()) ? 'P' : '-';
4499     f[6] = (carry          ()) ? 'C' : '-';
4500     f[7] = '\x0';
4501     // output
4502     printf("%08x  flags = %s", _value, f);
4503   }
4504 
4505 };
4506 
4507 class IU_Register {
4508  public:
4509   int32_t _value;
4510 
4511   void print() const {
4512     printf("%08x  %11d", _value, _value);
4513   }
4514 
4515 };
4516 
4517 class IU_State {
4518  public:
4519   Flag_Register _eflags;
4520   IU_Register   _rdi;
4521   IU_Register   _rsi;
4522   IU_Register   _rbp;
4523   IU_Register   _rsp;
4524   IU_Register   _rbx;
4525   IU_Register   _rdx;
4526   IU_Register   _rcx;
4527   IU_Register   _rax;
4528 
4529   void print() const {
4530     // computation registers
4531     printf("rax  = "); _rax.print(); printf("\n");
4532     printf("rbx  = "); _rbx.print(); printf("\n");
4533     printf("rcx  = "); _rcx.print(); printf("\n");
4534     printf("rdx  = "); _rdx.print(); printf("\n");
4535     printf("rdi  = "); _rdi.print(); printf("\n");
4536     printf("rsi  = "); _rsi.print(); printf("\n");
4537     printf("rbp  = "); _rbp.print(); printf("\n");
4538     printf("rsp  = "); _rsp.print(); printf("\n");
4539     printf("\n");
4540     // control registers
4541     printf("flgs = "); _eflags.print(); printf("\n");
4542   }
4543 };
4544 
4545 
4546 class CPU_State {
4547  public:
4548   FPU_State _fpu_state;
4549   IU_State  _iu_state;
4550 
4551   void print() const {
4552     printf("--------------------------------------------------\n");
4553     _iu_state .print();
4554     printf("\n");
4555     _fpu_state.print();
4556     printf("--------------------------------------------------\n");
4557   }
4558 
4559 };
4560 
4561 
4562 static void _print_CPU_state(CPU_State* state) {
4563   state->print();
4564 }
4565 
4566 
4567 void MacroAssembler::print_CPU_state() {
4568   push_CPU_state();
4569   push(rsp);                // pass CPU state
4570   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4571   addptr(rsp, wordSize);       // discard argument
4572   pop_CPU_state();
4573 }
4574 
4575 
4576 #ifndef _LP64
4577 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4578   static int counter = 0;
4579   FPU_State* fs = &state->_fpu_state;
4580   counter++;
4581   // For leaf calls, only verify that the top few elements remain empty.
4582   // We only need 1 empty at the top for C2 code.
4583   if( stack_depth < 0 ) {
4584     if( fs->tag_for_st(7) != 3 ) {
4585       printf("FPR7 not empty\n");
4586       state->print();
4587       assert(false, "error");
4588       return false;
4589     }
4590     return true;                // All other stack states do not matter
4591   }
4592 
4593   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4594          "bad FPU control word");
4595 
4596   // compute stack depth
4597   int i = 0;
4598   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4599   int d = i;
4600   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4601   // verify findings
4602   if (i != FPU_State::number_of_registers) {
4603     // stack not contiguous
4604     printf("%s: stack not contiguous at ST%d\n", s, i);
4605     state->print();
4606     assert(false, "error");
4607     return false;
4608   }
4609   // check if computed stack depth corresponds to expected stack depth
4610   if (stack_depth < 0) {
4611     // expected stack depth is -stack_depth or less
4612     if (d > -stack_depth) {
4613       // too many elements on the stack
4614       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4615       state->print();
4616       assert(false, "error");
4617       return false;
4618     }
4619   } else {
4620     // expected stack depth is stack_depth
4621     if (d != stack_depth) {
4622       // wrong stack depth
4623       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4624       state->print();
4625       assert(false, "error");
4626       return false;
4627     }
4628   }
4629   // everything is cool
4630   return true;
4631 }
4632 
4633 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4634   if (!VerifyFPU) return;
4635   push_CPU_state();
4636   push(rsp);                // pass CPU state
4637   ExternalAddress msg((address) s);
4638   // pass message string s
4639   pushptr(msg.addr());
4640   push(stack_depth);        // pass stack depth
4641   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4642   addptr(rsp, 3 * wordSize);   // discard arguments
4643   // check for error
4644   { Label L;
4645     testl(rax, rax);
4646     jcc(Assembler::notZero, L);
4647     int3();                  // break if error condition
4648     bind(L);
4649   }
4650   pop_CPU_state();
4651 }
4652 #endif // !_LP64
4653 
4654 void MacroAssembler::restore_cpu_control_state_after_jni() {
4655   // Either restore the MXCSR register after returning from the JNI Call
4656   // or verify that it wasn't changed (with -Xcheck:jni flag).
4657   if (VM_Version::supports_sse()) {
4658     if (RestoreMXCSROnJNICalls) {
4659       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4660     } else if (CheckJNICalls) {
4661       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4662     }
4663   }
4664   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4665   vzeroupper();
4666 
4667 #ifndef _LP64
4668   // Either restore the x87 floating pointer control word after returning
4669   // from the JNI call or verify that it wasn't changed.
4670   if (CheckJNICalls) {
4671     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4672   }
4673 #endif // !_LP64
4674 }
4675 
4676 // ((OopHandle)result).resolve();
4677 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4678   assert_different_registers(result, tmp);
4679 
4680   // Only 64 bit platforms support GCs that require a tmp register
4681   // Only IN_HEAP loads require a thread_tmp register
4682   // OopHandle::resolve is an indirection like jobject.
4683   access_load_at(T_OBJECT, IN_NATIVE,
4684                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4685 }
4686 
4687 // ((WeakHandle)result).resolve();
4688 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4689   assert_different_registers(rresult, rtmp);
4690   Label resolved;
4691 
4692   // A null weak handle resolves to null.
4693   cmpptr(rresult, 0);
4694   jcc(Assembler::equal, resolved);
4695 
4696   // Only 64 bit platforms support GCs that require a tmp register
4697   // Only IN_HEAP loads require a thread_tmp register
4698   // WeakHandle::resolve is an indirection like jweak.
4699   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4700                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4701   bind(resolved);
4702 }
4703 
4704 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4705   // get mirror
4706   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4707   load_method_holder(mirror, method);
4708   movptr(mirror, Address(mirror, mirror_offset));
4709   resolve_oop_handle(mirror, tmp);
4710 }
4711 
4712 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4713   load_method_holder(rresult, rmethod);
4714   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4715 }
4716 
4717 void MacroAssembler::load_method_holder(Register holder, Register method) {
4718   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4719   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4720   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4721 }
4722 
4723 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4724   assert_different_registers(src, tmp);
4725   assert_different_registers(dst, tmp);
4726 #ifdef _LP64
4727   if (UseCompressedClassPointers) {
4728     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4729     decode_klass_not_null(dst, tmp);
4730   } else
4731 #endif
4732     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4733 }
4734 
4735 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4736   assert_different_registers(src, tmp);
4737   assert_different_registers(dst, tmp);
4738 #ifdef _LP64
4739   if (UseCompressedClassPointers) {
4740     encode_klass_not_null(src, tmp);
4741     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4742   } else
4743 #endif
4744     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4745 }
4746 
4747 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4748                                     Register tmp1, Register thread_tmp) {
4749   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4750   decorators = AccessInternal::decorator_fixup(decorators);
4751   bool as_raw = (decorators & AS_RAW) != 0;
4752   if (as_raw) {
4753     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4754   } else {
4755     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4756   }
4757 }
4758 
4759 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4760                                      Register tmp1, Register tmp2, Register tmp3) {
4761   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4762   decorators = AccessInternal::decorator_fixup(decorators);
4763   bool as_raw = (decorators & AS_RAW) != 0;
4764   if (as_raw) {
4765     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4766   } else {
4767     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4768   }
4769 }
4770 
4771 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4772                                    Register thread_tmp, DecoratorSet decorators) {
4773   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4774 }
4775 
4776 // Doesn't do verification, generates fixed size code
4777 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4778                                             Register thread_tmp, DecoratorSet decorators) {
4779   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4780 }
4781 
4782 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4783                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4784   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
4785 }
4786 
4787 // Used for storing NULLs.
4788 void MacroAssembler::store_heap_oop_null(Address dst) {
4789   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4790 }
4791 
4792 #ifdef _LP64
4793 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4794   if (UseCompressedClassPointers) {
4795     // Store to klass gap in destination
4796     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4797   }
4798 }
4799 
4800 #ifdef ASSERT
4801 void MacroAssembler::verify_heapbase(const char* msg) {
4802   assert (UseCompressedOops, "should be compressed");
4803   assert (Universe::heap() != NULL, "java heap should be initialized");
4804   if (CheckCompressedOops) {
4805     Label ok;
4806     const auto src2 = ExternalAddress((address)CompressedOops::ptrs_base_addr());
4807     assert(!src2.is_lval(), "should not be lval");
4808     const bool is_src2_reachable = reachable(src2);
4809     if (!is_src2_reachable) {
4810       push(rscratch1);  // cmpptr trashes rscratch1
4811     }
4812     cmpptr(r12_heapbase, src2);
4813     jcc(Assembler::equal, ok);
4814     STOP(msg);
4815     bind(ok);
4816     if (!is_src2_reachable) {
4817       pop(rscratch1);
4818     }
4819   }
4820 }
4821 #endif
4822 
4823 // Algorithm must match oop.inline.hpp encode_heap_oop.
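     // In pseudo-code, a sketch of the same transformation:
     //   narrow = (oop == NULL) ? 0 : (uint32_t)((oop - CompressedOops::base()) >> CompressedOops::shift())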
4824 void MacroAssembler::encode_heap_oop(Register r) {
4825 #ifdef ASSERT
4826   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4827 #endif
4828   verify_oop_msg(r, "broken oop in encode_heap_oop");
4829   if (CompressedOops::base() == NULL) {
4830     if (CompressedOops::shift() != 0) {
4831       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4832       shrq(r, LogMinObjAlignmentInBytes);
4833     }
4834     return;
4835   }
4836   testq(r, r);
4837   cmovq(Assembler::equal, r, r12_heapbase);
4838   subq(r, r12_heapbase);
4839   shrq(r, LogMinObjAlignmentInBytes);
4840 }
4841 
4842 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4843 #ifdef ASSERT
4844   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4845   if (CheckCompressedOops) {
4846     Label ok;
4847     testq(r, r);
4848     jcc(Assembler::notEqual, ok);
4849     STOP("null oop passed to encode_heap_oop_not_null");
4850     bind(ok);
4851   }
4852 #endif
4853   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4854   if (CompressedOops::base() != NULL) {
4855     subq(r, r12_heapbase);
4856   }
4857   if (CompressedOops::shift() != 0) {
4858     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4859     shrq(r, LogMinObjAlignmentInBytes);
4860   }
4861 }
4862 
4863 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4864 #ifdef ASSERT
4865   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4866   if (CheckCompressedOops) {
4867     Label ok;
4868     testq(src, src);
4869     jcc(Assembler::notEqual, ok);
4870     STOP("null oop passed to encode_heap_oop_not_null2");
4871     bind(ok);
4872   }
4873 #endif
4874   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4875   if (dst != src) {
4876     movq(dst, src);
4877   }
4878   if (CompressedOops::base() != NULL) {
4879     subq(dst, r12_heapbase);
4880   }
4881   if (CompressedOops::shift() != 0) {
4882     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4883     shrq(dst, LogMinObjAlignmentInBytes);
4884   }
4885 }
4886 
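     // Inverse of encode_heap_oop; roughly:
     //   oop = (narrow == 0) ? NULL : CompressedOops::base() + ((uintptr_t)narrow << CompressedOops::shift())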
4887 void  MacroAssembler::decode_heap_oop(Register r) {
4888 #ifdef ASSERT
4889   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4890 #endif
4891   if (CompressedOops::base() == NULL) {
4892     if (CompressedOops::shift() != 0) {
4893       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4894       shlq(r, LogMinObjAlignmentInBytes);
4895     }
4896   } else {
4897     Label done;
4898     shlq(r, LogMinObjAlignmentInBytes);
4899     jccb(Assembler::equal, done);
4900     addq(r, r12_heapbase);
4901     bind(done);
4902   }
4903   verify_oop_msg(r, "broken oop in decode_heap_oop");
4904 }
4905 
4906 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4907   // Note: it will change flags
4908   assert (UseCompressedOops, "should only be used for compressed headers");
4909   assert (Universe::heap() != NULL, "java heap should be initialized");
4910   // Cannot assert, unverified entry point counts instructions (see .ad file)
4911   // vtableStubs also counts instructions in pd_code_size_limit.
4912   // Also do not verify_oop as this is called by verify_oop.
4913   if (CompressedOops::shift() != 0) {
4914     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4915     shlq(r, LogMinObjAlignmentInBytes);
4916     if (CompressedOops::base() != NULL) {
4917       addq(r, r12_heapbase);
4918     }
4919   } else {
4920     assert (CompressedOops::base() == NULL, "sanity");
4921   }
4922 }
4923 
4924 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4925   // Note: it will change flags
4926   assert (UseCompressedOops, "should only be used for compressed headers");
4927   assert (Universe::heap() != NULL, "java heap should be initialized");
4928   // Cannot assert, unverified entry point counts instructions (see .ad file)
4929   // vtableStubs also counts instructions in pd_code_size_limit.
4930   // Also do not verify_oop as this is called by verify_oop.
4931   if (CompressedOops::shift() != 0) {
4932     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4933     if (LogMinObjAlignmentInBytes == Address::times_8) {
4934       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4935     } else {
4936       if (dst != src) {
4937         movq(dst, src);
4938       }
4939       shlq(dst, LogMinObjAlignmentInBytes);
4940       if (CompressedOops::base() != NULL) {
4941         addq(dst, r12_heapbase);
4942       }
4943     }
4944   } else {
4945     assert (CompressedOops::base() == NULL, "sanity");
4946     if (dst != src) {
4947       movq(dst, src);
4948     }
4949   }
4950 }
4951 
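     // Narrow Klass* encoding, roughly (base/shift from CompressedKlassPointers):
     //   narrow_klass = (uint32_t)((klass - base) >> shift)
     // The decode_* variants below apply the inverse transformation.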
4952 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4953   assert_different_registers(r, tmp);
4954   if (CompressedKlassPointers::base() != NULL) {
4955     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4956     subq(r, tmp);
4957   }
4958   if (CompressedKlassPointers::shift() != 0) {
4959     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4960     shrq(r, LogKlassAlignmentInBytes);
4961   }
4962 }
4963 
4964 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4965   assert_different_registers(src, dst);
4966   if (CompressedKlassPointers::base() != NULL) {
4967     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4968     addq(dst, src);
4969   } else {
4970     movptr(dst, src);
4971   }
4972   if (CompressedKlassPointers::shift() != 0) {
4973     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4974     shrq(dst, LogKlassAlignmentInBytes);
4975   }
4976 }
4977 
4978 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4979   assert_different_registers(r, tmp);
4980   // Note: it will change flags
4981   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4982   // Cannot assert, unverified entry point counts instructions (see .ad file)
4983   // vtableStubs also counts instructions in pd_code_size_limit.
4984   // Also do not verify_oop as this is called by verify_oop.
4985   if (CompressedKlassPointers::shift() != 0) {
4986     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4987     shlq(r, LogKlassAlignmentInBytes);
4988   }
4989   if (CompressedKlassPointers::base() != NULL) {
4990     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4991     addq(r, tmp);
4992   }
4993 }
4994 
4995 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
4996   assert_different_registers(src, dst);
4997   // Note: it will change flags
4998   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4999   // Cannot assert, unverified entry point counts instructions (see .ad file)
5000   // vtableStubs also counts instructions in pd_code_size_limit.
5001   // Also do not verify_oop as this is called by verify_oop.
5002 
5003   if (CompressedKlassPointers::base() == NULL &&
5004       CompressedKlassPointers::shift() == 0) {
5005     // The best case scenario is that there is no base or shift. Then it is already
5006     // a pointer that needs nothing but a register rename.
5007     movl(dst, src);
5008   } else {
5009     if (CompressedKlassPointers::base() != NULL) {
5010       mov64(dst, (int64_t)CompressedKlassPointers::base());
5011     } else {
5012       xorq(dst, dst);
5013     }
5014     if (CompressedKlassPointers::shift() != 0) {
5015       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5016       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5017       leaq(dst, Address(dst, src, Address::times_8, 0));
5018     } else {
5019       addq(dst, src);
5020     }
5021   }
5022 }
5023 
5024 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5025   assert (UseCompressedOops, "should only be used for compressed headers");
5026   assert (Universe::heap() != NULL, "java heap should be initialized");
5027   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5028   int oop_index = oop_recorder()->find_index(obj);
5029   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5030   mov_narrow_oop(dst, oop_index, rspec);
5031 }
5032 
5033 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5034   assert (UseCompressedOops, "should only be used for compressed headers");
5035   assert (Universe::heap() != NULL, "java heap should be initialized");
5036   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5037   int oop_index = oop_recorder()->find_index(obj);
5038   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5039   mov_narrow_oop(dst, oop_index, rspec);
5040 }
5041 
5042 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5043   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5044   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5045   int klass_index = oop_recorder()->find_index(k);
5046   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5047   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5048 }
5049 
5050 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5051   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5052   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5053   int klass_index = oop_recorder()->find_index(k);
5054   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5055   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5056 }
5057 
5058 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5059   assert (UseCompressedOops, "should only be used for compressed headers");
5060   assert (Universe::heap() != NULL, "java heap should be initialized");
5061   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5062   int oop_index = oop_recorder()->find_index(obj);
5063   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5064   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5065 }
5066 
5067 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5068   assert (UseCompressedOops, "should only be used for compressed headers");
5069   assert (Universe::heap() != NULL, "java heap should be initialized");
5070   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5071   int oop_index = oop_recorder()->find_index(obj);
5072   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5073   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5074 }
5075 
5076 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5077   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5078   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5079   int klass_index = oop_recorder()->find_index(k);
5080   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5081   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5082 }
5083 
5084 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5085   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5086   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5087   int klass_index = oop_recorder()->find_index(k);
5088   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5089   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5090 }
5091 
5092 void MacroAssembler::reinit_heapbase() {
5093   if (UseCompressedOops) {
5094     if (Universe::heap() != NULL) {
5095       if (CompressedOops::base() == NULL) {
5096         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5097       } else {
5098         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5099       }
5100     } else {
5101       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5102     }
5103   }
5104 }
5105 
5106 #endif // _LP64
5107 
5108 // C2 compiled method's prolog code.
5109 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5110 
5111   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5112   // NativeJump::patch_verified_entry will be able to patch out the entry
5113   // code safely. The push to verify stack depth is ok at 5 bytes,
5114   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5115   // stack bang then we must use the 6 byte frame allocation even if
5116   // we have no frame. :-(
5117   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5118 
5119   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5120   // Remove word for return addr
5121   framesize -= wordSize;
5122   stack_bang_size -= wordSize;
5123 
5124   // Calls to C2R adapters often do not accept exceptional returns.
5125   // We require that their callers must bang for them.  But be careful, because
5126   // some VM calls (such as call site linkage) can use several kilobytes of
5127   // stack.  But the stack safety zone should account for that.
5128   // See bugs 4446381, 4468289, 4497237.
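       // Schematically, the prolog emitted below is one of:
       //   with stack bang:    <bang pages>; push rbp; [mov rbp, rsp]; sub rsp, framesize
       //   without stack bang: sub rsp, framesize (6-byte form); mov [rsp + off], rbp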
5129   if (stack_bang_size > 0) {
5130     generate_stack_overflow_check(stack_bang_size);
5131 
5132     // We always push rbp, so that on return to interpreter rbp, will be
5133     // restored correctly and we can correct the stack.
5134     push(rbp);
5135     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5136     if (PreserveFramePointer) {
5137       mov(rbp, rsp);
5138     }
5139     // Remove word for ebp
5140     framesize -= wordSize;
5141 
5142     // Create frame
5143     if (framesize) {
5144       subptr(rsp, framesize);
5145     }
5146   } else {
5147     // Create frame (force generation of a 4 byte immediate value)
5148     subptr_imm32(rsp, framesize);
5149 
5150     // Save RBP register now.
5151     framesize -= wordSize;
5152     movptr(Address(rsp, framesize), rbp);
5153     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5154     if (PreserveFramePointer) {
5155       movptr(rbp, rsp);
5156       if (framesize > 0) {
5157         addptr(rbp, framesize);
5158       }
5159     }
5160   }
5161 
5162   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5163     framesize -= wordSize;
5164     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5165   }
5166 
5167 #ifndef _LP64
5168   // If method sets FPU control word do it now
5169   if (fp_mode_24b) {
5170     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5171   }
5172   if (UseSSE >= 2 && VerifyFPU) {
5173     verify_FPU(0, "FPU stack must be clean on entry");
5174   }
5175 #endif
5176 
5177 #ifdef ASSERT
5178   if (VerifyStackAtCalls) {
5179     Label L;
5180     push(rax);
5181     mov(rax, rsp);
5182     andptr(rax, StackAlignmentInBytes-1);
5183     cmpptr(rax, StackAlignmentInBytes-wordSize);
5184     pop(rax);
5185     jcc(Assembler::equal, L);
5186     STOP("Stack is not properly aligned!");
5187     bind(L);
5188   }
5189 #endif
5190 
5191   if (!is_stub) {
5192     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5193     bs->nmethod_entry_barrier(this);
5194   }
5195 }
5196 
5197 #if COMPILER2_OR_JVMCI
5198 
5199 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5200 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5201   // cnt - number of qwords (8-byte words).
5202   // base - start address, qword aligned.
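       // Strategy: zero xtmp once, clear 64 bytes per main-loop iteration, then
       // handle the remaining 0..7 qwords with a masked store (AVX-512) or a qword loop.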
5203   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5204   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5205   if (use64byteVector) {
5206     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5207   } else if (MaxVectorSize >= 32) {
5208     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5209   } else {
5210     pxor(xtmp, xtmp);
5211   }
5212   jmp(L_zero_64_bytes);
5213 
5214   BIND(L_loop);
5215   if (MaxVectorSize >= 32) {
5216     fill64(base, 0, xtmp, use64byteVector);
5217   } else {
5218     movdqu(Address(base,  0), xtmp);
5219     movdqu(Address(base, 16), xtmp);
5220     movdqu(Address(base, 32), xtmp);
5221     movdqu(Address(base, 48), xtmp);
5222   }
5223   addptr(base, 64);
5224 
5225   BIND(L_zero_64_bytes);
5226   subptr(cnt, 8);
5227   jccb(Assembler::greaterEqual, L_loop);
5228 
5229   // Copy trailing 64 bytes
5230   if (use64byteVector) {
5231     addptr(cnt, 8);
5232     jccb(Assembler::equal, L_end);
5233     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5234     jmp(L_end);
5235   } else {
5236     addptr(cnt, 4);
5237     jccb(Assembler::less, L_tail);
5238     if (MaxVectorSize >= 32) {
5239       vmovdqu(Address(base, 0), xtmp);
5240     } else {
5241       movdqu(Address(base,  0), xtmp);
5242       movdqu(Address(base, 16), xtmp);
5243     }
5244   }
5245   addptr(base, 32);
5246   subptr(cnt, 4);
5247 
5248   BIND(L_tail);
5249   addptr(cnt, 4);
5250   jccb(Assembler::lessEqual, L_end);
5251   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5252     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5253   } else {
5254     decrement(cnt);
5255 
5256     BIND(L_sloop);
5257     movq(Address(base, 0), xtmp);
5258     addptr(base, 8);
5259     decrement(cnt);
5260     jccb(Assembler::greaterEqual, L_sloop);
5261   }
5262   BIND(L_end);
5263 }
5264 
5265 // Clearing constant sized memory using YMM/ZMM registers.
5266 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5267   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5268   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5269 
5270   int vector64_count = (cnt & (~0x7)) >> 3;
5271   cnt = cnt & 0x7;
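       // vector64_count full 64-byte (8-qword) chunks; cnt now holds the 0..7 qword tail.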
5272   const int fill64_per_loop = 4;
5273   const int max_unrolled_fill64 = 8;
5274 
5275   // 64 byte initialization loop.
5276   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5277   int start64 = 0;
5278   if (vector64_count > max_unrolled_fill64) {
5279     Label LOOP;
5280     Register index = rtmp;
5281 
5282     start64 = vector64_count - (vector64_count % fill64_per_loop);
5283 
5284     movl(index, 0);
5285     BIND(LOOP);
5286     for (int i = 0; i < fill64_per_loop; i++) {
5287       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5288     }
5289     addl(index, fill64_per_loop * 64);
5290     cmpl(index, start64 * 64);
5291     jccb(Assembler::less, LOOP);
5292   }
5293   for (int i = start64; i < vector64_count; i++) {
5294     fill64(base, i * 64, xtmp, use64byteVector);
5295   }
5296 
5297   // Clear remaining 64 byte tail.
5298   int disp = vector64_count * 64;
5299   if (cnt) {
5300     switch (cnt) {
5301       case 1:
5302         movq(Address(base, disp), xtmp);
5303         break;
5304       case 2:
5305         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
5306         break;
5307       case 3:
5308         movl(rtmp, 0x7);
5309         kmovwl(mask, rtmp);
5310         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
5311         break;
5312       case 4:
5313         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5314         break;
5315       case 5:
5316         if (use64byteVector) {
5317           movl(rtmp, 0x1F);
5318           kmovwl(mask, rtmp);
5319           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5320         } else {
5321           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5322           movq(Address(base, disp + 32), xtmp);
5323         }
5324         break;
5325       case 6:
5326         if (use64byteVector) {
5327           movl(rtmp, 0x3F);
5328           kmovwl(mask, rtmp);
5329           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5330         } else {
5331           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5332           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
5333         }
5334         break;
5335       case 7:
5336         if (use64byteVector) {
5337           movl(rtmp, 0x7F);
5338           kmovwl(mask, rtmp);
5339           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5340         } else {
5341           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5342           movl(rtmp, 0x7);
5343           kmovwl(mask, rtmp);
5344           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
5345         }
5346         break;
5347       default:
5348         fatal("Unexpected length: %d\n", cnt);
5349         break;
5350     }
5351   }
5352 }
5353 
5354 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5355                                bool is_large, KRegister mask) {
5356   // cnt      - number of qwords (8-byte words).
5357   // base     - start address, qword aligned.
5358   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5359   assert(base==rdi, "base register must be edi for rep stos");
5360   assert(tmp==rax,   "tmp register must be eax for rep stos");
5361   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5362   assert(InitArrayShortSize % BytesPerLong == 0,
5363     "InitArrayShortSize should be the multiple of BytesPerLong");
5364 
5365   Label DONE;
5366   if (!is_large || !UseXMMForObjInit) {
5367     xorptr(tmp, tmp);
5368   }
5369 
5370   if (!is_large) {
5371     Label LOOP, LONG;
5372     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5373     jccb(Assembler::greater, LONG);
5374 
5375     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5376 
5377     decrement(cnt);
5378     jccb(Assembler::negative, DONE); // Zero length
5379 
5380     // Use individual pointer-sized stores for small counts:
5381     BIND(LOOP);
5382     movptr(Address(base, cnt, Address::times_ptr), tmp);
5383     decrement(cnt);
5384     jccb(Assembler::greaterEqual, LOOP);
5385     jmpb(DONE);
5386 
5387     BIND(LONG);
5388   }
5389 
5390   // Use longer rep-prefixed ops for non-small counts:
5391   if (UseFastStosb) {
5392     shlptr(cnt, 3); // convert to number of bytes
5393     rep_stosb();
5394   } else if (UseXMMForObjInit) {
5395     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5396   } else {
5397     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5398     rep_stos();
5399   }
5400 
5401   BIND(DONE);
5402 }
5403 
5404 #endif //COMPILER2_OR_JVMCI
5405 
5406 
5407 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5408                                    Register to, Register value, Register count,
5409                                    Register rtmp, XMMRegister xtmp) {
5410   ShortBranchVerifier sbv(this);
5411   assert_different_registers(to, value, count, rtmp);
5412   Label L_exit;
5413   Label L_fill_2_bytes, L_fill_4_bytes;
5414 
5415 #if defined(COMPILER2) && defined(_LP64)
5416   if(MaxVectorSize >=32 &&
5417      VM_Version::supports_avx512vlbw() &&
5418      VM_Version::supports_bmi2()) {
5419     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5420     return;
5421   }
5422 #endif
5423 
5424   int shift = -1;
5425   switch (t) {
5426     case T_BYTE:
5427       shift = 2;
5428       break;
5429     case T_SHORT:
5430       shift = 1;
5431       break;
5432     case T_INT:
5433       shift = 0;
5434       break;
5435     default: ShouldNotReachHere();
5436   }
5437 
5438   if (t == T_BYTE) {
5439     andl(value, 0xff);
5440     movl(rtmp, value);
5441     shll(rtmp, 8);
5442     orl(value, rtmp);
5443   }
5444   if (t == T_SHORT) {
5445     andl(value, 0xffff);
5446   }
5447   if (t == T_BYTE || t == T_SHORT) {
5448     movl(rtmp, value);
5449     shll(rtmp, 16);
5450     orl(value, rtmp);
5451   }
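       // 'value' now holds the fill pattern replicated across all 32 bits
       // (byte x4, short x2, int unchanged), so the wide stores below need not
       // distinguish the element type.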
5452 
5453   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5454   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5455   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5456     Label L_skip_align2;
5457     // align source address at 4 bytes address boundary
5458     if (t == T_BYTE) {
5459       Label L_skip_align1;
5460       // One byte misalignment happens only for byte arrays
5461       testptr(to, 1);
5462       jccb(Assembler::zero, L_skip_align1);
5463       movb(Address(to, 0), value);
5464       increment(to);
5465       decrement(count);
5466       BIND(L_skip_align1);
5467     }
5468     // Two bytes misalignment happens only for byte and short (char) arrays
5469     testptr(to, 2);
5470     jccb(Assembler::zero, L_skip_align2);
5471     movw(Address(to, 0), value);
5472     addptr(to, 2);
5473     subl(count, 1<<(shift-1));
5474     BIND(L_skip_align2);
5475   }
5476   if (UseSSE < 2) {
5477     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5478     // Fill 32-byte chunks
5479     subl(count, 8 << shift);
5480     jcc(Assembler::less, L_check_fill_8_bytes);
5481     align(16);
5482 
5483     BIND(L_fill_32_bytes_loop);
5484 
5485     for (int i = 0; i < 32; i += 4) {
5486       movl(Address(to, i), value);
5487     }
5488 
5489     addptr(to, 32);
5490     subl(count, 8 << shift);
5491     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5492     BIND(L_check_fill_8_bytes);
5493     addl(count, 8 << shift);
5494     jccb(Assembler::zero, L_exit);
5495     jmpb(L_fill_8_bytes);
5496 
5497     //
5498     // length is too short, just fill qwords
5499     //
5500     BIND(L_fill_8_bytes_loop);
5501     movl(Address(to, 0), value);
5502     movl(Address(to, 4), value);
5503     addptr(to, 8);
5504     BIND(L_fill_8_bytes);
5505     subl(count, 1 << (shift + 1));
5506     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5507     // fall through to fill 4 bytes
5508   } else {
5509     Label L_fill_32_bytes;
5510     if (!UseUnalignedLoadStores) {
5511       // align to 8 bytes, we know we are 4 byte aligned to start
5512       testptr(to, 4);
5513       jccb(Assembler::zero, L_fill_32_bytes);
5514       movl(Address(to, 0), value);
5515       addptr(to, 4);
5516       subl(count, 1<<shift);
5517     }
5518     BIND(L_fill_32_bytes);
5519     {
5520       assert( UseSSE >= 2, "supported cpu only" );
5521       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5522       movdl(xtmp, value);
5523       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5524         Label L_check_fill_32_bytes;
5525         if (UseAVX > 2) {
5526           // Fill 64-byte chunks
5527           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5528 
5529           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
5530           cmpl(count, VM_Version::avx3_threshold());
5531           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5532 
5533           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5534 
5535           subl(count, 16 << shift);
5536           jccb(Assembler::less, L_check_fill_32_bytes);
5537           align(16);
5538 
5539           BIND(L_fill_64_bytes_loop_avx3);
5540           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5541           addptr(to, 64);
5542           subl(count, 16 << shift);
5543           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5544           jmpb(L_check_fill_32_bytes);
5545 
5546           BIND(L_check_fill_64_bytes_avx2);
5547         }
5548         // Fill 64-byte chunks
5549         Label L_fill_64_bytes_loop;
5550         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5551 
5552         subl(count, 16 << shift);
5553         jcc(Assembler::less, L_check_fill_32_bytes);
5554         align(16);
5555 
5556         BIND(L_fill_64_bytes_loop);
5557         vmovdqu(Address(to, 0), xtmp);
5558         vmovdqu(Address(to, 32), xtmp);
5559         addptr(to, 64);
5560         subl(count, 16 << shift);
5561         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5562 
5563         BIND(L_check_fill_32_bytes);
5564         addl(count, 8 << shift);
5565         jccb(Assembler::less, L_check_fill_8_bytes);
5566         vmovdqu(Address(to, 0), xtmp);
5567         addptr(to, 32);
5568         subl(count, 8 << shift);
5569 
5570         BIND(L_check_fill_8_bytes);
5571         // clean upper bits of YMM registers
5572         movdl(xtmp, value);
5573         pshufd(xtmp, xtmp, 0);
5574       } else {
5575         // Fill 32-byte chunks
5576         pshufd(xtmp, xtmp, 0);
5577 
5578         subl(count, 8 << shift);
5579         jcc(Assembler::less, L_check_fill_8_bytes);
5580         align(16);
5581 
5582         BIND(L_fill_32_bytes_loop);
5583 
5584         if (UseUnalignedLoadStores) {
5585           movdqu(Address(to, 0), xtmp);
5586           movdqu(Address(to, 16), xtmp);
5587         } else {
5588           movq(Address(to, 0), xtmp);
5589           movq(Address(to, 8), xtmp);
5590           movq(Address(to, 16), xtmp);
5591           movq(Address(to, 24), xtmp);
5592         }
5593 
5594         addptr(to, 32);
5595         subl(count, 8 << shift);
5596         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5597 
5598         BIND(L_check_fill_8_bytes);
5599       }
5600       addl(count, 8 << shift);
5601       jccb(Assembler::zero, L_exit);
5602       jmpb(L_fill_8_bytes);
5603 
5604       //
5605       // length is too short, just fill qwords
5606       //
5607       BIND(L_fill_8_bytes_loop);
5608       movq(Address(to, 0), xtmp);
5609       addptr(to, 8);
5610       BIND(L_fill_8_bytes);
5611       subl(count, 1 << (shift + 1));
5612       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5613     }
5614   }
5615   // fill trailing 4 bytes
5616   BIND(L_fill_4_bytes);
5617   testl(count, 1<<shift);
5618   jccb(Assembler::zero, L_fill_2_bytes);
5619   movl(Address(to, 0), value);
5620   if (t == T_BYTE || t == T_SHORT) {
5621     Label L_fill_byte;
5622     addptr(to, 4);
5623     BIND(L_fill_2_bytes);
5624     // fill trailing 2 bytes
5625     testl(count, 1<<(shift-1));
5626     jccb(Assembler::zero, L_fill_byte);
5627     movw(Address(to, 0), value);
5628     if (t == T_BYTE) {
5629       addptr(to, 2);
5630       BIND(L_fill_byte);
5631       // fill trailing byte
5632       testl(count, 1);
5633       jccb(Assembler::zero, L_exit);
5634       movb(Address(to, 0), value);
5635     } else {
5636       BIND(L_fill_byte);
5637     }
5638   } else {
5639     BIND(L_fill_2_bytes);
5640   }
5641   BIND(L_exit);
5642 }
5643 
5644 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
5645   switch(type) {
5646     case T_BYTE:
5647     case T_BOOLEAN:
5648       evpbroadcastb(dst, src, vector_len);
5649       break;
5650     case T_SHORT:
5651     case T_CHAR:
5652       evpbroadcastw(dst, src, vector_len);
5653       break;
5654     case T_INT:
5655     case T_FLOAT:
5656       evpbroadcastd(dst, src, vector_len);
5657       break;
5658     case T_LONG:
5659     case T_DOUBLE:
5660       evpbroadcastq(dst, src, vector_len);
5661       break;
5662     default:
5663       fatal("Unhandled type : %s", type2name(type));
5664       break;
5665   }
5666 }
5667 
5668 // encode char[] to byte[] in ISO_8859_1 or ASCII
5669 // @IntrinsicCandidate
5670 // private static int implEncodeISOArray(byte[] sa, int sp,
5671 //                                       byte[] da, int dp, int len) {
5672 //   int i = 0;
5673 //   for (; i < len; i++) {
5674 //     char c = StringUTF16.getChar(sa, sp++);
5675 //     if (c > '\u00FF')
5676 //       break;
5677 //     da[dp++] = (byte)c;
5678 //   }
5679 //   return i;
5680 // }
5681 //
5682 // @IntrinsicCandidate
5683 // private static int implEncodeAsciiArray(char[] sa, int sp,
5684 //                                         byte[] da, int dp, int len) {
5685 //   int i = 0;
5686 //   for (; i < len; i++) {
5687 //     char c = sa[sp++];
5688 //     if (c >= '\u0080')
5689 //       break;
5690 //     da[dp++] = (byte)c;
5691 //   }
5692 //   return i;
5693 // }
5694 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5695   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5696   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5697   Register tmp5, Register result, bool ascii) {
5698 
5699   // rsi: src
5700   // rdi: dst
5701   // rdx: len
5702   // rcx: tmp5
5703   // rax: result
5704   ShortBranchVerifier sbv(this);
5705   assert_different_registers(src, dst, len, tmp5, result);
5706   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5707 
5708   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5709   int short_mask = ascii ? 0xff80 : 0xff00;
5710 
5711   // set result
5712   xorl(result, result);
5713   // check for zero length
5714   testl(len, len);
5715   jcc(Assembler::zero, L_done);
5716 
5717   movl(result, len);
5718 
5719   // Setup pointers
5720   lea(src, Address(src, len, Address::times_2)); // char[]
5721   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5722   negptr(len);
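       // src and dst now point just past their arrays and len is negative, so the
       // loops below address elements as Address(src, len, ...) and count len up to zero.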
5723 
5724   if (UseSSE42Intrinsics || UseAVX >= 2) {
5725     Label L_copy_8_chars, L_copy_8_chars_exit;
5726     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5727 
5728     if (UseAVX >= 2) {
5729       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5730       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5731       movdl(tmp1Reg, tmp5);
5732       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5733       jmp(L_chars_32_check);
5734 
5735       bind(L_copy_32_chars);
5736       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5737       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5738       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5739       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5740       jccb(Assembler::notZero, L_copy_32_chars_exit);
5741       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5742       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5743       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5744 
5745       bind(L_chars_32_check);
5746       addptr(len, 32);
5747       jcc(Assembler::lessEqual, L_copy_32_chars);
5748 
5749       bind(L_copy_32_chars_exit);
5750       subptr(len, 16);
5751       jccb(Assembler::greater, L_copy_16_chars_exit);
5752 
5753     } else if (UseSSE42Intrinsics) {
5754       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5755       movdl(tmp1Reg, tmp5);
5756       pshufd(tmp1Reg, tmp1Reg, 0);
5757       jmpb(L_chars_16_check);
5758     }
5759 
5760     bind(L_copy_16_chars);
5761     if (UseAVX >= 2) {
5762       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5763       vptest(tmp2Reg, tmp1Reg);
5764       jcc(Assembler::notZero, L_copy_16_chars_exit);
5765       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5766       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5767     } else {
5768       if (UseAVX > 0) {
5769         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5770         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5771         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5772       } else {
5773         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5774         por(tmp2Reg, tmp3Reg);
5775         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5776         por(tmp2Reg, tmp4Reg);
5777       }
5778       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5779       jccb(Assembler::notZero, L_copy_16_chars_exit);
5780       packuswb(tmp3Reg, tmp4Reg);
5781     }
5782     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5783 
5784     bind(L_chars_16_check);
5785     addptr(len, 16);
5786     jcc(Assembler::lessEqual, L_copy_16_chars);
5787 
5788     bind(L_copy_16_chars_exit);
5789     if (UseAVX >= 2) {
5790       // clean upper bits of YMM registers
5791       vpxor(tmp2Reg, tmp2Reg);
5792       vpxor(tmp3Reg, tmp3Reg);
5793       vpxor(tmp4Reg, tmp4Reg);
5794       movdl(tmp1Reg, tmp5);
5795       pshufd(tmp1Reg, tmp1Reg, 0);
5796     }
5797     subptr(len, 8);
5798     jccb(Assembler::greater, L_copy_8_chars_exit);
5799 
5800     bind(L_copy_8_chars);
5801     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5802     ptest(tmp3Reg, tmp1Reg);
5803     jccb(Assembler::notZero, L_copy_8_chars_exit);
5804     packuswb(tmp3Reg, tmp1Reg);
5805     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5806     addptr(len, 8);
5807     jccb(Assembler::lessEqual, L_copy_8_chars);
5808 
5809     bind(L_copy_8_chars_exit);
5810     subptr(len, 8);
5811     jccb(Assembler::zero, L_done);
5812   }
5813 
5814   bind(L_copy_1_char);
5815   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5816   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
5817   jccb(Assembler::notZero, L_copy_1_char_exit);
5818   movb(Address(dst, len, Address::times_1, 0), tmp5);
5819   addptr(len, 1);
5820   jccb(Assembler::less, L_copy_1_char);
5821 
5822   bind(L_copy_1_char_exit);
5823   addptr(result, len); // len is negative count of not processed elements
5824 
5825   bind(L_done);
5826 }
5827 
5828 #ifdef _LP64
5829 /**
5830  * Helper for multiply_to_len().
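      * Computes dest_hi:dest_lo += src1 + src2 as an unsigned 128-bit accumulate.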
5831  */
5832 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5833   addq(dest_lo, src1);
5834   adcq(dest_hi, 0);
5835   addq(dest_lo, src2);
5836   adcq(dest_hi, 0);
5837 }
5838 
5839 /**
5840  * Multiply 64 bit by 64 bit first loop.
5841  */
5842 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5843                                            Register y, Register y_idx, Register z,
5844                                            Register carry, Register product,
5845                                            Register idx, Register kdx) {
5846   //
5847   //  jlong carry, x[], y[], z[];
5848   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5849   //    huge_128 product = y[idx] * x[xstart] + carry;
5850   //    z[kdx] = (jlong)product;
5851   //    carry  = (jlong)(product >>> 64);
5852   //  }
5853   //  z[xstart] = carry;
5854   //
5855 
5856   Label L_first_loop, L_first_loop_exit;
5857   Label L_one_x, L_one_y, L_multiply;
5858 
5859   decrementl(xstart);
5860   jcc(Assembler::negative, L_one_x);
5861 
5862   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5863   rorq(x_xstart, 32); // convert big-endian to little-endian
5864 
5865   bind(L_first_loop);
5866   decrementl(idx);
5867   jcc(Assembler::negative, L_first_loop_exit);
5868   decrementl(idx);
5869   jcc(Assembler::negative, L_one_y);
5870   movq(y_idx, Address(y, idx, Address::times_4,  0));
5871   rorq(y_idx, 32); // convert big-endian to little-endian
5872   bind(L_multiply);
5873   movq(product, x_xstart);
5874   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5875   addq(product, carry);
5876   adcq(rdx, 0);
5877   subl(kdx, 2);
5878   movl(Address(z, kdx, Address::times_4,  4), product);
5879   shrq(product, 32);
5880   movl(Address(z, kdx, Address::times_4,  0), product);
5881   movq(carry, rdx);
5882   jmp(L_first_loop);
5883 
5884   bind(L_one_y);
5885   movl(y_idx, Address(y,  0));
5886   jmp(L_multiply);
5887 
5888   bind(L_one_x);
5889   movl(x_xstart, Address(x,  0));
5890   jmp(L_first_loop);
5891 
5892   bind(L_first_loop_exit);
5893 }
5894 
5895 /**
5896  * Multiply 64 bit by 64 bit and add 128 bit.
5897  */
5898 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5899                                             Register yz_idx, Register idx,
5900                                             Register carry, Register product, int offset) {
5901   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5902   //     z[kdx] = (jlong)product;
5903 
5904   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5905   rorq(yz_idx, 32); // convert big-endian to little-endian
5906   movq(product, x_xstart);
5907   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5908   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5909   rorq(yz_idx, 32); // convert big-endian to little-endian
5910 
5911   add2_with_carry(rdx, product, carry, yz_idx);
5912 
5913   movl(Address(z, idx, Address::times_4,  offset+4), product);
5914   shrq(product, 32);
5915   movl(Address(z, idx, Address::times_4,  offset), product);
5916 
5917 }
5918 
5919 /**
5920  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5921  */
5922 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5923                                              Register yz_idx, Register idx, Register jdx,
5924                                              Register carry, Register product,
5925                                              Register carry2) {
5926   //   jlong carry, x[], y[], z[];
5927   //   int kdx = ystart+1;
5928   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5929   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5930   //     z[kdx+idx+1] = (jlong)product;
5931   //     jlong carry2  = (jlong)(product >>> 64);
5932   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5933   //     z[kdx+idx] = (jlong)product;
5934   //     carry  = (jlong)(product >>> 64);
5935   //   }
5936   //   idx += 2;
5937   //   if (idx > 0) {
5938   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5939   //     z[kdx+idx] = (jlong)product;
5940   //     carry  = (jlong)(product >>> 64);
5941   //   }
5942   //
5943 
5944   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5945 
5946   movl(jdx, idx);
5947   andl(jdx, 0xFFFFFFFC);
5948   shrl(jdx, 2);
5949 
5950   bind(L_third_loop);
5951   subl(jdx, 1);
5952   jcc(Assembler::negative, L_third_loop_exit);
5953   subl(idx, 4);
5954 
5955   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5956   movq(carry2, rdx);
5957 
5958   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5959   movq(carry, rdx);
5960   jmp(L_third_loop);
5961 
5962   bind (L_third_loop_exit);
5963 
5964   andl (idx, 0x3);
5965   jcc(Assembler::zero, L_post_third_loop_done);
5966 
5967   Label L_check_1;
5968   subl(idx, 2);
5969   jcc(Assembler::negative, L_check_1);
5970 
5971   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5972   movq(carry, rdx);
5973 
5974   bind (L_check_1);
5975   addl (idx, 0x2);
5976   andl (idx, 0x1);
5977   subl(idx, 1);
5978   jcc(Assembler::negative, L_post_third_loop_done);
5979 
5980   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5981   movq(product, x_xstart);
5982   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5983   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5984 
5985   add2_with_carry(rdx, product, yz_idx, carry);
5986 
5987   movl(Address(z, idx, Address::times_4,  0), product);
5988   shrq(product, 32);
5989 
5990   shlq(rdx, 32);
5991   orq(product, rdx);
5992   movq(carry, product);
5993 
5994   bind(L_post_third_loop_done);
5995 }
5996 
5997 /**
5998  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5999  *
6000  */
6001 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6002                                                   Register carry, Register carry2,
6003                                                   Register idx, Register jdx,
6004                                                   Register yz_idx1, Register yz_idx2,
6005                                                   Register tmp, Register tmp3, Register tmp4) {
6006   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6007 
6008   //   jlong carry, x[], y[], z[];
6009   //   int kdx = ystart+1;
6010   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6011   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6012   //     jlong carry2  = (jlong)(tmp3 >>> 64);
6013   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6014   //     carry  = (jlong)(tmp4 >>> 64);
6015   //     z[kdx+idx+1] = (jlong)tmp3;
6016   //     z[kdx+idx] = (jlong)tmp4;
6017   //   }
6018   //   idx += 2;
6019   //   if (idx > 0) {
6020   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6021   //     z[kdx+idx] = (jlong)yz_idx1;
6022   //     carry  = (jlong)(yz_idx1 >>> 64);
6023   //   }
6024   //
6025 
6026   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6027 
6028   movl(jdx, idx);
6029   andl(jdx, 0xFFFFFFFC);
6030   shrl(jdx, 2);
6031 
6032   bind(L_third_loop);
6033   subl(jdx, 1);
6034   jcc(Assembler::negative, L_third_loop_exit);
6035   subl(idx, 4);
6036 
6037   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6038   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6039   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6040   rorxq(yz_idx2, yz_idx2, 32);
6041 
6042   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6043   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6044 
6045   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6046   rorxq(yz_idx1, yz_idx1, 32);
6047   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6048   rorxq(yz_idx2, yz_idx2, 32);
6049 
6050   if (VM_Version::supports_adx()) {
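    // Note: adcx reads/writes only CF and adox only OF, so the two carry
    // chains below run independently without having to spill flags.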
6051     adcxq(tmp3, carry);
6052     adoxq(tmp3, yz_idx1);
6053 
6054     adcxq(tmp4, tmp);
6055     adoxq(tmp4, yz_idx2);
6056 
6057     movl(carry, 0); // does not affect flags
6058     adcxq(carry2, carry);
6059     adoxq(carry2, carry);
6060   } else {
6061     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6062     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6063   }
6064   movq(carry, carry2);
6065 
6066   movl(Address(z, idx, Address::times_4, 12), tmp3);
6067   shrq(tmp3, 32);
6068   movl(Address(z, idx, Address::times_4,  8), tmp3);
6069 
6070   movl(Address(z, idx, Address::times_4,  4), tmp4);
6071   shrq(tmp4, 32);
6072   movl(Address(z, idx, Address::times_4,  0), tmp4);
6073 
6074   jmp(L_third_loop);
6075 
6076   bind (L_third_loop_exit);
6077 
6078   andl (idx, 0x3);
6079   jcc(Assembler::zero, L_post_third_loop_done);
6080 
6081   Label L_check_1;
6082   subl(idx, 2);
6083   jcc(Assembler::negative, L_check_1);
6084 
6085   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6086   rorxq(yz_idx1, yz_idx1, 32);
6087   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6088   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6089   rorxq(yz_idx2, yz_idx2, 32);
6090 
6091   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6092 
6093   movl(Address(z, idx, Address::times_4,  4), tmp3);
6094   shrq(tmp3, 32);
6095   movl(Address(z, idx, Address::times_4,  0), tmp3);
6096   movq(carry, tmp4);
6097 
6098   bind (L_check_1);
6099   addl (idx, 0x2);
6100   andl (idx, 0x1);
6101   subl(idx, 1);
6102   jcc(Assembler::negative, L_post_third_loop_done);
6103   movl(tmp4, Address(y, idx, Address::times_4,  0));
6104   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6105   movl(tmp4, Address(z, idx, Address::times_4,  0));
6106 
6107   add2_with_carry(carry2, tmp3, tmp4, carry);
6108 
6109   movl(Address(z, idx, Address::times_4,  0), tmp3);
6110   shrq(tmp3, 32);
6111 
6112   shlq(carry2, 32);
6113   orq(tmp3, carry2);
6114   movq(carry, tmp3);
6115 
6116   bind(L_post_third_loop_done);
6117 }
6118 
6119 /**
6120  * Code for BigInteger::multiplyToLen() intrinsic.
6121  *
6122  * rdi: x
6123  * rax: xlen
6124  * rsi: y
6125  * rcx: ylen
6126  * r8:  z
6127  * r11: zlen
6128  * r12: tmp1
6129  * r13: tmp2
6130  * r14: tmp3
6131  * r15: tmp4
6132  * rbx: tmp5
6133  *
6134  */
6135 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6136                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6137   ShortBranchVerifier sbv(this);
6138   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6139 
6140   push(tmp1);
6141   push(tmp2);
6142   push(tmp3);
6143   push(tmp4);
6144   push(tmp5);
6145 
6146   push(xlen);
6147   push(zlen);
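  // xlen and zlen are saved to the stack because their registers are reused
  // below as 'product' and 'x_xstart'.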
6148 
6149   const Register idx = tmp1;
6150   const Register kdx = tmp2;
6151   const Register xstart = tmp3;
6152 
6153   const Register y_idx = tmp4;
6154   const Register carry = tmp5;
6155   const Register product  = xlen;
6156   const Register x_xstart = zlen;  // reuse register
6157 
6158   // First Loop.
6159   //
6160   //  final static long LONG_MASK = 0xffffffffL;
6161   //  int xstart = xlen - 1;
6162   //  int ystart = ylen - 1;
6163   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6165   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6166   //    z[kdx] = (int)product;
6167   //    carry = product >>> 32;
6168   //  }
6169   //  z[xstart] = (int)carry;
6170   //
6171 
6172   movl(idx, ylen);      // idx = ylen;
6173   movl(kdx, zlen);      // kdx = xlen+ylen;
6174   xorq(carry, carry);   // carry = 0;
6175 
6176   Label L_done;
6177 
6178   movl(xstart, xlen);
6179   decrementl(xstart);
6180   jcc(Assembler::negative, L_done);
6181 
6182   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6183 
6184   Label L_second_loop;
6185   testl(kdx, kdx);
6186   jcc(Assembler::zero, L_second_loop);
6187 
6188   Label L_carry;
6189   subl(kdx, 1);
6190   jcc(Assembler::zero, L_carry);
6191 
6192   movl(Address(z, kdx, Address::times_4,  0), carry);
6193   shrq(carry, 32);
6194   subl(kdx, 1);
6195 
6196   bind(L_carry);
6197   movl(Address(z, kdx, Address::times_4,  0), carry);
6198 
6199   // Second and third (nested) loops.
6200   //
6201   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6202   //   carry = 0;
6203   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6204   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6205   //                    (z[k] & LONG_MASK) + carry;
6206   //     z[k] = (int)product;
6207   //     carry = product >>> 32;
6208   //   }
6209   //   z[i] = (int)carry;
6210   // }
6211   //
6212   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6213 
6214   const Register jdx = tmp1;
6215 
6216   bind(L_second_loop);
6217   xorl(carry, carry);    // carry = 0;
6218   movl(jdx, ylen);       // j = ystart+1
6219 
6220   subl(xstart, 1);       // i = xstart-1;
6221   jcc(Assembler::negative, L_done);
6222 
6223   push (z);
6224 
6225   Label L_last_x;
6226   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6227   subl(xstart, 1);       // i = xstart-1;
6228   jcc(Assembler::negative, L_last_x);
6229 
6230   if (UseBMI2Instructions) {
6231     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6232     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6233   } else {
6234     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6235     rorq(x_xstart, 32);  // convert big-endian to little-endian
6236   }
6237 
6238   Label L_third_loop_prologue;
6239   bind(L_third_loop_prologue);
6240 
6241   push (x);
6242   push (xstart);
6243   push (ylen);
6244 
6245 
6246   if (UseBMI2Instructions) {
6247     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6248   } else { // !UseBMI2Instructions
6249     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6250   }
6251 
6252   pop(ylen);
6253   pop(xlen);
6254   pop(x);
6255   pop(z);
6256 
6257   movl(tmp3, xlen);
6258   addl(tmp3, 1);
6259   movl(Address(z, tmp3, Address::times_4,  0), carry);
6260   subl(tmp3, 1);
6261   jccb(Assembler::negative, L_done);
6262 
6263   shrq(carry, 32);
6264   movl(Address(z, tmp3, Address::times_4,  0), carry);
6265   jmp(L_second_loop);
6266 
6267   // Next infrequent code is moved outside loops.
6268   bind(L_last_x);
6269   if (UseBMI2Instructions) {
6270     movl(rdx, Address(x,  0));
6271   } else {
6272     movl(x_xstart, Address(x,  0));
6273   }
6274   jmp(L_third_loop_prologue);
6275 
6276   bind(L_done);
6277 
6278   pop(zlen);
6279   pop(xlen);
6280 
6281   pop(tmp5);
6282   pop(tmp4);
6283   pop(tmp3);
6284   pop(tmp2);
6285   pop(tmp1);
6286 }
6287 
6288 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6289   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
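  // Rough reference model of what this stub computes (a sketch, not the
  // authoritative contract, which is defined by the Java-side caller):
  //   elem_size = 1 << log2_array_indxscale;
  //   for (long i = 0; i < length; i++)
  //     if (obja[i] != objb[i]) return i;   // element index of first mismatch
  //   return -1;                            // ranges compare equal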
6290   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6291   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6292   Label VECTOR8_TAIL, VECTOR4_TAIL;
6293   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6294   Label SAME_TILL_END, DONE;
6295   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6296 
  // the scale (log2_array_indxscale) is in rcx for both the Win64 and Unix calling conventions
6298   ShortBranchVerifier sbv(this);
6299 
6300   shlq(length);
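  // the shift above converts the element count in 'length' into a byte count;
  // the shift amount (log2_array_indxscale) is in cl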
6301   xorq(result, result);
6302 
6303   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6304       VM_Version::supports_avx512vlbw()) {
6305     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6306 
6307     cmpq(length, 64);
6308     jcc(Assembler::less, VECTOR32_TAIL);
6309 
6310     movq(tmp1, length);
6311     andq(tmp1, 0x3F);      // tail count
6312     andq(length, ~(0x3F)); //vector count
6313 
6314     bind(VECTOR64_LOOP);
6315     // AVX512 code to compare 64 byte vectors.
6316     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
6317     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6318     kortestql(k7, k7);
6319     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6320     addq(result, 64);
6321     subq(length, 64);
6322     jccb(Assembler::notZero, VECTOR64_LOOP);
6323 
6325     testq(tmp1, tmp1);
6326     jcc(Assembler::zero, SAME_TILL_END);
6327 
6328     //bind(VECTOR64_TAIL);
6329     // AVX512 code to compare up to 63 byte vectors.
6330     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6331     shlxq(tmp2, tmp2, tmp1);
6332     notq(tmp2);
6333     kmovql(k3, tmp2);
6334 
6335     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6336     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6337 
6338     ktestql(k7, k3);
6339     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6340 
6341     bind(VECTOR64_NOT_EQUAL);
6342     kmovql(tmp1, k7);
6343     notq(tmp1);
6344     tzcntq(tmp1, tmp1);
6345     addq(result, tmp1);
6346     shrq(result);
6347     jmp(DONE);
6348     bind(VECTOR32_TAIL);
6349   }
6350 
6351   cmpq(length, 8);
6352   jcc(Assembler::equal, VECTOR8_LOOP);
6353   jcc(Assembler::less, VECTOR4_TAIL);
6354 
6355   if (UseAVX >= 2) {
6356     Label VECTOR16_TAIL, VECTOR32_LOOP;
6357 
6358     cmpq(length, 16);
6359     jcc(Assembler::equal, VECTOR16_LOOP);
6360     jcc(Assembler::less, VECTOR8_LOOP);
6361 
6362     cmpq(length, 32);
6363     jccb(Assembler::less, VECTOR16_TAIL);
6364 
6365     subq(length, 32);
6366     bind(VECTOR32_LOOP);
6367     vmovdqu(rymm0, Address(obja, result));
6368     vmovdqu(rymm1, Address(objb, result));
6369     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6370     vptest(rymm2, rymm2);
6371     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6372     addq(result, 32);
6373     subq(length, 32);
6374     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6375     addq(length, 32);
6376     jcc(Assembler::equal, SAME_TILL_END);
    // falling through if fewer than 32 bytes are left; close the branch here.
6378 
6379     bind(VECTOR16_TAIL);
6380     cmpq(length, 16);
6381     jccb(Assembler::less, VECTOR8_TAIL);
6382     bind(VECTOR16_LOOP);
6383     movdqu(rymm0, Address(obja, result));
6384     movdqu(rymm1, Address(objb, result));
6385     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6386     ptest(rymm2, rymm2);
6387     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6388     addq(result, 16);
6389     subq(length, 16);
6390     jcc(Assembler::equal, SAME_TILL_END);
6391     //falling through if less than 16 bytes left
6392   } else {//regular intrinsics
6393 
6394     cmpq(length, 16);
6395     jccb(Assembler::less, VECTOR8_TAIL);
6396 
6397     subq(length, 16);
6398     bind(VECTOR16_LOOP);
6399     movdqu(rymm0, Address(obja, result));
6400     movdqu(rymm1, Address(objb, result));
6401     pxor(rymm0, rymm1);
6402     ptest(rymm0, rymm0);
6403     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6404     addq(result, 16);
6405     subq(length, 16);
6406     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6407     addq(length, 16);
6408     jcc(Assembler::equal, SAME_TILL_END);
6409     //falling through if less than 16 bytes left
6410   }
6411 
6412   bind(VECTOR8_TAIL);
6413   cmpq(length, 8);
6414   jccb(Assembler::less, VECTOR4_TAIL);
6415   bind(VECTOR8_LOOP);
6416   movq(tmp1, Address(obja, result));
6417   movq(tmp2, Address(objb, result));
6418   xorq(tmp1, tmp2);
6419   testq(tmp1, tmp1);
6420   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6421   addq(result, 8);
6422   subq(length, 8);
6423   jcc(Assembler::equal, SAME_TILL_END);
6424   //falling through if less than 8 bytes left
6425 
6426   bind(VECTOR4_TAIL);
6427   cmpq(length, 4);
6428   jccb(Assembler::less, BYTES_TAIL);
6429   bind(VECTOR4_LOOP);
6430   movl(tmp1, Address(obja, result));
6431   xorl(tmp1, Address(objb, result));
6432   testl(tmp1, tmp1);
6433   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6434   addq(result, 4);
6435   subq(length, 4);
6436   jcc(Assembler::equal, SAME_TILL_END);
6437   //falling through if less than 4 bytes left
6438 
6439   bind(BYTES_TAIL);
6440   bind(BYTES_LOOP);
6441   load_unsigned_byte(tmp1, Address(obja, result));
6442   load_unsigned_byte(tmp2, Address(objb, result));
6443   xorl(tmp1, tmp2);
6444   testl(tmp1, tmp1);
6445   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6446   decq(length);
6447   jcc(Assembler::zero, SAME_TILL_END);
6448   incq(result);
6449   load_unsigned_byte(tmp1, Address(obja, result));
6450   load_unsigned_byte(tmp2, Address(objb, result));
6451   xorl(tmp1, tmp2);
6452   testl(tmp1, tmp1);
6453   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6454   decq(length);
6455   jcc(Assembler::zero, SAME_TILL_END);
6456   incq(result);
6457   load_unsigned_byte(tmp1, Address(obja, result));
6458   load_unsigned_byte(tmp2, Address(objb, result));
6459   xorl(tmp1, tmp2);
6460   testl(tmp1, tmp1);
6461   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6462   jmp(SAME_TILL_END);
6463 
6464   if (UseAVX >= 2) {
6465     bind(VECTOR32_NOT_EQUAL);
6466     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6467     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6468     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6469     vpmovmskb(tmp1, rymm0);
6470     bsfq(tmp1, tmp1);
6471     addq(result, tmp1);
6472     shrq(result);
6473     jmp(DONE);
6474   }
6475 
6476   bind(VECTOR16_NOT_EQUAL);
6477   if (UseAVX >= 2) {
6478     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6479     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6480     pxor(rymm0, rymm2);
6481   } else {
6482     pcmpeqb(rymm2, rymm2);
6483     pxor(rymm0, rymm1);
6484     pcmpeqb(rymm0, rymm1);
6485     pxor(rymm0, rymm2);
6486   }
6487   pmovmskb(tmp1, rymm0);
6488   bsfq(tmp1, tmp1);
6489   addq(result, tmp1);
6490   shrq(result);
6491   jmpb(DONE);
6492 
6493   bind(VECTOR8_NOT_EQUAL);
6494   bind(VECTOR4_NOT_EQUAL);
6495   bsfq(tmp1, tmp1);
6496   shrq(tmp1, 3);
6497   addq(result, tmp1);
6498   bind(BYTES_NOT_EQUAL);
6499   shrq(result);
6500   jmpb(DONE);
6501 
6502   bind(SAME_TILL_END);
6503   mov64(result, -1);
6504 
6505   bind(DONE);
6506 }
6507 
6508 //Helper functions for square_to_len()
6509 
6510 /**
6511  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6512  * Preserves x and z and modifies rest of the registers.
6513  */
6514 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6515   // Perform square and right shift by 1
6516   // Handle odd xlen case first, then for even xlen do the following
6517   // jlong carry = 0;
6518   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6519   //     huge_128 product = x[j:j+1] * x[j:j+1];
6520   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6521   //     z[i+2:i+3] = (jlong)(product >>> 1);
6522   //     carry = (jlong)product;
6523   // }
6524 
6525   xorq(tmp5, tmp5);     // carry
6526   xorq(rdxReg, rdxReg);
6527   xorl(tmp1, tmp1);     // index for x
6528   xorl(tmp4, tmp4);     // index for z
6529 
6530   Label L_first_loop, L_first_loop_exit;
6531 
6532   testl(xlen, 1);
6533   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6534 
6535   // Square and right shift by 1 the odd element using 32 bit multiply
6536   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6537   imulq(raxReg, raxReg);
6538   shrq(raxReg, 1);
6539   adcq(tmp5, 0);
6540   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6541   incrementl(tmp1);
6542   addl(tmp4, 2);
6543 
6544   // Square and  right shift by 1 the rest using 64 bit multiply
6545   bind(L_first_loop);
6546   cmpptr(tmp1, xlen);
6547   jccb(Assembler::equal, L_first_loop_exit);
6548 
6549   // Square
6550   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6551   rorq(raxReg, 32);    // convert big-endian to little-endian
6552   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6553 
6554   // Right shift by 1 and save carry
6555   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6556   rcrq(rdxReg, 1);
6557   rcrq(raxReg, 1);
6558   adcq(tmp5, 0);
6559 
6560   // Store result in z
6561   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6562   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6563 
6564   // Update indices for x and z
6565   addl(tmp1, 2);
6566   addl(tmp4, 4);
6567   jmp(L_first_loop);
6568 
6569   bind(L_first_loop_exit);
6570 }
6571 
6572 
6573 /**
6574  * Perform the following multiply add operation using BMI2 instructions
6575  * carry:sum = sum + op1*op2 + carry
6576  * op2 should be in rdx
6577  * op2 is preserved, all other registers are modified
6578  */
6579 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6580   // assert op2 is rdx
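  // mulxq places the full 128-bit product of op1 and rdx into tmp2:op1 without
  // modifying the arithmetic flags.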
6581   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6582   addq(sum, carry);
6583   adcq(tmp2, 0);
6584   addq(sum, op1);
6585   adcq(tmp2, 0);
6586   movq(carry, tmp2);
6587 }
6588 
6589 /**
6590  * Perform the following multiply add operation:
6591  * carry:sum = sum + op1*op2 + carry
6592  * Preserves op1, op2 and modifies rest of registers
6593  */
6594 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6595   // rdx:rax = op1 * op2
6596   movq(raxReg, op2);
6597   mulq(op1);
6598 
6599   //  rdx:rax = sum + carry + rdx:rax
6600   addq(sum, carry);
6601   adcq(rdxReg, 0);
6602   addq(sum, raxReg);
6603   adcq(rdxReg, 0);
6604 
6605   // carry:sum = rdx:sum
6606   movq(carry, rdxReg);
6607 }
6608 
6609 /**
6610  * Add 64 bit long carry into z[] with carry propagation.
6611  * Preserves z and carry register values and modifies rest of registers.
6612  *
6613  */
6614 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6615   Label L_fourth_loop, L_fourth_loop_exit;
6616 
6617   movl(tmp1, 1);
6618   subl(zlen, 2);
6619   addq(Address(z, zlen, Address::times_4, 0), carry);
6620 
6621   bind(L_fourth_loop);
6622   jccb(Assembler::carryClear, L_fourth_loop_exit);
6623   subl(zlen, 2);
6624   jccb(Assembler::negative, L_fourth_loop_exit);
6625   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6626   jmp(L_fourth_loop);
6627   bind(L_fourth_loop_exit);
6628 }
6629 
6630 /**
6631  * Shift z[] left by 1 bit.
6632  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6633  *
6634  */
6635 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6636 
6637   Label L_fifth_loop, L_fifth_loop_exit;
6638 
6639   // Fifth loop
6640   // Perform primitiveLeftShift(z, zlen, 1)
6641 
6642   const Register prev_carry = tmp1;
6643   const Register new_carry = tmp4;
6644   const Register value = tmp2;
6645   const Register zidx = tmp3;
6646 
6647   // int zidx, carry;
6648   // long value;
6649   // carry = 0;
6650   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6651   //    (carry:value)  = (z[i] << 1) | carry ;
6652   //    z[i] = value;
6653   // }
6654 
6655   movl(zidx, zlen);
6656   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6657 
6658   bind(L_fifth_loop);
6659   decl(zidx);  // Use decl to preserve carry flag
6660   decl(zidx);
6661   jccb(Assembler::negative, L_fifth_loop_exit);
6662 
6663   if (UseBMI2Instructions) {
6664      movq(value, Address(z, zidx, Address::times_4, 0));
6665      rclq(value, 1);
6666      rorxq(value, value, 32);
6667      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6668   }
6669   else {
6670     // clear new_carry
6671     xorl(new_carry, new_carry);
6672 
6673     // Shift z[i] by 1, or in previous carry and save new carry
6674     movq(value, Address(z, zidx, Address::times_4, 0));
6675     shlq(value, 1);
6676     adcl(new_carry, 0);
6677 
6678     orq(value, prev_carry);
6679     rorq(value, 0x20);
6680     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6681 
6682     // Set previous carry = new carry
6683     movl(prev_carry, new_carry);
6684   }
6685   jmp(L_fifth_loop);
6686 
6687   bind(L_fifth_loop_exit);
6688 }
6689 
6690 
6691 /**
6692  * Code for BigInteger::squareToLen() intrinsic
6693  *
6694  * rdi: x
6695  * rsi: len
6696  * r8:  z
6697  * rcx: zlen
6698  * r12: tmp1
6699  * r13: tmp2
6700  * r14: tmp3
6701  * r15: tmp4
6702  * rbx: tmp5
6703  *
6704  */
6705 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6706 
6707   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6708   push(tmp1);
6709   push(tmp2);
6710   push(tmp3);
6711   push(tmp4);
6712   push(tmp5);
6713 
6714   // First loop
6715   // Store the squares, right shifted one bit (i.e., divided by 2).
6716   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6717 
6718   // Add in off-diagonal sums.
6719   //
6720   // Second, third (nested) and fourth loops.
6721   // zlen +=2;
6722   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6723   //    carry = 0;
6724   //    long op2 = x[xidx:xidx+1];
6725   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6726   //       k -= 2;
6727   //       long op1 = x[j:j+1];
6728   //       long sum = z[k:k+1];
6729   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6730   //       z[k:k+1] = sum;
6731   //    }
6732   //    add_one_64(z, k, carry, tmp_regs);
6733   // }
6734 
6735   const Register carry = tmp5;
6736   const Register sum = tmp3;
6737   const Register op1 = tmp4;
6738   Register op2 = tmp2;
6739 
6740   push(zlen);
6741   push(len);
6742   addl(zlen,2);
6743   bind(L_second_loop);
6744   xorq(carry, carry);
6745   subl(zlen, 4);
6746   subl(len, 2);
6747   push(zlen);
6748   push(len);
6749   cmpl(len, 0);
6750   jccb(Assembler::lessEqual, L_second_loop_exit);
6751 
6752   // Multiply an array by one 64 bit long.
6753   if (UseBMI2Instructions) {
6754     op2 = rdxReg;
6755     movq(op2, Address(x, len, Address::times_4,  0));
6756     rorxq(op2, op2, 32);
6757   }
6758   else {
6759     movq(op2, Address(x, len, Address::times_4,  0));
6760     rorq(op2, 32);
6761   }
6762 
6763   bind(L_third_loop);
6764   decrementl(len);
6765   jccb(Assembler::negative, L_third_loop_exit);
6766   decrementl(len);
6767   jccb(Assembler::negative, L_last_x);
6768 
6769   movq(op1, Address(x, len, Address::times_4,  0));
6770   rorq(op1, 32);
6771 
6772   bind(L_multiply);
6773   subl(zlen, 2);
6774   movq(sum, Address(z, zlen, Address::times_4,  0));
6775 
6776   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
6777   if (UseBMI2Instructions) {
6778     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6779   }
6780   else {
6781     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6782   }
6783 
6784   movq(Address(z, zlen, Address::times_4, 0), sum);
6785 
6786   jmp(L_third_loop);
6787   bind(L_third_loop_exit);
6788 
6789   // Fourth loop
6790   // Add 64 bit long carry into z with carry propagation.
6791   // Uses offsetted zlen.
6792   add_one_64(z, zlen, carry, tmp1);
6793 
6794   pop(len);
6795   pop(zlen);
6796   jmp(L_second_loop);
6797 
6798   // Next infrequent code is moved outside loops.
6799   bind(L_last_x);
6800   movl(op1, Address(x, 0));
6801   jmp(L_multiply);
6802 
6803   bind(L_second_loop_exit);
6804   pop(len);
6805   pop(zlen);
6806   pop(len);
6807   pop(zlen);
6808 
6809   // Fifth loop
6810   // Shift z left 1 bit.
6811   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6812 
6813   // z[zlen-1] |= x[len-1] & 1;
6814   movl(tmp3, Address(x, len, Address::times_4, -4));
6815   andl(tmp3, 1);
6816   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6817 
6818   pop(tmp5);
6819   pop(tmp4);
6820   pop(tmp3);
6821   pop(tmp2);
6822   pop(tmp1);
6823 }
6824 
6825 /**
6826  * Helper function for mul_add()
6827  * Multiply the in[] by int k and add to out[] starting at offset offs using
6828  * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only the quad-int-aligned portion of in[] is processed by this function.
 * k is in rdxReg when BMI2 instructions are used, otherwise it is in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate indices into "in" and "out" respectively.
 * tmp5 holds the carry.
 * The other registers are temporaries and are modified.
6835  *
6836  */
6837 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6838   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6839   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6840 
6841   Label L_first_loop, L_first_loop_exit;
6842 
6843   movl(tmp1, len);
6844   shrl(tmp1, 2);
6845 
6846   bind(L_first_loop);
6847   subl(tmp1, 1);
6848   jccb(Assembler::negative, L_first_loop_exit);
6849 
6850   subl(len, 4);
6851   subl(offset, 4);
6852 
6853   Register op2 = tmp2;
6854   const Register sum = tmp3;
6855   const Register op1 = tmp4;
6856   const Register carry = tmp5;
6857 
6858   if (UseBMI2Instructions) {
6859     op2 = rdxReg;
6860   }
6861 
6862   movq(op1, Address(in, len, Address::times_4,  8));
6863   rorq(op1, 32);
6864   movq(sum, Address(out, offset, Address::times_4,  8));
6865   rorq(sum, 32);
6866   if (UseBMI2Instructions) {
6867     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6868   }
6869   else {
6870     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6871   }
6872   // Store back in big endian from little endian
6873   rorq(sum, 0x20);
6874   movq(Address(out, offset, Address::times_4,  8), sum);
6875 
6876   movq(op1, Address(in, len, Address::times_4,  0));
6877   rorq(op1, 32);
6878   movq(sum, Address(out, offset, Address::times_4,  0));
6879   rorq(sum, 32);
6880   if (UseBMI2Instructions) {
6881     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6882   }
6883   else {
6884     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6885   }
6886   // Store back in big endian from little endian
6887   rorq(sum, 0x20);
6888   movq(Address(out, offset, Address::times_4,  0), sum);
6889 
6890   jmp(L_first_loop);
6891   bind(L_first_loop_exit);
6892 }
6893 
6894 /**
6895  * Code for BigInteger::mulAdd() intrinsic
6896  *
6897  * rdi: out
6898  * rsi: in
6899  * r11: offs (out.length - offset)
6900  * rcx: len
6901  * r8:  k
6902  * r12: tmp1
6903  * r13: tmp2
6904  * r14: tmp3
6905  * r15: tmp4
6906  * rbx: tmp5
6907  * Multiply the in[] by word k and add to out[], return the carry in rax
6908  */
6909 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6910    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6911    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6912 
6913   Label L_carry, L_last_in, L_done;
6914 
6915 // carry = 0;
6916 // for (int j=len-1; j >= 0; j--) {
6917 //    long product = (in[j] & LONG_MASK) * kLong +
6918 //                   (out[offs] & LONG_MASK) + carry;
6919 //    out[offs--] = (int)product;
6920 //    carry = product >>> 32;
6921 // }
6922 //
6923   push(tmp1);
6924   push(tmp2);
6925   push(tmp3);
6926   push(tmp4);
6927   push(tmp5);
6928 
6929   Register op2 = tmp2;
6930   const Register sum = tmp3;
6931   const Register op1 = tmp4;
6932   const Register carry =  tmp5;
6933 
6934   if (UseBMI2Instructions) {
6935     op2 = rdxReg;
6936     movl(op2, k);
6937   }
6938   else {
6939     movl(op2, k);
6940   }
6941 
6942   xorq(carry, carry);
6943 
  // First loop

  // Multiply in[] by k in a 4-way unrolled loop using 128 bit by 32 bit multiply.
  // The carry is in tmp5.
6948   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6949 
  // Multiply the trailing in[] entry using 64 bit by 32 bit, if any.
6951   decrementl(len);
6952   jccb(Assembler::negative, L_carry);
6953   decrementl(len);
6954   jccb(Assembler::negative, L_last_in);
6955 
6956   movq(op1, Address(in, len, Address::times_4,  0));
6957   rorq(op1, 32);
6958 
6959   subl(offs, 2);
6960   movq(sum, Address(out, offs, Address::times_4,  0));
6961   rorq(sum, 32);
6962 
6963   if (UseBMI2Instructions) {
6964     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6965   }
6966   else {
6967     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6968   }
6969 
6970   // Store back in big endian from little endian
6971   rorq(sum, 0x20);
6972   movq(Address(out, offs, Address::times_4,  0), sum);
6973 
6974   testl(len, len);
6975   jccb(Assembler::zero, L_carry);
6976 
6977   //Multiply the last in[] entry, if any
6978   bind(L_last_in);
6979   movl(op1, Address(in, 0));
6980   movl(sum, Address(out, offs, Address::times_4,  -4));
6981 
6982   movl(raxReg, k);
6983   mull(op1); //tmp4 * eax -> edx:eax
6984   addl(sum, carry);
6985   adcl(rdxReg, 0);
6986   addl(sum, raxReg);
6987   adcl(rdxReg, 0);
6988   movl(carry, rdxReg);
6989 
6990   movl(Address(out, offs, Address::times_4,  -4), sum);
6991 
6992   bind(L_carry);
6993   //return tmp5/carry as carry in rax
6994   movl(rax, carry);
6995 
6996   bind(L_done);
6997   pop(tmp5);
6998   pop(tmp4);
6999   pop(tmp3);
7000   pop(tmp2);
7001   pop(tmp1);
7002 }
7003 #endif
7004 
7005 /**
7006  * Emits code to update CRC-32 with a byte value according to constants in table
7007  *
7008  * @param [in,out]crc   Register containing the crc.
7009  * @param [in]val       Register containing the byte to fold into the CRC.
7010  * @param [in]table     Register containing the table of crc constants.
7011  *
7012  * uint32_t crc;
7013  * val = crc_table[(val ^ crc) & 0xFF];
7014  * crc = val ^ (crc >> 8);
7015  *
7016  */
7017 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7018   xorl(val, crc);
7019   andl(val, 0xFF);
7020   shrl(crc, 8); // unsigned shift
7021   xorl(crc, Address(table, val, Address::times_4, 0));
7022 }
7023 
7024 /**
7025  * Fold 128-bit data chunk
7026  */
7027 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7028   if (UseAVX > 0) {
7029     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7030     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7031     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7032     pxor(xcrc, xtmp);
7033   } else {
7034     movdqa(xtmp, xcrc);
7035     pclmulhdq(xtmp, xK);   // [123:64]
7036     pclmulldq(xcrc, xK);   // [63:0]
7037     pxor(xcrc, xtmp);
7038     movdqu(xtmp, Address(buf, offset));
7039     pxor(xcrc, xtmp);
7040   }
7041 }
7042 
7043 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7044   if (UseAVX > 0) {
7045     vpclmulhdq(xtmp, xK, xcrc);
7046     vpclmulldq(xcrc, xK, xcrc);
7047     pxor(xcrc, xbuf);
7048     pxor(xcrc, xtmp);
7049   } else {
7050     movdqa(xtmp, xcrc);
7051     pclmulhdq(xtmp, xK);
7052     pclmulldq(xcrc, xK);
7053     pxor(xcrc, xbuf);
7054     pxor(xcrc, xtmp);
7055   }
7056 }
7057 
7058 /**
7059  * 8-bit folds to compute 32-bit CRC
7060  *
7061  * uint64_t xcrc;
7062  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7063  */
7064 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7065   movdl(tmp, xcrc);
7066   andl(tmp, 0xFF);
7067   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7068   psrldq(xcrc, 1); // unsigned shift one byte
7069   pxor(xcrc, xtmp);
7070 }
7071 
7072 /**
7073  * uint32_t crc;
7074  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7075  */
7076 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7077   movl(tmp, crc);
7078   andl(tmp, 0xFF);
7079   shrl(crc, 8);
7080   xorl(crc, Address(table, tmp, Address::times_4, 0));
7081 }
7082 
7083 /**
7084  * @param crc   register containing existing CRC (32-bit)
7085  * @param buf   register pointing to input byte buffer (byte*)
7086  * @param len   register containing number of bytes
7087  * @param table register that will contain address of CRC table
7088  * @param tmp   scratch register
7089  */
7090 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7091   assert_different_registers(crc, buf, len, table, tmp, rax);
7092 
7093   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7094   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7095 
7096   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7097   // context for the registers used, where all instructions below are using 128-bit mode
7098   // On EVEX without VL and BW, these instructions will all be AVX.
7099   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7100   notl(crc); // ~crc
7101   cmpl(len, 16);
7102   jcc(Assembler::less, L_tail);
7103 
7104   // Align buffer to 16 bytes
7105   movl(tmp, buf);
7106   andl(tmp, 0xF);
7107   jccb(Assembler::zero, L_aligned);
7108   subl(tmp,  16);
7109   addl(len, tmp);
7110 
7111   align(4);
7112   BIND(L_align_loop);
7113   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7114   update_byte_crc32(crc, rax, table);
7115   increment(buf);
7116   incrementl(tmp);
7117   jccb(Assembler::less, L_align_loop);
7118 
7119   BIND(L_aligned);
7120   movl(tmp, len); // save
7121   shrl(len, 4);
7122   jcc(Assembler::zero, L_tail_restore);
7123 
7124   // Fold crc into first bytes of vector
7125   movdqa(xmm1, Address(buf, 0));
7126   movdl(rax, xmm1);
7127   xorl(crc, rax);
7128   if (VM_Version::supports_sse4_1()) {
7129     pinsrd(xmm1, crc, 0);
7130   } else {
7131     pinsrw(xmm1, crc, 0);
7132     shrl(crc, 16);
7133     pinsrw(xmm1, crc, 1);
7134   }
7135   addptr(buf, 16);
7136   subl(len, 4); // len > 0
7137   jcc(Assembler::less, L_fold_tail);
7138 
7139   movdqa(xmm2, Address(buf,  0));
7140   movdqa(xmm3, Address(buf, 16));
7141   movdqa(xmm4, Address(buf, 32));
7142   addptr(buf, 48);
7143   subl(len, 3);
7144   jcc(Assembler::lessEqual, L_fold_512b);
7145 
7146   // Fold total 512 bits of polynomial on each iteration,
7147   // 128 bits per each of 4 parallel streams.
7148   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7149 
7150   align32();
7151   BIND(L_fold_512b_loop);
7152   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7153   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7154   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7155   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7156   addptr(buf, 64);
7157   subl(len, 4);
7158   jcc(Assembler::greater, L_fold_512b_loop);
7159 
7160   // Fold 512 bits to 128 bits.
7161   BIND(L_fold_512b);
7162   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7163   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7164   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7165   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7166 
7167   // Fold the rest of 128 bits data chunks
7168   BIND(L_fold_tail);
7169   addl(len, 3);
7170   jccb(Assembler::lessEqual, L_fold_128b);
7171   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7172 
7173   BIND(L_fold_tail_loop);
7174   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7175   addptr(buf, 16);
7176   decrementl(len);
7177   jccb(Assembler::greater, L_fold_tail_loop);
7178 
7179   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7180   BIND(L_fold_128b);
7181   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7182   if (UseAVX > 0) {
7183     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7184     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7185     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7186   } else {
7187     movdqa(xmm2, xmm0);
7188     pclmulqdq(xmm2, xmm1, 0x1);
7189     movdqa(xmm3, xmm0);
7190     pand(xmm3, xmm2);
7191     pclmulqdq(xmm0, xmm3, 0x1);
7192   }
7193   psrldq(xmm1, 8);
7194   psrldq(xmm2, 4);
7195   pxor(xmm0, xmm1);
7196   pxor(xmm0, xmm2);
7197 
7198   // 8 8-bit folds to compute 32-bit CRC.
7199   for (int j = 0; j < 4; j++) {
7200     fold_8bit_crc32(xmm0, table, xmm1, rax);
7201   }
7202   movdl(crc, xmm0); // mov 32 bits to general register
7203   for (int j = 0; j < 4; j++) {
7204     fold_8bit_crc32(crc, table, rax);
7205   }
7206 
7207   BIND(L_tail_restore);
7208   movl(len, tmp); // restore
7209   BIND(L_tail);
7210   andl(len, 0xf);
7211   jccb(Assembler::zero, L_exit);
7212 
7213   // Fold the rest of bytes
7214   align(4);
7215   BIND(L_tail_loop);
7216   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7217   update_byte_crc32(crc, rax, table);
7218   increment(buf);
7219   decrementl(len);
7220   jccb(Assembler::greater, L_tail_loop);
7221 
7222   BIND(L_exit);
  notl(crc); // ~crc
7224 }
7225 
7226 #ifdef _LP64
7227 // Helper function for AVX 512 CRC32
7228 // Fold 512-bit data chunks
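// One folding step, informally (the immediates 0x10 and 0x01 select which
// 64-bit halves of each 128-bit lane are carry-less multiplied):
//   xcrc = clmul(xcrc, xK, 0x10) ^ clmul(xcrc, xK, 0x01) ^ data[buf + pos + offset]
// applied lane-wise across the 512-bit registers.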
7229 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7230                                              Register pos, int offset) {
7231   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7232   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7233   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7234   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7235   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7236 }
7237 
7238 // Helper function for AVX 512 CRC32
7239 // Compute CRC32 for < 256B buffers
7240 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7241                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7242                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7243 
7244   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7245   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7246   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7247 
7248   // check if there is enough buffer to be able to fold 16B at a time
7249   cmpl(len, 32);
7250   jcc(Assembler::less, L_less_than_32);
7251 
7252   // if there is, load the constants
7253   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7254   movdl(xmm0, crc);                        // get the initial crc value
7255   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7256   pxor(xmm7, xmm0);
7257 
7258   // update the buffer pointer
7259   addl(pos, 16);
  // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7261   subl(len, 32);
7262   jmp(L_16B_reduction_loop);
7263 
7264   bind(L_less_than_32);
  // move the initial crc to the return value; this is necessary for zero-length buffers
7266   movl(rax, crc);
7267   testl(len, len);
7268   jcc(Assembler::equal, L_cleanup);
7269 
7270   movdl(xmm0, crc);                        //get the initial crc value
7271 
7272   cmpl(len, 16);
7273   jcc(Assembler::equal, L_exact_16_left);
7274   jcc(Assembler::less, L_less_than_16_left);
7275 
7276   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7277   pxor(xmm7, xmm0);                       //xor the initial crc value
7278   addl(pos, 16);
7279   subl(len, 16);
7280   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7281   jmp(L_get_last_two_xmms);
7282 
7283   bind(L_less_than_16_left);
  // use stack space to load data of less than 16 bytes; zero out the 16B in memory first
7285   pxor(xmm1, xmm1);
7286   movptr(tmp1, rsp);
7287   movdqu(Address(tmp1, 0 * 16), xmm1);
7288 
7289   cmpl(len, 4);
7290   jcc(Assembler::less, L_only_less_than_4);
7291 
7292   //backup the counter value
7293   movl(tmp2, len);
7294   cmpl(len, 8);
7295   jcc(Assembler::less, L_less_than_8_left);
7296 
7297   //load 8 Bytes
7298   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7299   movq(Address(tmp1, 0 * 16), rax);
7300   addptr(tmp1, 8);
7301   subl(len, 8);
7302   addl(pos, 8);
7303 
7304   bind(L_less_than_8_left);
7305   cmpl(len, 4);
7306   jcc(Assembler::less, L_less_than_4_left);
7307 
7308   //load 4 Bytes
7309   movl(rax, Address(buf, pos, Address::times_1, 0));
7310   movl(Address(tmp1, 0 * 16), rax);
7311   addptr(tmp1, 4);
7312   subl(len, 4);
7313   addl(pos, 4);
7314 
7315   bind(L_less_than_4_left);
7316   cmpl(len, 2);
7317   jcc(Assembler::less, L_less_than_2_left);
7318 
7319   // load 2 Bytes
7320   movw(rax, Address(buf, pos, Address::times_1, 0));
7321   movl(Address(tmp1, 0 * 16), rax);
7322   addptr(tmp1, 2);
7323   subl(len, 2);
7324   addl(pos, 2);
7325 
7326   bind(L_less_than_2_left);
7327   cmpl(len, 1);
7328   jcc(Assembler::less, L_zero_left);
7329 
7330   // load 1 Byte
7331   movb(rax, Address(buf, pos, Address::times_1, 0));
7332   movb(Address(tmp1, 0 * 16), rax);
7333 
7334   bind(L_zero_left);
7335   movdqu(xmm7, Address(rsp, 0));
7336   pxor(xmm7, xmm0);                       //xor the initial crc value
7337 
7338   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7339   movdqu(xmm0, Address(rax, tmp2));
7340   pshufb(xmm7, xmm0);
7341   jmp(L_128_done);
7342 
7343   bind(L_exact_16_left);
7344   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7345   pxor(xmm7, xmm0);                       //xor the initial crc value
7346   jmp(L_128_done);
7347 
7348   bind(L_only_less_than_4);
7349   cmpl(len, 3);
7350   jcc(Assembler::less, L_only_less_than_3);
7351 
7352   // load 3 Bytes
7353   movb(rax, Address(buf, pos, Address::times_1, 0));
7354   movb(Address(tmp1, 0), rax);
7355 
7356   movb(rax, Address(buf, pos, Address::times_1, 1));
7357   movb(Address(tmp1, 1), rax);
7358 
7359   movb(rax, Address(buf, pos, Address::times_1, 2));
7360   movb(Address(tmp1, 2), rax);
7361 
7362   movdqu(xmm7, Address(rsp, 0));
7363   pxor(xmm7, xmm0);                     //xor the initial crc value
7364 
7365   pslldq(xmm7, 0x5);
7366   jmp(L_barrett);
7367   bind(L_only_less_than_3);
7368   cmpl(len, 2);
7369   jcc(Assembler::less, L_only_less_than_2);
7370 
7371   // load 2 Bytes
7372   movb(rax, Address(buf, pos, Address::times_1, 0));
7373   movb(Address(tmp1, 0), rax);
7374 
7375   movb(rax, Address(buf, pos, Address::times_1, 1));
7376   movb(Address(tmp1, 1), rax);
7377 
7378   movdqu(xmm7, Address(rsp, 0));
7379   pxor(xmm7, xmm0);                     //xor the initial crc value
7380 
7381   pslldq(xmm7, 0x6);
7382   jmp(L_barrett);
7383 
7384   bind(L_only_less_than_2);
7385   //load 1 Byte
7386   movb(rax, Address(buf, pos, Address::times_1, 0));
7387   movb(Address(tmp1, 0), rax);
7388 
7389   movdqu(xmm7, Address(rsp, 0));
7390   pxor(xmm7, xmm0);                     //xor the initial crc value
7391 
7392   pslldq(xmm7, 0x7);
7393 }
7394 
7395 /**
7396 * Compute CRC32 using AVX512 instructions
7397 * param crc   register containing existing CRC (32-bit)
7398 * param buf   register pointing to input byte buffer (byte*)
7399 * param len   register containing number of bytes
7400 * param table address of crc or crc32c table
7401 * param tmp1  scratch register
7402 * param tmp2  scratch register
7403 * return rax  result register
7404 *
7405 * This routine is identical for crc32c with the exception of the precomputed constant
7406 * table which will be passed as the table argument.  The calculation steps are
7407 * the same for both variants.
7408 */
7409 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7410   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7411 
7412   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7413   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7414   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7415   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7416   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7417 
7418   const Register pos = r12;
7419   push(r12);
7420   subptr(rsp, 16 * 2 + 8);
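  // carve out a small scratch area on the stack (two 16-byte slots plus padding);
  // kernel_crc32_avx512_256B stages sub-16-byte tails here before the final reduction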
7421 
7422   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7423   // context for the registers used, where all instructions below are using 128-bit mode
7424   // On EVEX without VL and BW, these instructions will all be AVX.
7425   movl(pos, 0);
7426 
7427   // check if smaller than 256B
7428   cmpl(len, 256);
7429   jcc(Assembler::less, L_less_than_256);
7430 
7431   // load the initial crc value
7432   movdl(xmm10, crc);
7433 
7434   // receive the initial 64B data, xor the initial crc value
7435   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7436   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7437   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7438   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7439 
7440   subl(len, 256);
7441   cmpl(len, 256);
7442   jcc(Assembler::less, L_fold_128_B_loop);
7443 
7444   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7445   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7446   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7447   subl(len, 256);
7448 
7449   bind(L_fold_256_B_loop);
7450   addl(pos, 256);
7451   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7452   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7453   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7454   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7455 
7456   subl(len, 256);
7457   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7458 
7459   // Fold 256 into 128
7460   addl(pos, 256);
7461   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7462   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7463   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7464 
7465   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7466   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7467   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7468 
7469   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7470   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7471 
7472   addl(len, 128);
7473   jmp(L_fold_128_B_register);
7474 
  // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The
  // fold_128_B_loop will fold 128B at a time until 128 + y bytes of buffer remain.
7477 
  // fold 128B at a time. This section of the code folds 8 xmm registers in parallel
7479   bind(L_fold_128_B_loop);
7480   addl(pos, 128);
7481   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7482   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7483 
7484   subl(len, 128);
7485   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7486 
7487   addl(pos, 128);
7488 
  // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
7491   bind(L_fold_128_B_register);
7492   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7493   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7494   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7495   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7496   // save last that has no multiplicand
7497   vextracti64x2(xmm7, xmm4, 3);
7498 
7499   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7500   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7501   // Needed later in reduction loop
7502   movdqu(xmm10, Address(table, 1 * 16));
7503   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7504   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7505 
7506   // Swap 1,0,3,2 - 01 00 11 10
7507   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7508   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7509   vextracti128(xmm5, xmm8, 1);
7510   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7511 
7512   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7513   // instead of a cmp instruction, we use the negative flag with the jl instruction
7514   addl(len, 128 - 16);
7515   jcc(Assembler::less, L_final_reduction_for_128);
7516 
7517   bind(L_16B_reduction_loop);
7518   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7519   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7520   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7521   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7522   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7523   addl(pos, 16);
7524   subl(len, 16);
7525   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7526 
7527   bind(L_final_reduction_for_128);
7528   addl(len, 16);
7529   jcc(Assembler::equal, L_128_done);
7530 
7531   bind(L_get_last_two_xmms);
7532   movdqu(xmm2, xmm7);
7533   addl(pos, len);
7534   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7535   subl(pos, len);
7536 
7537   // get rid of the extra data that was loaded before
7538   // load the shift constant
7539   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7540   movdqu(xmm0, Address(rax, len));
7541   addl(rax, len);
7542 
7543   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7544   //Change mask to 512
7545   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7546   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7547 
7548   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7549   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7550   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7551   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7552   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7553 
7554   bind(L_128_done);
7555   // compute crc of a 128-bit value
7556   movdqu(xmm10, Address(table, 3 * 16));
7557   movdqu(xmm0, xmm7);
7558 
7559   // 64b fold
7560   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7561   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7562   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7563 
7564   // 32b fold
7565   movdqu(xmm0, xmm7);
7566   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7567   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7568   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7569   jmp(L_barrett);
7570 
7571   bind(L_less_than_256);
7572   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7573 
  // Barrett reduction
7575   bind(L_barrett);
7576   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7577   movdqu(xmm1, xmm7);
7578   movdqu(xmm2, xmm7);
7579   movdqu(xmm10, Address(table, 4 * 16));
7580 
7581   pclmulqdq(xmm7, xmm10, 0x0);
7582   pxor(xmm7, xmm2);
7583   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7584   movdqu(xmm2, xmm7);
7585   pclmulqdq(xmm7, xmm10, 0x10);
7586   pxor(xmm7, xmm2);
7587   pxor(xmm7, xmm1);
7588   pextrd(crc, xmm7, 2);
7589 
7590   bind(L_cleanup);
7591   addptr(rsp, 16 * 2 + 8);
7592   pop(r12);
7593 }
7594 
7595 // S. Gueron / Information Processing Letters 112 (2012) 184
7596 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7597 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7598 // Output: the 64-bit carry-less product of B * CONST
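// Compact restatement of the computation below (a sketch; TABLEExt[n][b] is the
// precomputed 64-bit carry-less product for byte value b at table level n, laid
// out as 256 8-byte entries per level at StubRoutines::crc32c_table_addr()):
//   Q1 = TABLEExt[n][ B        & 0xFF];
//   Q2 = TABLEExt[n][(B >>  8) & 0xFF];
//   Q3 = TABLEExt[n][(B >> 16) & 0xFF];
//   Q4 = TABLEExt[n][(B >> 24) & 0xFF];
//   result = Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);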
7599 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7600                                      Register tmp1, Register tmp2, Register tmp3) {
7601   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7602   if (n > 0) {
7603     addq(tmp3, n * 256 * 8);
7604   }
7605   //    Q1 = TABLEExt[n][B & 0xFF];
7606   movl(tmp1, in);
7607   andl(tmp1, 0x000000FF);
7608   shll(tmp1, 3);
7609   addq(tmp1, tmp3);
7610   movq(tmp1, Address(tmp1, 0));
7611 
7612   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7613   movl(tmp2, in);
7614   shrl(tmp2, 8);
7615   andl(tmp2, 0x000000FF);
7616   shll(tmp2, 3);
7617   addq(tmp2, tmp3);
7618   movq(tmp2, Address(tmp2, 0));
7619 
7620   shlq(tmp2, 8);
7621   xorq(tmp1, tmp2);
7622 
7623   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7624   movl(tmp2, in);
7625   shrl(tmp2, 16);
7626   andl(tmp2, 0x000000FF);
7627   shll(tmp2, 3);
7628   addq(tmp2, tmp3);
7629   movq(tmp2, Address(tmp2, 0));
7630 
7631   shlq(tmp2, 16);
7632   xorq(tmp1, tmp2);
7633 
7634   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7635   shrl(in, 24);
7636   andl(in, 0x000000FF);
7637   shll(in, 3);
7638   addq(in, tmp3);
7639   movq(in, Address(in, 0));
7640 
7641   shlq(in, 24);
7642   xorq(in, tmp1);
7643   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7644 }
7645 
7646 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7647                                       Register in_out,
7648                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7649                                       XMMRegister w_xtmp2,
7650                                       Register tmp1,
7651                                       Register n_tmp2, Register n_tmp3) {
7652   if (is_pclmulqdq_supported) {
7653     movdl(w_xtmp1, in_out); // modified blindly
7654 
7655     movl(tmp1, const_or_pre_comp_const_index);
7656     movdl(w_xtmp2, tmp1);
7657     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7658 
7659     movdq(in_out, w_xtmp1);
7660   } else {
7661     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7662   }
7663 }
7664 
7665 // Recombination Alternative 2: No bit-reflections
7666 // T1 = (CRC_A * U1) << 1
7667 // T2 = (CRC_B * U2) << 1
7668 // C1 = T1 >> 32
7669 // C2 = T2 >> 32
7670 // T1 = T1 & 0xFFFFFFFF
7671 // T2 = T2 & 0xFFFFFFFF
7672 // T1 = CRC32(0, T1)
7673 // T2 = CRC32(0, T2)
7674 // C1 = C1 ^ T1
7675 // C2 = C2 ^ T2
7676 // CRC = C1 ^ C2 ^ CRC_C
7677 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7678                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7679                                      Register tmp1, Register tmp2,
7680                                      Register n_tmp3) {
7681   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7682   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7683   shlq(in_out, 1);
7684   movl(tmp1, in_out);
7685   shrq(in_out, 32);
7686   xorl(tmp2, tmp2);
7687   crc32(tmp2, tmp1, 4);
7688   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7689   shlq(in1, 1);
7690   movl(tmp1, in1);
7691   shrq(in1, 32);
7692   xorl(tmp2, tmp2);
7693   crc32(tmp2, tmp1, 4);
7694   xorl(in1, tmp2);
7695   xorl(in_out, in1);
7696   xorl(in_out, in2);
7697 }
7698 
7699 // Set N to predefined value
7700 // Subtract from a length of a buffer
7701 // execute in a loop:
7702 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7703 // for i = 1 to N do
7704 //  CRC_A = CRC32(CRC_A, A[i])
7705 //  CRC_B = CRC32(CRC_B, B[i])
7706 //  CRC_C = CRC32(CRC_C, C[i])
7707 // end for
7708 // Recombine
7709 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7710                                        Register in_out1, Register in_out2, Register in_out3,
7711                                        Register tmp1, Register tmp2, Register tmp3,
7712                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7713                                        Register tmp4, Register tmp5,
7714                                        Register n_tmp6) {
7715   Label L_processPartitions;
7716   Label L_processPartition;
7717   Label L_exit;
7718 
7719   bind(L_processPartitions);
7720   cmpl(in_out1, 3 * size);
7721   jcc(Assembler::less, L_exit);
7722     xorl(tmp1, tmp1);
7723     xorl(tmp2, tmp2);
7724     movq(tmp3, in_out2);
7725     addq(tmp3, size);
7726 
7727     bind(L_processPartition);
7728       crc32(in_out3, Address(in_out2, 0), 8);
7729       crc32(tmp1, Address(in_out2, size), 8);
7730       crc32(tmp2, Address(in_out2, size * 2), 8);
7731       addq(in_out2, 8);
7732       cmpq(in_out2, tmp3);
7733       jcc(Assembler::less, L_processPartition);
7734     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7735             w_xtmp1, w_xtmp2, w_xtmp3,
7736             tmp4, tmp5,
7737             n_tmp6);
7738     addq(in_out2, 2 * size);
7739     subl(in_out1, 3 * size);
7740     jmp(L_processPartitions);
7741 
7742   bind(L_exit);
7743 }
7744 #else
7745 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7746                                      Register tmp1, Register tmp2, Register tmp3,
7747                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7748   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7749   if (n > 0) {
7750     addl(tmp3, n * 256 * 8);
7751   }
7752   //    Q1 = TABLEExt[n][B & 0xFF];
7753   movl(tmp1, in_out);
7754   andl(tmp1, 0x000000FF);
7755   shll(tmp1, 3);
7756   addl(tmp1, tmp3);
7757   movq(xtmp1, Address(tmp1, 0));
7758 
7759   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7760   movl(tmp2, in_out);
7761   shrl(tmp2, 8);
7762   andl(tmp2, 0x000000FF);
7763   shll(tmp2, 3);
7764   addl(tmp2, tmp3);
7765   movq(xtmp2, Address(tmp2, 0));
7766 
7767   psllq(xtmp2, 8);
7768   pxor(xtmp1, xtmp2);
7769 
7770   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7771   movl(tmp2, in_out);
7772   shrl(tmp2, 16);
7773   andl(tmp2, 0x000000FF);
7774   shll(tmp2, 3);
7775   addl(tmp2, tmp3);
7776   movq(xtmp2, Address(tmp2, 0));
7777 
7778   psllq(xtmp2, 16);
7779   pxor(xtmp1, xtmp2);
7780 
7781   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7782   shrl(in_out, 24);
7783   andl(in_out, 0x000000FF);
7784   shll(in_out, 3);
7785   addl(in_out, tmp3);
7786   movq(xtmp2, Address(in_out, 0));
7787 
7788   psllq(xtmp2, 24);
7789   pxor(xtmp1, xtmp2); // Result in CXMM
7790   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7791 }
7792 
7793 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7794                                       Register in_out,
7795                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7796                                       XMMRegister w_xtmp2,
7797                                       Register tmp1,
7798                                       Register n_tmp2, Register n_tmp3) {
7799   if (is_pclmulqdq_supported) {
7800     movdl(w_xtmp1, in_out);
7801 
7802     movl(tmp1, const_or_pre_comp_const_index);
7803     movdl(w_xtmp2, tmp1);
7804     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7805     // Keep result in XMM since GPR is 32 bit in length
7806   } else {
7807     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7808   }
7809 }
7810 
7811 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7812                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7813                                      Register tmp1, Register tmp2,
7814                                      Register n_tmp3) {
7815   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7816   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7817 
7818   psllq(w_xtmp1, 1);
7819   movdl(tmp1, w_xtmp1);
7820   psrlq(w_xtmp1, 32);
7821   movdl(in_out, w_xtmp1);
7822 
7823   xorl(tmp2, tmp2);
7824   crc32(tmp2, tmp1, 4);
7825   xorl(in_out, tmp2);
7826 
7827   psllq(w_xtmp2, 1);
7828   movdl(tmp1, w_xtmp2);
7829   psrlq(w_xtmp2, 32);
7830   movdl(in1, w_xtmp2);
7831 
7832   xorl(tmp2, tmp2);
7833   crc32(tmp2, tmp1, 4);
7834   xorl(in1, tmp2);
7835   xorl(in_out, in1);
7836   xorl(in_out, in2);
7837 }
7838 
7839 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7840                                        Register in_out1, Register in_out2, Register in_out3,
7841                                        Register tmp1, Register tmp2, Register tmp3,
7842                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7843                                        Register tmp4, Register tmp5,
7844                                        Register n_tmp6) {
7845   Label L_processPartitions;
7846   Label L_processPartition;
7847   Label L_exit;
7848 
7849   bind(L_processPartitions);
7850   cmpl(in_out1, 3 * size);
7851   jcc(Assembler::less, L_exit);
7852     xorl(tmp1, tmp1);
7853     xorl(tmp2, tmp2);
7854     movl(tmp3, in_out2);
7855     addl(tmp3, size);
7856 
7857     bind(L_processPartition);
7858       crc32(in_out3, Address(in_out2, 0), 4);
7859       crc32(tmp1, Address(in_out2, size), 4);
7860       crc32(tmp2, Address(in_out2, size*2), 4);
7861       crc32(in_out3, Address(in_out2, 0+4), 4);
7862       crc32(tmp1, Address(in_out2, size+4), 4);
7863       crc32(tmp2, Address(in_out2, size*2+4), 4);
7864       addl(in_out2, 8);
7865       cmpl(in_out2, tmp3);
7866       jcc(Assembler::less, L_processPartition);
7867 
7868         push(tmp3);
7869         push(in_out1);
7870         push(in_out2);
7871         tmp4 = tmp3;
7872         tmp5 = in_out1;
7873         n_tmp6 = in_out2;
7874 
7875       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7876             w_xtmp1, w_xtmp2, w_xtmp3,
7877             tmp4, tmp5,
7878             n_tmp6);
7879 
7880         pop(in_out2);
7881         pop(in_out1);
7882         pop(tmp3);
7883 
7884     addl(in_out2, 2 * size);
7885     subl(in_out1, 3 * size);
7886     jmp(L_processPartitions);
7887 
7888   bind(L_exit);
7889 }
7890 #endif // _LP64
7891 
7892 #ifdef _LP64
7893 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7894 // Input: A buffer I of L bytes.
7895 // Output: the CRC32C value of the buffer.
7896 // Notations:
7897 // Write L = 24N + r, with N = floor (L/24).
7898 // r = L mod 24 (0 <= r < 24).
7899 // Consider I as the concatenation of A|B|C|R, where A, B, C each
7900 // consist of N quadwords, and R consists of r bytes.
7901 // A[j] = I [8j+7 : 8j],             j = 0, 1, ..., N-1
7902 // B[j] = I [8N + 8j+7 : 8N + 8j],   j = 0, 1, ..., N-1
7903 // C[j] = I [16N + 8j+7 : 16N + 8j], j = 0, 1, ..., N-1
7904 // if r > 0, R[j] = I [24N + j],     j = 0, 1, ..., r-1
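// A rough scalar sketch of the scheme below (illustrative only; "recombine"
// stands for crc32c_rec_alt2, which merges the three partial CRCs using
// carry-less multiplication by precomputed constants):
//   crc_A = crc /* running value */; crc_B = 0; crc_C = 0;
//   for (j = 0; j < N; j++) {
//     crc_A = CRC32C(crc_A, A[j]);
//     crc_B = CRC32C(crc_B, B[j]);
//     crc_C = CRC32C(crc_C, C[j]);
//   }
//   crc = recombine(crc_A, crc_B, crc_C);
//   // the r remaining bytes are then folded in word-by-word and byte-by-byte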
7905 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7906                                           Register tmp1, Register tmp2, Register tmp3,
7907                                           Register tmp4, Register tmp5, Register tmp6,
7908                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7909                                           bool is_pclmulqdq_supported) {
7910   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7911   Label L_wordByWord;
7912   Label L_byteByByteProlog;
7913   Label L_byteByByte;
7914   Label L_exit;
7915 
7916   if (is_pclmulqdq_supported) {
7917     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7918     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7919 
7920     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7921     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7922 
7923     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7924     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7925     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7926   } else {
7927     const_or_pre_comp_const_index[0] = 1;
7928     const_or_pre_comp_const_index[1] = 0;
7929 
7930     const_or_pre_comp_const_index[2] = 3;
7931     const_or_pre_comp_const_index[3] = 2;
7932 
7933     const_or_pre_comp_const_index[4] = 5;
7934     const_or_pre_comp_const_index[5] = 4;
7935   }
7936   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7937                     in2, in1, in_out,
7938                     tmp1, tmp2, tmp3,
7939                     w_xtmp1, w_xtmp2, w_xtmp3,
7940                     tmp4, tmp5,
7941                     tmp6);
7942   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7943                     in2, in1, in_out,
7944                     tmp1, tmp2, tmp3,
7945                     w_xtmp1, w_xtmp2, w_xtmp3,
7946                     tmp4, tmp5,
7947                     tmp6);
7948   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7949                     in2, in1, in_out,
7950                     tmp1, tmp2, tmp3,
7951                     w_xtmp1, w_xtmp2, w_xtmp3,
7952                     tmp4, tmp5,
7953                     tmp6);
7954   movl(tmp1, in2);
7955   andl(tmp1, 0x00000007);
7956   negl(tmp1);
7957   addl(tmp1, in2);
7958   addq(tmp1, in1);
7959 
7960   BIND(L_wordByWord);
7961   cmpq(in1, tmp1);
7962   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7963     crc32(in_out, Address(in1, 0), 4);
7964     addq(in1, 4);
7965     jmp(L_wordByWord);
7966 
7967   BIND(L_byteByByteProlog);
7968   andl(in2, 0x00000007);
7969   movl(tmp2, 1);
7970 
7971   BIND(L_byteByByte);
7972   cmpl(tmp2, in2);
7973   jccb(Assembler::greater, L_exit);
7974     crc32(in_out, Address(in1, 0), 1);
7975     incq(in1);
7976     incl(tmp2);
7977     jmp(L_byteByByte);
7978 
7979   BIND(L_exit);
7980 }
7981 #else
7982 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7983                                           Register tmp1, Register  tmp2, Register tmp3,
7984                                           Register tmp4, Register  tmp5, Register tmp6,
7985                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7986                                           bool is_pclmulqdq_supported) {
7987   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7988   Label L_wordByWord;
7989   Label L_byteByByteProlog;
7990   Label L_byteByByte;
7991   Label L_exit;
7992 
7993   if (is_pclmulqdq_supported) {
7994     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7995     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7996 
7997     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7998     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7999 
8000     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8001     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8002   } else {
8003     const_or_pre_comp_const_index[0] = 1;
8004     const_or_pre_comp_const_index[1] = 0;
8005 
8006     const_or_pre_comp_const_index[2] = 3;
8007     const_or_pre_comp_const_index[3] = 2;
8008 
8009     const_or_pre_comp_const_index[4] = 5;
8010     const_or_pre_comp_const_index[5] = 4;
8011   }
8012   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8013                     in2, in1, in_out,
8014                     tmp1, tmp2, tmp3,
8015                     w_xtmp1, w_xtmp2, w_xtmp3,
8016                     tmp4, tmp5,
8017                     tmp6);
8018   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8019                     in2, in1, in_out,
8020                     tmp1, tmp2, tmp3,
8021                     w_xtmp1, w_xtmp2, w_xtmp3,
8022                     tmp4, tmp5,
8023                     tmp6);
8024   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8025                     in2, in1, in_out,
8026                     tmp1, tmp2, tmp3,
8027                     w_xtmp1, w_xtmp2, w_xtmp3,
8028                     tmp4, tmp5,
8029                     tmp6);
8030   movl(tmp1, in2);
8031   andl(tmp1, 0x00000007);
8032   negl(tmp1);
8033   addl(tmp1, in2);
8034   addl(tmp1, in1);
8035 
8036   BIND(L_wordByWord);
8037   cmpl(in1, tmp1);
8038   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8039     crc32(in_out, Address(in1,0), 4);
8040     addl(in1, 4);
8041     jmp(L_wordByWord);
8042 
8043   BIND(L_byteByByteProlog);
8044   andl(in2, 0x00000007);
8045   movl(tmp2, 1);
8046 
8047   BIND(L_byteByByte);
8048   cmpl(tmp2, in2);
8049   jccb(Assembler::greater, L_exit);
8050     movb(tmp1, Address(in1, 0));
8051     crc32(in_out, tmp1, 1);
8052     incl(in1);
8053     incl(tmp2);
8054     jmp(L_byteByByte);
8055 
8056   BIND(L_exit);
8057 }
8058 #endif // LP64
8059 #undef BIND
8060 #undef BLOCK_COMMENT
8061 
8062 // Compress char[] array to byte[].
8063 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8064 //   @IntrinsicCandidate
8065 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8066 //     for (int i = 0; i < len; i++) {
8067 //       int c = src[srcOff++];
8068 //       if (c >>> 8 != 0) {
8069 //         return 0;
8070 //       }
8071 //       dst[dstOff++] = (byte)c;
8072 //     }
8073 //     return len;
8074 //   }
8075 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8076   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8077   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8078   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8079   Label copy_chars_loop, return_length, return_zero, done;
8080 
8081   // rsi: src
8082   // rdi: dst
8083   // rdx: len
8084   // rcx: tmp5
8085   // rax: result
8086 
8087   // rsi holds start addr of source char[] to be compressed
8088   // rdi holds start addr of destination byte[]
8089   // rdx holds length
8090 
8091   assert(len != result, "");
8092 
8093   // save length for return
8094   push(len);
8095 
8096   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8097     VM_Version::supports_avx512vlbw() &&
8098     VM_Version::supports_bmi2()) {
8099 
8100     Label copy_32_loop, copy_loop_tail, below_threshold;
8101 
8102     // alignment
8103     Label post_alignment;
8104 
8105     // if the length of the string is less than 32, handle it in an old-fashioned way
8106     testl(len, -32);
8107     jcc(Assembler::zero, below_threshold);
8108 
8109     // First check whether a character is compressible ( <= 0xFF).
8110     // Create mask to test for Unicode chars inside zmm vector
8111     movl(result, 0x00FF);
8112     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8113 
8114     testl(len, -64);
8115     jcc(Assembler::zero, post_alignment);
8116 
8117     movl(tmp5, dst);
8118     andl(tmp5, (32 - 1));
8119     negl(tmp5);
8120     andl(tmp5, (32 - 1));
8121 
8122     // bail out when there is nothing to be done
8123     testl(tmp5, 0xFFFFFFFF);
8124     jcc(Assembler::zero, post_alignment);
8125 
8126     // ~(~0 << tmp5), where tmp5 is the # of chars needed to align dst to 32 bytes
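    // e.g. tmp5 == 3 leaves result == 0b111, so only the first 3 chars are touched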
8127     movl(result, 0xFFFFFFFF);
8128     shlxl(result, result, tmp5);
8129     notl(result);
8130     kmovdl(mask2, result);
8131 
8132     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8133     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8134     ktestd(mask1, mask2);
8135     jcc(Assembler::carryClear, return_zero);
8136 
8137     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8138 
8139     addptr(src, tmp5);
8140     addptr(src, tmp5);
8141     addptr(dst, tmp5);
8142     subl(len, tmp5);
8143 
8144     bind(post_alignment);
8145     // end of alignment
8146 
8147     movl(tmp5, len);
8148     andl(tmp5, (32 - 1));    // tail count (in chars)
8149     andl(len, ~(32 - 1));    // vector count (in chars)
8150     jcc(Assembler::zero, copy_loop_tail);
8151 
8152     lea(src, Address(src, len, Address::times_2));
8153     lea(dst, Address(dst, len, Address::times_1));
8154     negptr(len);
8155 
8156     bind(copy_32_loop);
8157     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
8158     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8159     kortestdl(mask1, mask1);
8160     jcc(Assembler::carryClear, return_zero);
8161 
8162     // All elements in the current chunk are valid candidates for
8163     // compression. Write the truncated byte elements to memory.
8164     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8165     addptr(len, 32);
8166     jcc(Assembler::notZero, copy_32_loop);
8167 
8168     bind(copy_loop_tail);
8169     // bail out when there is nothing to be done
8170     testl(tmp5, 0xFFFFFFFF);
8171     jcc(Assembler::zero, return_length);
8172 
8173     movl(len, tmp5);
8174 
8175     // ~(~0 << len), where len is the # of remaining elements to process
8176     movl(result, 0xFFFFFFFF);
8177     shlxl(result, result, len);
8178     notl(result);
8179 
8180     kmovdl(mask2, result);
8181 
8182     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8183     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8184     ktestd(mask1, mask2);
8185     jcc(Assembler::carryClear, return_zero);
8186 
8187     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8188     jmp(return_length);
8189 
8190     bind(below_threshold);
8191   }
8192 
8193   if (UseSSE42Intrinsics) {
8194     Label copy_32_loop, copy_16, copy_tail;
8195 
8196     movl(result, len);
8197 
8198     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8199 
8200     // vectored compression
8201     andl(len, 0xfffffff0);    // vector count (in chars)
8202     andl(result, 0x0000000f);    // tail count (in chars)
8203     testl(len, len);
8204     jcc(Assembler::zero, copy_16);
8205 
8206     // compress 16 chars per iter
8207     movdl(tmp1Reg, tmp5);
8208     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8209     pxor(tmp4Reg, tmp4Reg);
8210 
8211     lea(src, Address(src, len, Address::times_2));
8212     lea(dst, Address(dst, len, Address::times_1));
8213     negptr(len);
8214 
8215     bind(copy_32_loop);
8216     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8217     por(tmp4Reg, tmp2Reg);
8218     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8219     por(tmp4Reg, tmp3Reg);
8220     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8221     jcc(Assembler::notZero, return_zero);
8222     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8223     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8224     addptr(len, 16);
8225     jcc(Assembler::notZero, copy_32_loop);
8226 
8227     // compress next vector of 8 chars (if any)
8228     bind(copy_16);
8229     movl(len, result);
8230     andl(len, 0xfffffff8);    // vector count (in chars)
8231     andl(result, 0x00000007);    // tail count (in chars)
8232     testl(len, len);
8233     jccb(Assembler::zero, copy_tail);
8234 
8235     movdl(tmp1Reg, tmp5);
8236     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8237     pxor(tmp3Reg, tmp3Reg);
8238 
8239     movdqu(tmp2Reg, Address(src, 0));
8240     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8241     jccb(Assembler::notZero, return_zero);
8242     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8243     movq(Address(dst, 0), tmp2Reg);
8244     addptr(src, 16);
8245     addptr(dst, 8);
8246 
8247     bind(copy_tail);
8248     movl(len, result);
8249   }
8250   // compress 1 char per iter
8251   testl(len, len);
8252   jccb(Assembler::zero, return_length);
8253   lea(src, Address(src, len, Address::times_2));
8254   lea(dst, Address(dst, len, Address::times_1));
8255   negptr(len);
8256 
8257   bind(copy_chars_loop);
8258   load_unsigned_short(result, Address(src, len, Address::times_2));
8259   testl(result, 0xff00);      // check if Unicode char
8260   jccb(Assembler::notZero, return_zero);
8261   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
8262   increment(len);
8263   jcc(Assembler::notZero, copy_chars_loop);
8264 
8265   // if compression succeeded, return length
8266   bind(return_length);
8267   pop(result);
8268   jmpb(done);
8269 
8270   // if compression failed, return 0
8271   bind(return_zero);
8272   xorl(result, result);
8273   addptr(rsp, wordSize);
8274 
8275   bind(done);
8276 }
8277 
8278 // Inflate byte[] array to char[].
8279 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8280 //   @IntrinsicCandidate
8281 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8282 //     for (int i = 0; i < len; i++) {
8283 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8284 //     }
8285 //   }
8286 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8287   XMMRegister tmp1, Register tmp2, KRegister mask) {
8288   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8289   // rsi: src
8290   // rdi: dst
8291   // rdx: len
8292   // rcx: tmp2
8293 
8294   // rsi holds start addr of source byte[] to be inflated
8295   // rdi holds start addr of destination char[]
8296   // rdx holds length
8297   assert_different_registers(src, dst, len, tmp2);
8298   movl(tmp2, len);
8299   if ((UseAVX > 2) && // AVX512
8300     VM_Version::supports_avx512vlbw() &&
8301     VM_Version::supports_bmi2()) {
8302 
8303     Label copy_32_loop, copy_tail;
8304     Register tmp3_aliased = len;
8305 
8306     // if length of the string is less than 16, handle it in an old fashioned way
8307     testl(len, -16);
8308     jcc(Assembler::zero, below_threshold);
8309 
8310     testl(len, -1 * AVX3Threshold);
8311     jcc(Assembler::zero, avx3_threshold);
8312 
8313     // In order to use only one arithmetic operation for the main loop we use
8314     // this pre-calculation
8315     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8316     andl(len, -32);     // vector count
8317     jccb(Assembler::zero, copy_tail);
8318 
8319     lea(src, Address(src, len, Address::times_1));
8320     lea(dst, Address(dst, len, Address::times_2));
8321     negptr(len);
8322 
8323 
8324     // inflate 32 chars per iter
8325     bind(copy_32_loop);
8326     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8327     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
8328     addptr(len, 32);
8329     jcc(Assembler::notZero, copy_32_loop);
8330 
8331     bind(copy_tail);
8332     // bail out when there is nothing to be done
8333     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8334     jcc(Assembler::zero, done);
8335 
8336     // ~(~0 << length), where length is the # of remaining elements to process
8337     movl(tmp3_aliased, -1);
8338     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8339     notl(tmp3_aliased);
8340     kmovdl(mask, tmp3_aliased);
8341     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8342     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8343 
8344     jmp(done);
8345     bind(avx3_threshold);
8346   }
8347   if (UseSSE42Intrinsics) {
8348     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8349 
8350     if (UseAVX > 1) {
8351       andl(tmp2, (16 - 1));
8352       andl(len, -16);
8353       jccb(Assembler::zero, copy_new_tail);
8354     } else {
8355       andl(tmp2, 0x00000007);   // tail count (in chars)
8356       andl(len, 0xfffffff8);    // vector count (in chars)
8357       jccb(Assembler::zero, copy_tail);
8358     }
8359 
8360     // vectored inflation
8361     lea(src, Address(src, len, Address::times_1));
8362     lea(dst, Address(dst, len, Address::times_2));
8363     negptr(len);
8364 
8365     if (UseAVX > 1) {
8366       bind(copy_16_loop);
8367       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8368       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8369       addptr(len, 16);
8370       jcc(Assembler::notZero, copy_16_loop);
8371 
8372       bind(below_threshold);
8373       bind(copy_new_tail);
8374       movl(len, tmp2);
8375       andl(tmp2, 0x00000007);
8376       andl(len, 0xFFFFFFF8);
8377       jccb(Assembler::zero, copy_tail);
8378 
8379       pmovzxbw(tmp1, Address(src, 0));
8380       movdqu(Address(dst, 0), tmp1);
8381       addptr(src, 8);
8382       addptr(dst, 2 * 8);
8383 
8384       jmp(copy_tail, true);
8385     }
8386 
8387     // inflate 8 chars per iter
8388     bind(copy_8_loop);
8389     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8390     movdqu(Address(dst, len, Address::times_2), tmp1);
8391     addptr(len, 8);
8392     jcc(Assembler::notZero, copy_8_loop);
8393 
8394     bind(copy_tail);
8395     movl(len, tmp2);
8396 
8397     cmpl(len, 4);
8398     jccb(Assembler::less, copy_bytes);
8399 
8400     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8401     pmovzxbw(tmp1, tmp1);
8402     movq(Address(dst, 0), tmp1);
8403     subptr(len, 4);
8404     addptr(src, 4);
8405     addptr(dst, 8);
8406 
8407     bind(copy_bytes);
8408   } else {
8409     bind(below_threshold);
8410   }
8411 
8412   testl(len, len);
8413   jccb(Assembler::zero, done);
8414   lea(src, Address(src, len, Address::times_1));
8415   lea(dst, Address(dst, len, Address::times_2));
8416   negptr(len);
8417 
8418   // inflate 1 char per iter
8419   bind(copy_chars_loop);
8420   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8421   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8422   increment(len);
8423   jcc(Assembler::notZero, copy_chars_loop);
8424 
8425   bind(done);
8426 }
8427 
8428 
8429 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8430   switch(type) {
8431     case T_BYTE:
8432     case T_BOOLEAN:
8433       evmovdqub(dst, kmask, src, merge, vector_len);
8434       break;
8435     case T_CHAR:
8436     case T_SHORT:
8437       evmovdquw(dst, kmask, src, merge, vector_len);
8438       break;
8439     case T_INT:
8440     case T_FLOAT:
8441       evmovdqul(dst, kmask, src, merge, vector_len);
8442       break;
8443     case T_LONG:
8444     case T_DOUBLE:
8445       evmovdquq(dst, kmask, src, merge, vector_len);
8446       break;
8447     default:
8448       fatal("Unexpected type argument %s", type2name(type));
8449       break;
8450   }
8451 }
8452 
8453 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8454   switch(type) {
8455     case T_BYTE:
8456     case T_BOOLEAN:
8457       evmovdqub(dst, kmask, src, merge, vector_len);
8458       break;
8459     case T_CHAR:
8460     case T_SHORT:
8461       evmovdquw(dst, kmask, src, merge, vector_len);
8462       break;
8463     case T_INT:
8464     case T_FLOAT:
8465       evmovdqul(dst, kmask, src, merge, vector_len);
8466       break;
8467     case T_LONG:
8468     case T_DOUBLE:
8469       evmovdquq(dst, kmask, src, merge, vector_len);
8470       break;
8471     default:
8472       fatal("Unexpected type argument %s", type2name(type));
8473       break;
8474   }
8475 }
8476 
8477 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
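  // For masklen < 8 the narrowest k-register NOT is still byte-wide (knotbl),
  // so the unused upper lanes are cleared by ANDing with (1 << masklen) - 1.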
8478   switch(masklen) {
8479     case 2:
8480        knotbl(dst, src);
8481        movl(rtmp, 3);
8482        kmovbl(ktmp, rtmp);
8483        kandbl(dst, ktmp, dst);
8484        break;
8485     case 4:
8486        knotbl(dst, src);
8487        movl(rtmp, 15);
8488        kmovbl(ktmp, rtmp);
8489        kandbl(dst, ktmp, dst);
8490        break;
8491     case 8:
8492        knotbl(dst, src);
8493        break;
8494     case 16:
8495        knotwl(dst, src);
8496        break;
8497     case 32:
8498        knotdl(dst, src);
8499        break;
8500     case 64:
8501        knotql(dst, src);
8502        break;
8503     default:
8504       fatal("Unexpected vector length %d", masklen);
8505       break;
8506   }
8507 }
8508 
8509 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8510   switch(type) {
8511     case T_BOOLEAN:
8512     case T_BYTE:
8513        kandbl(dst, src1, src2);
8514        break;
8515     case T_CHAR:
8516     case T_SHORT:
8517        kandwl(dst, src1, src2);
8518        break;
8519     case T_INT:
8520     case T_FLOAT:
8521        kanddl(dst, src1, src2);
8522        break;
8523     case T_LONG:
8524     case T_DOUBLE:
8525        kandql(dst, src1, src2);
8526        break;
8527     default:
8528       fatal("Unexpected type argument %s", type2name(type));
8529       break;
8530   }
8531 }
8532 
8533 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8534   switch(type) {
8535     case T_BOOLEAN:
8536     case T_BYTE:
8537        korbl(dst, src1, src2);
8538        break;
8539     case T_CHAR:
8540     case T_SHORT:
8541        korwl(dst, src1, src2);
8542        break;
8543     case T_INT:
8544     case T_FLOAT:
8545        kordl(dst, src1, src2);
8546        break;
8547     case T_LONG:
8548     case T_DOUBLE:
8549        korql(dst, src1, src2);
8550        break;
8551     default:
8552       fatal("Unexpected type argument %s", type2name(type));
8553       break;
8554   }
8555 }
8556 
8557 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8558   switch(type) {
8559     case T_BOOLEAN:
8560     case T_BYTE:
8561        kxorbl(dst, src1, src2);
8562        break;
8563     case T_CHAR:
8564     case T_SHORT:
8565        kxorwl(dst, src1, src2);
8566        break;
8567     case T_INT:
8568     case T_FLOAT:
8569        kxordl(dst, src1, src2);
8570        break;
8571     case T_LONG:
8572     case T_DOUBLE:
8573        kxorql(dst, src1, src2);
8574        break;
8575     default:
8576       fatal("Unexpected type argument %s", type2name(type));
8577       break;
8578   }
8579 }
8580 
8581 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8582   switch(type) {
8583     case T_BOOLEAN:
8584     case T_BYTE:
8585       evpermb(dst, mask, nds, src, merge, vector_len); break;
8586     case T_CHAR:
8587     case T_SHORT:
8588       evpermw(dst, mask, nds, src, merge, vector_len); break;
8589     case T_INT:
8590     case T_FLOAT:
8591       evpermd(dst, mask, nds, src, merge, vector_len); break;
8592     case T_LONG:
8593     case T_DOUBLE:
8594       evpermq(dst, mask, nds, src, merge, vector_len); break;
8595     default:
8596       fatal("Unexpected type argument %s", type2name(type)); break;
8597   }
8598 }
8599 
8600 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8601   switch(type) {
8602     case T_BOOLEAN:
8603     case T_BYTE:
8604       evpermb(dst, mask, nds, src, merge, vector_len); break;
8605     case T_CHAR:
8606     case T_SHORT:
8607       evpermw(dst, mask, nds, src, merge, vector_len); break;
8608     case T_INT:
8609     case T_FLOAT:
8610       evpermd(dst, mask, nds, src, merge, vector_len); break;
8611     case T_LONG:
8612     case T_DOUBLE:
8613       evpermq(dst, mask, nds, src, merge, vector_len); break;
8614     default:
8615       fatal("Unexpected type argument %s", type2name(type)); break;
8616   }
8617 }
8618 
8619 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8620   switch(type) {
8621     case T_BYTE:
8622       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8623     case T_SHORT:
8624       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8625     case T_INT:
8626       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8627     case T_LONG:
8628       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8629     default:
8630       fatal("Unexpected type argument %s", type2name(type)); break;
8631   }
8632 }
8633 
8634 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8635   switch(type) {
8636     case T_BYTE:
8637       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8638     case T_SHORT:
8639       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8640     case T_INT:
8641       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8642     case T_LONG:
8643       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8644     default:
8645       fatal("Unexpected type argument %s", type2name(type)); break;
8646   }
8647 }
8648 
8649 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8650   switch(type) {
8651     case T_BYTE:
8652       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8653     case T_SHORT:
8654       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8655     case T_INT:
8656       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8657     case T_LONG:
8658       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8659     default:
8660       fatal("Unexpected type argument %s", type2name(type)); break;
8661   }
8662 }
8663 
8664 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8665   switch(type) {
8666     case T_BYTE:
8667       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8668     case T_SHORT:
8669       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8670     case T_INT:
8671       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8672     case T_LONG:
8673       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8674     default:
8675       fatal("Unexpected type argument %s", type2name(type)); break;
8676   }
8677 }
8678 
8679 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8680   switch(type) {
8681     case T_INT:
8682       evpxord(dst, mask, nds, src, merge, vector_len); break;
8683     case T_LONG:
8684       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8685     default:
8686       fatal("Unexpected type argument %s", type2name(type)); break;
8687   }
8688 }
8689 
8690 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8691   switch(type) {
8692     case T_INT:
8693       evpxord(dst, mask, nds, src, merge, vector_len); break;
8694     case T_LONG:
8695       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8696     default:
8697       fatal("Unexpected type argument %s", type2name(type)); break;
8698   }
8699 }
8700 
8701 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8702   switch(type) {
8703     case T_INT:
8704       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8705     case T_LONG:
8706       evporq(dst, mask, nds, src, merge, vector_len); break;
8707     default:
8708       fatal("Unexpected type argument %s", type2name(type)); break;
8709   }
8710 }
8711 
8712 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8713   switch(type) {
8714     case T_INT:
8715       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8716     case T_LONG:
8717       evporq(dst, mask, nds, src, merge, vector_len); break;
8718     default:
8719       fatal("Unexpected type argument %s", type2name(type)); break;
8720   }
8721 }
8722 
8723 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8724   switch(type) {
8725     case T_INT:
8726       evpandd(dst, mask, nds, src, merge, vector_len); break;
8727     case T_LONG:
8728       evpandq(dst, mask, nds, src, merge, vector_len); break;
8729     default:
8730       fatal("Unexpected type argument %s", type2name(type)); break;
8731   }
8732 }
8733 
8734 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8735   switch(type) {
8736     case T_INT:
8737       evpandd(dst, mask, nds, src, merge, vector_len); break;
8738     case T_LONG:
8739       evpandq(dst, mask, nds, src, merge, vector_len); break;
8740     default:
8741       fatal("Unexpected type argument %s", type2name(type)); break;
8742   }
8743 }
8744 
8745 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) {
8746    masklen = masklen < 8 ? 8 : masklen;
8747    ktest(masklen, src1, src2);
8748    setb(Assembler::notZero, dst);
8749    movzbl(dst, dst);
8750 }
8751 
8752 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) {
8753   if (masklen < 8) {
8754     knotbl(kscratch, src2);
8755     kortestbl(src1, kscratch);
8756     setb(Assembler::carrySet, dst);
8757     movzbl(dst, dst);
8758   } else {
8759     ktest(masklen, src1, src2);
8760     setb(Assembler::carrySet, dst);
8761     movzbl(dst, dst);
8762   }
8763 }
8764 
8765 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
8766   switch(masklen) {
8767     case 8:
8768        kortestbl(src1, src2);
8769        break;
8770     case 16:
8771        kortestwl(src1, src2);
8772        break;
8773     case 32:
8774        kortestdl(src1, src2);
8775        break;
8776     case 64:
8777        kortestql(src1, src2);
8778        break;
8779     default:
8780       fatal("Unexpected mask length %d", masklen);
8781       break;
8782   }
8783 }
8784 
8785 
8786 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
8787   switch(masklen)  {
8788     case 8:
8789        ktestbl(src1, src2);
8790        break;
8791     case 16:
8792        ktestwl(src1, src2);
8793        break;
8794     case 32:
8795        ktestdl(src1, src2);
8796        break;
8797     case 64:
8798        ktestql(src1, src2);
8799        break;
8800     default:
8801       fatal("Unexpected mask length %d", masklen);
8802       break;
8803   }
8804 }
8805 
8806 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8807   switch(type) {
8808     case T_INT:
8809       evprold(dst, mask, src, shift, merge, vlen_enc); break;
8810     case T_LONG:
8811       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
8812     default:
8813       fatal("Unexpected type argument %s", type2name(type)); break;
8815   }
8816 }
8817 
8818 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8819   switch(type) {
8820     case T_INT:
8821       evprord(dst, mask, src, shift, merge, vlen_enc); break;
8822     case T_LONG:
8823       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
8824     default:
8825       fatal("Unexpected type argument %s", type2name(type)); break;
8826   }
8827 }
8828 
8829 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8830   switch(type) {
8831     case T_INT:
8832       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
8833     case T_LONG:
8834       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
8835     default:
8836       fatal("Unexpected type argument %s", type2name(type)); break;
8837   }
8838 }
8839 
8840 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8841   switch(type) {
8842     case T_INT:
8843       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
8844     case T_LONG:
8845       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
8846     default:
8847       fatal("Unexpected type argument %s", type2name(type)); break;
8848   }
8849 }
8850 #if COMPILER2_OR_JVMCI
8851 
8852 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
8853                                  Register length, Register temp, int vec_enc) {
8854   // Computing mask for predicated vector store.
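  // e.g. length == 5 leaves temp == 0x1F after bzhiq, so only the low 5 lanes are stored.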
8855   movptr(temp, -1);
8856   bzhiq(temp, temp, length);
8857   kmov(mask, temp);
8858   evmovdqu(bt, mask, dst, xmm, true, vec_enc);
8859 }
8860 
8861 // Set memory operation for lengths of less than 64 bytes.
8862 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
8863                                        XMMRegister xmm, KRegister mask, Register length,
8864                                        Register temp, bool use64byteVector) {
8865   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8866   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8867   if (!use64byteVector) {
8868     fill32(dst, disp, xmm);
8869     subptr(length, 32 >> shift);
8870     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
8871   } else {
8872     assert(MaxVectorSize == 64, "vector length != 64");
8873     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
8874   }
8875 }
8876 
8877 
8878 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
8879                                        XMMRegister xmm, KRegister mask, Register length,
8880                                        Register temp) {
8881   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8882   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8883   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
8884 }
8885 
8886 
8887 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
8888   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8889   vmovdqu(dst, xmm);
8890 }
8891 
8892 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
8893   fill32(Address(dst, disp), xmm);
8894 }
8895 
8896 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
8897   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8898   if (!use64byteVector) {
8899     fill32(dst, xmm);
8900     fill32(dst.plus_disp(32), xmm);
8901   } else {
8902     evmovdquq(dst, xmm, Assembler::AVX_512bit);
8903   }
8904 }
8905 
8906 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8907   fill64(Address(dst, disp), xmm, use64byteVector);
8908 }
8909 
8910 #ifdef _LP64
8911 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
8912                                         Register count, Register rtmp, XMMRegister xtmp) {
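  // Rough shape of the code below: broadcast 'value' into a vector register,
  // handle small counts (up to 128 bytes on the YMM path, 192 bytes on the ZMM
  // path) with a few possibly-masked 32/64-byte stores, and for larger counts
  // align 'to' with a masked store and then fill in 128-byte (YMM) or 192-byte
  // (ZMM) chunks in the main loop.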
8913   Label L_exit;
8914   Label L_fill_start;
8915   Label L_fill_64_bytes;
8916   Label L_fill_96_bytes;
8917   Label L_fill_128_bytes;
8918   Label L_fill_128_bytes_loop;
8919   Label L_fill_128_loop_header;
8920   Label L_fill_128_bytes_loop_header;
8921   Label L_fill_128_bytes_loop_pre_header;
8922   Label L_fill_zmm_sequence;
8923 
8924   int shift = -1;
8925   int avx3threshold = VM_Version::avx3_threshold();
8926   switch(type) {
8927     case T_BYTE:  shift = 0;
8928       break;
8929     case T_SHORT: shift = 1;
8930       break;
8931     case T_INT:   shift = 2;
8932       break;
8933     /* Uncomment when LONG fill stubs are supported.
8934     case T_LONG:  shift = 3;
8935       break;
8936     */
8937     default:
8938       fatal("Unhandled type: %s\n", type2name(type));
8939   }
8940 
8941   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
8942 
8943     if (MaxVectorSize == 64) {
8944       cmpq(count, avx3threshold >> shift);
8945       jcc(Assembler::greater, L_fill_zmm_sequence);
8946     }
8947 
8948     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
8949 
8950     bind(L_fill_start);
8951 
8952     cmpq(count, 32 >> shift);
8953     jccb(Assembler::greater, L_fill_64_bytes);
8954     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
8955     jmp(L_exit);
8956 
8957     bind(L_fill_64_bytes);
8958     cmpq(count, 64 >> shift);
8959     jccb(Assembler::greater, L_fill_96_bytes);
8960     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
8961     jmp(L_exit);
8962 
8963     bind(L_fill_96_bytes);
8964     cmpq(count, 96 >> shift);
8965     jccb(Assembler::greater, L_fill_128_bytes);
8966     fill64(to, 0, xtmp);
8967     subq(count, 64 >> shift);
8968     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
8969     jmp(L_exit);
8970 
8971     bind(L_fill_128_bytes);
8972     cmpq(count, 128 >> shift);
8973     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
8974     fill64(to, 0, xtmp);
8975     fill32(to, 64, xtmp);
8976     subq(count, 96 >> shift);
8977     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
8978     jmp(L_exit);
8979 
8980     bind(L_fill_128_bytes_loop_pre_header);
8981     {
8982       mov(rtmp, to);
8983       andq(rtmp, 31);
8984       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
8985       negq(rtmp);
8986       addq(rtmp, 32);
8987       mov64(r8, -1L);
8988       bzhiq(r8, r8, rtmp);
8989       kmovql(k2, r8);
8990       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
8991       addq(to, rtmp);
8992       shrq(rtmp, shift);
8993       subq(count, rtmp);
8994     }
8995 
8996     cmpq(count, 128 >> shift);
8997     jcc(Assembler::less, L_fill_start);
8998 
8999     bind(L_fill_128_bytes_loop_header);
9000     subq(count, 128 >> shift);
9001 
9002     align32();
9003     bind(L_fill_128_bytes_loop);
9004       fill64(to, 0, xtmp);
9005       fill64(to, 64, xtmp);
9006       addq(to, 128);
9007       subq(count, 128 >> shift);
9008       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9009 
9010     addq(count, 128 >> shift);
9011     jcc(Assembler::zero, L_exit);
9012     jmp(L_fill_start);
9013   }
9014 
9015   if (MaxVectorSize == 64) {
9016     // Sequence using 64 byte ZMM register.
9017     Label L_fill_128_bytes_zmm;
9018     Label L_fill_192_bytes_zmm;
9019     Label L_fill_192_bytes_loop_zmm;
9020     Label L_fill_192_bytes_loop_header_zmm;
9021     Label L_fill_192_bytes_loop_pre_header_zmm;
9022     Label L_fill_start_zmm_sequence;
9023 
9024     bind(L_fill_zmm_sequence);
9025     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9026 
9027     bind(L_fill_start_zmm_sequence);
9028     cmpq(count, 64 >> shift);
9029     jccb(Assembler::greater, L_fill_128_bytes_zmm);
9030     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9031     jmp(L_exit);
9032 
9033     bind(L_fill_128_bytes_zmm);
9034     cmpq(count, 128 >> shift);
9035     jccb(Assembler::greater, L_fill_192_bytes_zmm);
9036     fill64(to, 0, xtmp, true);
9037     subq(count, 64 >> shift);
9038     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9039     jmp(L_exit);
9040 
9041     bind(L_fill_192_bytes_zmm);
9042     cmpq(count, 192 >> shift);
9043     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9044     fill64(to, 0, xtmp, true);
9045     fill64(to, 64, xtmp, true);
9046     subq(count, 128 >> shift);
9047     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9048     jmp(L_exit);
9049 
9050     bind(L_fill_192_bytes_loop_pre_header_zmm);
9051     {
9052       movq(rtmp, to);
9053       andq(rtmp, 63);
9054       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9055       negq(rtmp);
9056       addq(rtmp, 64);
9057       mov64(r8, -1L);
9058       bzhiq(r8, r8, rtmp);
9059       kmovql(k2, r8);
9060       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9061       addq(to, rtmp);
9062       shrq(rtmp, shift);
9063       subq(count, rtmp);
9064     }
9065 
9066     cmpq(count, 192 >> shift);
9067     jcc(Assembler::less, L_fill_start_zmm_sequence);
9068 
9069     bind(L_fill_192_bytes_loop_header_zmm);
9070     subq(count, 192 >> shift);
9071 
9072     align32();
9073     bind(L_fill_192_bytes_loop_zmm);
9074       fill64(to, 0, xtmp, true);
9075       fill64(to, 64, xtmp, true);
9076       fill64(to, 128, xtmp, true);
9077       addq(to, 192);
9078       subq(count, 192 >> shift);
9079       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9080 
9081     addq(count, 192 >> shift);
9082     jcc(Assembler::zero, L_exit);
9083     jmp(L_fill_start_zmm_sequence);
9084   }
9085   bind(L_exit);
9086 }
9087 #endif
9088 #endif //COMPILER2_OR_JVMCI
9089 
9090 
9091 #ifdef _LP64
9092 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9093   Label done;
9094   cvttss2sil(dst, src);
9095   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
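  // cvttss2sil returns 0x80000000 (the x86 "integer indefinite" value) for NaN and
  // out-of-range inputs, so only that result needs the stub; a genuine
  // Integer.MIN_VALUE result also takes this slow path, which is harmless.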
9096   cmpl(dst, 0x80000000); // float_sign_flip
9097   jccb(Assembler::notEqual, done);
9098   subptr(rsp, 8);
9099   movflt(Address(rsp, 0), src);
9100   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9101   pop(dst);
9102   bind(done);
9103 }
9104 
9105 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9106   Label done;
9107   cvttsd2sil(dst, src);
9108   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9109   cmpl(dst, 0x80000000); // float_sign_flip
9110   jccb(Assembler::notEqual, done);
9111   subptr(rsp, 8);
9112   movdbl(Address(rsp, 0), src);
9113   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9114   pop(dst);
9115   bind(done);
9116 }
9117 
9118 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9119   Label done;
9120   cvttss2siq(dst, src);
9121   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9122   jccb(Assembler::notEqual, done);
9123   subptr(rsp, 8);
9124   movflt(Address(rsp, 0), src);
9125   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9126   pop(dst);
9127   bind(done);
9128 }
9129 
9130 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9131   // The following code is a line-by-line assembly translation of the rounding algorithm.
9132   // Please refer to the java.lang.Math.round(float) algorithm for details.
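  // A condensed sketch of that algorithm (using the constant names declared below):
  //   int bits  = Float.floatToRawIntBits(a);
  //   int shift = (FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS)
  //             - ((bits & FloatConsts_EXP_BIT_MASK) >> (FloatConsts_SIGNIFICAND_WIDTH - 1));
  //   if ((shift & MINUS_32) == 0) {       // i.e. 0 <= shift < 32
  //     int r = (bits & FloatConsts_SIGNIF_BIT_MASK) | (FloatConsts_SIGNIF_BIT_MASK + 1);
  //     if (bits < 0) r = -r;
  //     return ((r >> shift) + 1) >> 1;    // round half up
  //   } else {
  //     return (int) a;                    // the special case, handled via convert_f2i below
  //   }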
9133   const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
9134   const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
9135   const int32_t FloatConsts_EXP_BIAS = 127;
9136   const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
9137   const int32_t MINUS_32 = 0xFFFFFFE0;
9138   Label L_special_case, L_block1, L_exit;
9139   movl(rtmp, FloatConsts_EXP_BIT_MASK);
9140   movdl(dst, src);
9141   andl(dst, rtmp);
9142   sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
9143   movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
9144   subl(rtmp, dst);
9145   movl(rcx, rtmp);
9146   movl(dst, MINUS_32);
9147   testl(rtmp, dst);
9148   jccb(Assembler::notEqual, L_special_case);
9149   movdl(dst, src);
9150   andl(dst, FloatConsts_SIGNIF_BIT_MASK);
9151   orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
9152   movdl(rtmp, src);
9153   testl(rtmp, rtmp);
9154   jccb(Assembler::greaterEqual, L_block1);
9155   negl(dst);
9156   bind(L_block1);
9157   sarl(dst);
9158   addl(dst, 0x1);
9159   sarl(dst, 0x1);
9160   jmp(L_exit);
9161   bind(L_special_case);
9162   convert_f2i(dst, src);
9163   bind(L_exit);
9164 }
9165 
9166 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9167   // The following code is a line-by-line assembly translation of the rounding algorithm.
9168   // Please refer to the java.lang.Math.round(double) algorithm for details.
9169   const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
9170   const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
9171   const int64_t DoubleConsts_EXP_BIAS = 1023;
9172   const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
9173   const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
9174   Label L_special_case, L_block1, L_exit;
9175   mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
9176   movq(dst, src);
9177   andq(dst, rtmp);
9178   sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
9179   mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
9180   subq(rtmp, dst);
9181   movq(rcx, rtmp);
9182   mov64(dst, MINUS_64);
9183   testq(rtmp, dst);
9184   jccb(Assembler::notEqual, L_special_case);
9185   movq(dst, src);
9186   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
9187   andq(dst, rtmp);
9188   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
9189   orq(dst, rtmp);
9190   movq(rtmp, src);
9191   testq(rtmp, rtmp);
9192   jccb(Assembler::greaterEqual, L_block1);
9193   negq(dst);
9194   bind(L_block1);
9195   sarq(dst);
9196   addq(dst, 0x1);
9197   sarq(dst, 0x1);
9198   jmp(L_exit);
9199   bind(L_special_case);
9200   convert_d2l(dst, src);
9201   bind(L_exit);
9202 }
9203 
9204 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
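       // Same fixup pattern as convert_f2l above: a cvttsd2siq that cannot
       // represent its result writes 0x8000000000000000, and the d2l_fixup stub
       // then recomputes the JLS-correct value from the original double.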
9205   Label done;
9206   cvttsd2siq(dst, src);
9207   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9208   jccb(Assembler::notEqual, done);
9209   subptr(rsp, 8);
9210   movdbl(Address(rsp, 0), src);
9211   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9212   pop(dst);
9213   bind(done);
9214 }
9215 
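     // Cache-line writeback support. These helpers most likely back the
     // jdk.internal.misc.Unsafe writeback intrinsics added for persistent-memory
     // support (JEP 352): cache_wb writes back the cache line containing the given
     // address, and cache_wbsync emits whatever fence is needed to order those
     // writebacks against later stores.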
9216 void MacroAssembler::cache_wb(Address line)
9217 {
9218   // 64-bit CPUs always support clflush
9219   assert(VM_Version::supports_clflush(), "clflush should be available");
9220   bool optimized = VM_Version::supports_clflushopt();
9221   bool no_evict = VM_Version::supports_clwb();
9222 
9223   // Prefer clwb (writeback without evict); otherwise prefer clflushopt
9224   // (potentially parallel writeback with evict); otherwise fall back on
9225   // clflush (serial writeback with evict)
9226 
9227   if (optimized) {
9228     if (no_evict) {
9229       clwb(line);
9230     } else {
9231       clflushopt(line);
9232     }
9233   } else {
9234     // no need for fence when using CLFLUSH
9235     clflush(line);
9236   }
9237 }
9238 
9239 void MacroAssembler::cache_wbsync(bool is_pre)
9240 {
9241   assert(VM_Version::supports_clflush(), "clflush should be available");
9242   bool optimized = VM_Version::supports_clflushopt();
9243   bool no_evict = VM_Version::supports_clwb();
9244 
9245   // pick the correct implementation
9246 
9247   if (!is_pre && (optimized || no_evict)) {
9248   // an sfence is needed for the post-flush sync when clflushopt or clwb was used;
9249   // otherwise no synchronization is needed
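       // (clflush executions are ordered with respect to each other and to writes,
       // whereas clflushopt and clwb are weakly ordered and require an sfence to
       // order them ahead of subsequent stores.)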
9250 
9251     sfence();
9252   }
9253 }
9254 
9255 #endif // _LP64
9256 
9257 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9258   switch (cond) {
9259     // Note some conditions are synonyms for others
9260     case Assembler::zero:         return Assembler::notZero;
9261     case Assembler::notZero:      return Assembler::zero;
9262     case Assembler::less:         return Assembler::greaterEqual;
9263     case Assembler::lessEqual:    return Assembler::greater;
9264     case Assembler::greater:      return Assembler::lessEqual;
9265     case Assembler::greaterEqual: return Assembler::less;
9266     case Assembler::below:        return Assembler::aboveEqual;
9267     case Assembler::belowEqual:   return Assembler::above;
9268     case Assembler::above:        return Assembler::belowEqual;
9269     case Assembler::aboveEqual:   return Assembler::below;
9270     case Assembler::overflow:     return Assembler::noOverflow;
9271     case Assembler::noOverflow:   return Assembler::overflow;
9272     case Assembler::negative:     return Assembler::positive;
9273     case Assembler::positive:     return Assembler::negative;
9274     case Assembler::parity:       return Assembler::noParity;
9275     case Assembler::noParity:     return Assembler::parity;
9276   }
9277   ShouldNotReachHere(); return Assembler::overflow;
9278 }
9279 
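     // SkipIfEqual brackets a section of generated code: the constructor emits a
     // compare of the byte flag at flag_addr against 'value' and a jump over the
     // section when they are equal; the destructor binds the jump target, so the
     // bracketed instructions execute only when the flag differs from 'value'.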
9280 SkipIfEqual::SkipIfEqual(
9281     MacroAssembler* masm, const bool* flag_addr, bool value) {
9282   _masm = masm;
9283   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9284   _masm->jcc(Assembler::equal, _label);
9285 }
9286 
9287 SkipIfEqual::~SkipIfEqual() {
9288   _masm->bind(_label);
9289 }
9290 
9291 // 32-bit Windows has its own fast-path implementation
9292 // of get_thread
9293 #if !defined(WIN32) || defined(_LP64)
9294 
9295 // This is simply a call to Thread::current()
9296 void MacroAssembler::get_thread(Register thread) {
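       // Thread::current() is an ordinary C++ call, so all registers the C calling
       // convention allows the callee to clobber are saved and restored around it;
       // the result comes back in rax and is then copied into 'thread'.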
9297   if (thread != rax) {
9298     push(rax);
9299   }
9300   LP64_ONLY(push(rdi);)
9301   LP64_ONLY(push(rsi);)
9302   push(rdx);
9303   push(rcx);
9304 #ifdef _LP64
9305   push(r8);
9306   push(r9);
9307   push(r10);
9308   push(r11);
9309 #endif
9310 
9311   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9312 
9313 #ifdef _LP64
9314   pop(r11);
9315   pop(r10);
9316   pop(r9);
9317   pop(r8);
9318 #endif
9319   pop(rcx);
9320   pop(rdx);
9321   LP64_ONLY(pop(rsi);)
9322   LP64_ONLY(pop(rdi);)
9323   if (thread != rax) {
9324     mov(thread, rax);
9325     pop(rax);
9326   }
9327 }
9328 
9329 
9330 #endif // !WIN32 || _LP64