1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/flags/flagSetting.hpp"
  44 #include "runtime/interfaceSupport.inline.hpp"
  45 #include "runtime/jniHandles.hpp"
  46 #include "runtime/objectMonitor.hpp"
  47 #include "runtime/os.hpp"
  48 #include "runtime/safepoint.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/thread.hpp"
  53 #include "utilities/macros.hpp"
  54 #include "crc32c.h"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
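     // For each Assembler::Condition encoding (the array index), the entry below is the
     // condition with the opposite meaning, e.g. reverse[Assembler::zero] == Assembler::notZero.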
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that differ between 32-bit and 64-bit,
  94 // unless the difference is trivial (1 line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
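     // The 32-bit leaf call convention here passes arguments on the stack (pushed by the
     // pass_argN helpers below), so they are popped off again once the C function returns.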
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 
 119 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 120   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 121 }
 122 
 123 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 124   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 125 }
 126 
 127 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 128   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 129 }
 130 
 131 void MacroAssembler::extend_sign(Register hi, Register lo) {
 132   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 133   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 134     cdql();
 135   } else {
 136     movl(hi, lo);
 137     sarl(hi, 31);
 138   }
 139 }
 140 
 141 void MacroAssembler::jC2(Register tmp, Label& L) {
 142   // set parity bit if FPU flag C2 is set (via rax)
 143   save_rax(tmp);
 144   fwait(); fnstsw_ax();
 145   sahf();
 146   restore_rax(tmp);
 147   // branch
 148   jcc(Assembler::parity, L);
 149 }
 150 
 151 void MacroAssembler::jnC2(Register tmp, Label& L) {
 152   // set parity bit if FPU flag C2 is set (via rax)
 153   save_rax(tmp);
 154   fwait(); fnstsw_ax();
 155   sahf();
 156   restore_rax(tmp);
 157   // branch
 158   jcc(Assembler::noParity, L);
 159 }
 160 
 161 // 32bit can do a case table jump in one instruction but we no longer allow the base
 162 // to be installed in the Address class
 163 void MacroAssembler::jump(ArrayAddress entry) {
 164   jmp(as_Address(entry));
 165 }
 166 
 167 // Note: y_lo will be destroyed
 168 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 169   // Long compare for Java (semantics as described in JVM spec.)
 170   Label high, low, done;
 171 
 172   cmpl(x_hi, y_hi);
 173   jcc(Assembler::less, low);
 174   jcc(Assembler::greater, high);
 175   // x_hi is the return register
 176   xorl(x_hi, x_hi);
 177   cmpl(x_lo, y_lo);
 178   jcc(Assembler::below, low);
 179   jcc(Assembler::equal, done);
 180 
 181   bind(high);
 182   xorl(x_hi, x_hi);
 183   increment(x_hi);
 184   jmp(done);
 185 
 186   bind(low);
 187   xorl(x_hi, x_hi);
 188   decrementl(x_hi);
 189 
 190   bind(done);
 191 }
 192 
 193 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 194     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 195 }
 196 
 197 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 198   // leal(dst, as_Address(adr));
 199   // see note in movl as to why we must use a move
 200   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 201 }
 202 
 203 void MacroAssembler::leave() {
 204   mov(rsp, rbp);
 205   pop(rbp);
 206 }
 207 
 208 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 209   // Multiplication of two Java long values stored on the stack
 210   // as illustrated below. Result is in rdx:rax.
 211   //
 212   // rsp ---> [  ??  ] \               \
 213   //            ....    | y_rsp_offset  |
 214   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 215   //          [ y_hi ]                  | (in bytes)
 216   //            ....                    |
 217   //          [ x_lo ]                 /
 218   //          [ x_hi ]
 219   //            ....
 220   //
 221   // Basic idea: lo(result) = lo(x_lo * y_lo)
 222   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 223   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 224   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 225   Label quick;
 226   // load x_hi, y_hi and check if quick
 227   // multiplication is possible
 228   movl(rbx, x_hi);
 229   movl(rcx, y_hi);
 230   movl(rax, rbx);
 231   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 232   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 233   // do full multiplication
 234   // 1st step
 235   mull(y_lo);                                    // x_hi * y_lo
 236   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 237   // 2nd step
 238   movl(rax, x_lo);
 239   mull(rcx);                                     // x_lo * y_hi
 240   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 241   // 3rd step
 242   bind(quick);                                   // note: rbx = 0 if quick multiply!
 243   movl(rax, x_lo);
 244   mull(y_lo);                                    // x_lo * y_lo
 245   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 246 }
 247 
 248 void MacroAssembler::lneg(Register hi, Register lo) {
 249   negl(lo);
 250   adcl(hi, 0);
 251   negl(hi);
 252 }
 253 
 254 void MacroAssembler::lshl(Register hi, Register lo) {
 255   // Java shift left long support (semantics as described in JVM spec., p.305)
 256   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 257   // shift value is in rcx !
 258   assert(hi != rcx, "must not use rcx");
 259   assert(lo != rcx, "must not use rcx");
 260   const Register s = rcx;                        // shift count
 261   const int      n = BitsPerWord;
 262   Label L;
 263   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 264   cmpl(s, n);                                    // if (s < n)
 265   jcc(Assembler::less, L);                       // else (s >= n)
 266   movl(hi, lo);                                  // x := x << n
 267   xorl(lo, lo);
 268   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 269   bind(L);                                       // s (mod n) < n
 270   shldl(hi, lo);                                 // x := x << s
 271   shll(lo);
 272 }
 273 
 274 
 275 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 276   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 277   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 278   assert(hi != rcx, "must not use rcx");
 279   assert(lo != rcx, "must not use rcx");
 280   const Register s = rcx;                        // shift count
 281   const int      n = BitsPerWord;
 282   Label L;
 283   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 284   cmpl(s, n);                                    // if (s < n)
 285   jcc(Assembler::less, L);                       // else (s >= n)
 286   movl(lo, hi);                                  // x := x >> n
 287   if (sign_extension) sarl(hi, 31);
 288   else                xorl(hi, hi);
 289   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 290   bind(L);                                       // s (mod n) < n
 291   shrdl(lo, hi);                                 // x := x >> s
 292   if (sign_extension) sarl(hi);
 293   else                shrl(hi);
 294 }
 295 
 296 void MacroAssembler::movoop(Register dst, jobject obj) {
 297   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 298 }
 299 
 300 void MacroAssembler::movoop(Address dst, jobject obj) {
 301   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 302 }
 303 
 304 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 305   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 309   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 313   // scratch register is not used,
 314   // it is defined to match parameters of 64-bit version of this method.
 315   if (src.is_lval()) {
 316     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 317   } else {
 318     movl(dst, as_Address(src));
 319   }
 320 }
 321 
 322 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 323   movl(as_Address(dst), src);
 324 }
 325 
 326 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 327   movl(dst, as_Address(src));
 328 }
 329 
 330 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 331 void MacroAssembler::movptr(Address dst, intptr_t src) {
 332   movl(dst, src);
 333 }
 334 
 335 void MacroAssembler::pushoop(jobject obj) {
 336   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 337 }
 338 
 339 void MacroAssembler::pushklass(Metadata* obj) {
 340   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 341 }
 342 
 343 void MacroAssembler::pushptr(AddressLiteral src) {
 344   if (src.is_lval()) {
 345     push_literal32((int32_t)src.target(), src.rspec());
 346   } else {
 347     pushl(as_Address(src));
 348   }
 349 }
 350 
 351 static void pass_arg0(MacroAssembler* masm, Register arg) {
 352   masm->push(arg);
 353 }
 354 
 355 static void pass_arg1(MacroAssembler* masm, Register arg) {
 356   masm->push(arg);
 357 }
 358 
 359 static void pass_arg2(MacroAssembler* masm, Register arg) {
 360   masm->push(arg);
 361 }
 362 
 363 static void pass_arg3(MacroAssembler* masm, Register arg) {
 364   masm->push(arg);
 365 }
 366 
 367 #ifndef PRODUCT
 368 extern "C" void findpc(intptr_t x);
 369 #endif
 370 
 371 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 372   // In order to get locks to work, we need to fake an in_VM state
 373   JavaThread* thread = JavaThread::current();
 374   JavaThreadState saved_state = thread->thread_state();
 375   thread->set_thread_state(_thread_in_vm);
 376   if (ShowMessageBoxOnError) {
 377     JavaThread* thread = JavaThread::current();
 378     JavaThreadState saved_state = thread->thread_state();
 379     thread->set_thread_state(_thread_in_vm);
 380     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 381       ttyLocker ttyl;
 382       BytecodeCounter::print();
 383     }
 384     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 385     // This is the value of eip which points to where verify_oop will return.
 386     if (os::message_box(msg, "Execution stopped, print registers?")) {
 387       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 388       BREAKPOINT;
 389     }
 390   }
 391   fatal("DEBUG MESSAGE: %s", msg);
 392 }
 393 
 394 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 395   ttyLocker ttyl;
 396   FlagSetting fs(Debugging, true);
 397   tty->print_cr("eip = 0x%08x", eip);
 398 #ifndef PRODUCT
 399   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 400     tty->cr();
 401     findpc(eip);
 402     tty->cr();
 403   }
 404 #endif
 405 #define PRINT_REG(rax) \
 406   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 407   PRINT_REG(rax);
 408   PRINT_REG(rbx);
 409   PRINT_REG(rcx);
 410   PRINT_REG(rdx);
 411   PRINT_REG(rdi);
 412   PRINT_REG(rsi);
 413   PRINT_REG(rbp);
 414   PRINT_REG(rsp);
 415 #undef PRINT_REG
 416   // Print some words near the top of the stack.
 417   int* dump_sp = (int*) rsp;
 418   for (int col1 = 0; col1 < 8; col1++) {
 419     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 420     os::print_location(tty, *dump_sp++);
 421   }
 422   for (int row = 0; row < 16; row++) {
 423     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 424     for (int col = 0; col < 8; col++) {
 425       tty->print(" 0x%08x", *dump_sp++);
 426     }
 427     tty->cr();
 428   }
 429   // Print some instructions around pc:
 430   Disassembler::decode((address)eip-64, (address)eip);
 431   tty->print_cr("--------");
 432   Disassembler::decode((address)eip, (address)eip+32);
 433 }
 434 
 435 void MacroAssembler::stop(const char* msg) {
 436   ExternalAddress message((address)msg);
 437   // push address of message
 438   pushptr(message.addr());
 439   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 440   pusha();                                            // push registers
 441   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 442   hlt();
 443 }
 444 
 445 void MacroAssembler::warn(const char* msg) {
 446   push_CPU_state();
 447 
 448   ExternalAddress message((address) msg);
 449   // push address of message
 450   pushptr(message.addr());
 451 
 452   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 453   addl(rsp, wordSize);       // discard argument
 454   pop_CPU_state();
 455 }
 456 
 457 void MacroAssembler::print_state() {
 458   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 459   pusha();                                            // push registers
 460 
 461   push_CPU_state();
 462   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 463   pop_CPU_state();
 464 
 465   popa();
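       // discard the eip pushed at the start of this function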
 466   addl(rsp, wordSize);
 467 }
 468 
 469 #else // _LP64
 470 
 471 // 64 bit versions
 472 
 473 Address MacroAssembler::as_Address(AddressLiteral adr) {
 474   // amd64 always does this as a pc-rel
 475   // we can be absolute or disp based on the instruction type
 476   // jmp/call are displacements; others are absolute
 477   assert(!adr.is_lval(), "must be rval");
 478   assert(reachable(adr), "must be");
 479   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 480 
 481 }
 482 
 483 Address MacroAssembler::as_Address(ArrayAddress adr) {
 484   AddressLiteral base = adr.base();
 485   lea(rscratch1, base);
 486   Address index = adr.index();
 487   assert(index._disp == 0, "must not have disp"); // maybe it can?
 488   Address array(rscratch1, index._index, index._scale, index._disp);
 489   return array;
 490 }
 491 
 492 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 493   Label L, E;
 494 
 495 #ifdef _WIN64
 496   // Windows always allocates space for its register args
 497   assert(num_args <= 4, "only register arguments supported");
 498   subq(rsp,  frame::arg_reg_save_area_bytes);
 499 #endif
 500 
 501   // Align stack if necessary
 502   testl(rsp, 15);
 503   jcc(Assembler::zero, L);
 504 
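       // rsp is not 16-byte aligned here; it is assumed to be off by exactly one word,
       // so drop 8 bytes, make the call, and restore rsp afterwards.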
 505   subq(rsp, 8);
 506   {
 507     call(RuntimeAddress(entry_point));
 508   }
 509   addq(rsp, 8);
 510   jmp(E);
 511 
 512   bind(L);
 513   {
 514     call(RuntimeAddress(entry_point));
 515   }
 516 
 517   bind(E);
 518 
 519 #ifdef _WIN64
 520   // restore stack pointer
 521   addq(rsp, frame::arg_reg_save_area_bytes);
 522 #endif
 523 
 524 }
 525 
 526 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 527   assert(!src2.is_lval(), "should use cmpptr");
 528 
 529   if (reachable(src2)) {
 530     cmpq(src1, as_Address(src2));
 531   } else {
 532     lea(rscratch1, src2);
 533     Assembler::cmpq(src1, Address(rscratch1, 0));
 534   }
 535 }
 536 
 537 int MacroAssembler::corrected_idivq(Register reg) {
 538   // Full implementation of Java ldiv and lrem; checks for special
 539   // case as described in JVM spec., p.243 & p.271.  The function
 540   // returns the (pc) offset of the idivq instruction - may be needed
 541   // for implicit exceptions.
 542   //
 543   //         normal case                           special case
 544   //
 545   // input : rax: dividend                         min_long
 546   //         reg: divisor   (may not be rax/rdx)   -1
 547   //
 548   // output: rax: quotient  (= rax idiv reg)       min_long
 549   //         rdx: remainder (= rax irem reg)       0
 550   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 551   static const int64_t min_long = 0x8000000000000000;
 552   Label normal_case, special_case;
 553 
 554   // check for special case
 555   cmp64(rax, ExternalAddress((address) &min_long));
 556   jcc(Assembler::notEqual, normal_case);
 557   xorl(rdx, rdx); // prepare rdx for possible special case (where
 558                   // remainder = 0)
 559   cmpq(reg, -1);
 560   jcc(Assembler::equal, special_case);
 561 
 562   // handle normal case
 563   bind(normal_case);
 564   cdqq();
 565   int idivq_offset = offset();
 566   idivq(reg);
 567 
 568   // normal and special case exit
 569   bind(special_case);
 570 
 571   return idivq_offset;
 572 }
 573 
 574 void MacroAssembler::decrementq(Register reg, int value) {
 575   if (value == min_jint) { subq(reg, value); return; }
 576   if (value <  0) { incrementq(reg, -value); return; }
 577   if (value == 0) {                        ; return; }
 578   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 579   /* else */      { subq(reg, value)       ; return; }
 580 }
 581 
 582 void MacroAssembler::decrementq(Address dst, int value) {
 583   if (value == min_jint) { subq(dst, value); return; }
 584   if (value <  0) { incrementq(dst, -value); return; }
 585   if (value == 0) {                        ; return; }
 586   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 587   /* else */      { subq(dst, value)       ; return; }
 588 }
 589 
 590 void MacroAssembler::incrementq(AddressLiteral dst) {
 591   if (reachable(dst)) {
 592     incrementq(as_Address(dst));
 593   } else {
 594     lea(rscratch1, dst);
 595     incrementq(Address(rscratch1, 0));
 596   }
 597 }
 598 
 599 void MacroAssembler::incrementq(Register reg, int value) {
 600   if (value == min_jint) { addq(reg, value); return; }
 601   if (value <  0) { decrementq(reg, -value); return; }
 602   if (value == 0) {                        ; return; }
 603   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 604   /* else */      { addq(reg, value)       ; return; }
 605 }
 606 
 607 void MacroAssembler::incrementq(Address dst, int value) {
 608   if (value == min_jint) { addq(dst, value); return; }
 609   if (value <  0) { decrementq(dst, -value); return; }
 610   if (value == 0) {                        ; return; }
 611   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 612   /* else */      { addq(dst, value)       ; return; }
 613 }
 614 
 615 // 32bit can do a case table jump in one instruction but we no longer allow the base
 616 // to be installed in the Address class
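     // On 64-bit the base address is materialized into rscratch1 and patched into the
     // index Address before the jmp.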
 617 void MacroAssembler::jump(ArrayAddress entry) {
 618   lea(rscratch1, entry.base());
 619   Address dispatch = entry.index();
 620   assert(dispatch._base == noreg, "must be");
 621   dispatch._base = rscratch1;
 622   jmp(dispatch);
 623 }
 624 
 625 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 626   ShouldNotReachHere(); // 64bit doesn't use two regs
 627   cmpq(x_lo, y_lo);
 628 }
 629 
 630 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 631     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 632 }
 633 
 634 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 635   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 636   movptr(dst, rscratch1);
 637 }
 638 
 639 void MacroAssembler::leave() {
 640   // %%% is this really better? Why not on 32bit too?
 641   emit_int8((unsigned char)0xC9); // LEAVE
 642 }
 643 
 644 void MacroAssembler::lneg(Register hi, Register lo) {
 645   ShouldNotReachHere(); // 64bit doesn't use two regs
 646   negq(lo);
 647 }
 648 
 649 void MacroAssembler::movoop(Register dst, jobject obj) {
 650   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 651 }
 652 
 653 void MacroAssembler::movoop(Address dst, jobject obj) {
 654   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 655   movq(dst, rscratch1);
 656 }
 657 
 658 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 659   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 660 }
 661 
 662 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 663   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 664   movq(dst, rscratch1);
 665 }
 666 
 667 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 668   if (src.is_lval()) {
 669     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 670   } else {
 671     if (reachable(src)) {
 672       movq(dst, as_Address(src));
 673     } else {
 674       lea(scratch, src);
 675       movq(dst, Address(scratch, 0));
 676     }
 677   }
 678 }
 679 
 680 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 681   movq(as_Address(dst), src);
 682 }
 683 
 684 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 685   movq(dst, as_Address(src));
 686 }
 687 
 688 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 689 void MacroAssembler::movptr(Address dst, intptr_t src) {
 690   if (is_simm32(src)) {
 691     movptr(dst, checked_cast<int32_t>(src));
 692   } else {
 693     mov64(rscratch1, src);
 694     movq(dst, rscratch1);
 695   }
 696 }
 697 
 698 // These are mostly for initializing NULL
 699 void MacroAssembler::movptr(Address dst, int32_t src) {
 700   movslq(dst, src);
 701 }
 702 
 703 void MacroAssembler::movptr(Register dst, int32_t src) {
 704   mov64(dst, (intptr_t)src);
 705 }
 706 
 707 void MacroAssembler::pushoop(jobject obj) {
 708   movoop(rscratch1, obj);
 709   push(rscratch1);
 710 }
 711 
 712 void MacroAssembler::pushklass(Metadata* obj) {
 713   mov_metadata(rscratch1, obj);
 714   push(rscratch1);
 715 }
 716 
 717 void MacroAssembler::pushptr(AddressLiteral src) {
 718   lea(rscratch1, src);
 719   if (src.is_lval()) {
 720     push(rscratch1);
 721   } else {
 722     pushq(Address(rscratch1, 0));
 723   }
 724 }
 725 
 726 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 727   reset_last_Java_frame(r15_thread, clear_fp);
 728 }
 729 
 730 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 731                                          Register last_java_fp,
 732                                          address  last_java_pc) {
 733   vzeroupper();
 734   // determine last_java_sp register
 735   if (!last_java_sp->is_valid()) {
 736     last_java_sp = rsp;
 737   }
 738 
 739   // last_java_fp is optional
 740   if (last_java_fp->is_valid()) {
 741     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 742            last_java_fp);
 743   }
 744 
 745   // last_java_pc is optional
 746   if (last_java_pc != NULL) {
 747     Address java_pc(r15_thread,
 748                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 749     lea(rscratch1, InternalAddress(last_java_pc));
 750     movptr(java_pc, rscratch1);
 751   }
 752 
 753   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 754 }
 755 
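     // On 64-bit the VM call arguments travel in the c_rargN registers, so the pass_argN
     // helpers only need a register-to-register move when the value is not already in
     // place (contrast with the 32-bit versions above, which push onto the stack).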
 756 static void pass_arg0(MacroAssembler* masm, Register arg) {
 757   if (c_rarg0 != arg ) {
 758     masm->mov(c_rarg0, arg);
 759   }
 760 }
 761 
 762 static void pass_arg1(MacroAssembler* masm, Register arg) {
 763   if (c_rarg1 != arg ) {
 764     masm->mov(c_rarg1, arg);
 765   }
 766 }
 767 
 768 static void pass_arg2(MacroAssembler* masm, Register arg) {
 769   if (c_rarg2 != arg ) {
 770     masm->mov(c_rarg2, arg);
 771   }
 772 }
 773 
 774 static void pass_arg3(MacroAssembler* masm, Register arg) {
 775   if (c_rarg3 != arg ) {
 776     masm->mov(c_rarg3, arg);
 777   }
 778 }
 779 
 780 void MacroAssembler::stop(const char* msg) {
 781   if (ShowMessageBoxOnError) {
 782     address rip = pc();
 783     pusha(); // get regs on stack
 784     lea(c_rarg1, InternalAddress(rip));
 785     movq(c_rarg2, rsp); // pass pointer to regs array
 786   }
 787   lea(c_rarg0, ExternalAddress((address) msg));
 788   andq(rsp, -16); // align stack as required by ABI
 789   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 790   hlt();
 791 }
 792 
 793 void MacroAssembler::warn(const char* msg) {
 794   push(rbp);
 795   movq(rbp, rsp);
 796   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 797   push_CPU_state();   // keeps alignment at 16 bytes
 798   lea(c_rarg0, ExternalAddress((address) msg));
 799   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 800   call(rax);
 801   pop_CPU_state();
 802   mov(rsp, rbp);
 803   pop(rbp);
 804 }
 805 
 806 void MacroAssembler::print_state() {
 807   address rip = pc();
 808   pusha();            // get regs on stack
 809   push(rbp);
 810   movq(rbp, rsp);
 811   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 812   push_CPU_state();   // keeps alignment at 16 bytes
 813 
 814   lea(c_rarg0, InternalAddress(rip));
 815   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 816   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 817 
 818   pop_CPU_state();
 819   mov(rsp, rbp);
 820   pop(rbp);
 821   popa();
 822 }
 823 
 824 #ifndef PRODUCT
 825 extern "C" void findpc(intptr_t x);
 826 #endif
 827 
 828 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 829   // In order to get locks to work, we need to fake an in_VM state
 830   if (ShowMessageBoxOnError) {
 831     JavaThread* thread = JavaThread::current();
 832     JavaThreadState saved_state = thread->thread_state();
 833     thread->set_thread_state(_thread_in_vm);
 834 #ifndef PRODUCT
 835     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 836       ttyLocker ttyl;
 837       BytecodeCounter::print();
 838     }
 839 #endif
 840     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 841     // XXX correct this offset for amd64
 842     // This is the value of eip which points to where verify_oop will return.
 843     if (os::message_box(msg, "Execution stopped, print registers?")) {
 844       print_state64(pc, regs);
 845       BREAKPOINT;
 846     }
 847   }
 848   fatal("DEBUG MESSAGE: %s", msg);
 849 }
 850 
 851 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 852   ttyLocker ttyl;
 853   FlagSetting fs(Debugging, true);
 854   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 855 #ifndef PRODUCT
 856   tty->cr();
 857   findpc(pc);
 858   tty->cr();
 859 #endif
 860 #define PRINT_REG(rax, value) \
 861   { tty->print("%s = ", #rax); os::print_location(tty, value); }
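       // regs[] is the block that pusha() left on the stack in the caller: the last
       // register pushed (r15) sits at regs[0] and the first (rax) at regs[15].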
 862   PRINT_REG(rax, regs[15]);
 863   PRINT_REG(rbx, regs[12]);
 864   PRINT_REG(rcx, regs[14]);
 865   PRINT_REG(rdx, regs[13]);
 866   PRINT_REG(rdi, regs[8]);
 867   PRINT_REG(rsi, regs[9]);
 868   PRINT_REG(rbp, regs[10]);
 869   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 870   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 871   PRINT_REG(r8 , regs[7]);
 872   PRINT_REG(r9 , regs[6]);
 873   PRINT_REG(r10, regs[5]);
 874   PRINT_REG(r11, regs[4]);
 875   PRINT_REG(r12, regs[3]);
 876   PRINT_REG(r13, regs[2]);
 877   PRINT_REG(r14, regs[1]);
 878   PRINT_REG(r15, regs[0]);
 879 #undef PRINT_REG
 880   // Print some words near the top of the stack.
 881   int64_t* rsp = &regs[16];
 882   int64_t* dump_sp = rsp;
 883   for (int col1 = 0; col1 < 8; col1++) {
 884     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 885     os::print_location(tty, *dump_sp++);
 886   }
 887   for (int row = 0; row < 25; row++) {
 888     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 889     for (int col = 0; col < 4; col++) {
 890       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 891     }
 892     tty->cr();
 893   }
 894   // Print some instructions around pc:
 895   Disassembler::decode((address)pc-64, (address)pc);
 896   tty->print_cr("--------");
 897   Disassembler::decode((address)pc, (address)pc+32);
 898 }
 899 
 900 // The java_calling_convention describes stack locations as ideal slots on
 901 // a frame with no abi restrictions. Since we must observe abi restrictions
 902 // (like the placement of the register window) the slots must be biased by
 903 // the following value.
 904 static int reg2offset_in(VMReg r) {
 905   // Account for saved rbp and return address
 906   // This should really be in_preserve_stack_slots
 907   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 908 }
 909 
 910 static int reg2offset_out(VMReg r) {
 911   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 912 }
 913 
 914 // A long move
 915 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 916 
 917   // The calling convention assures us that each VMRegPair is either
 918   // really one physical register or a pair of adjacent stack slots.
 919 
 920   if (src.is_single_phys_reg() ) {
 921     if (dst.is_single_phys_reg()) {
 922       if (dst.first() != src.first()) {
 923         mov(dst.first()->as_Register(), src.first()->as_Register());
 924       }
 925     } else {
 926       assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
 927        src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
 928       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 929     }
 930   } else if (dst.is_single_phys_reg()) {
 931     assert(src.is_single_reg(),  "not a stack pair");
 932     movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 933   } else {
 934     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 935     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 936     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 937   }
 938 }
 939 
 940 // A double move
 941 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 942 
 943   // The calling convention assures us that each VMRegPair is either
 944   // really one physical register or a pair of adjacent stack slots.
 945 
 946   if (src.is_single_phys_reg() ) {
 947     if (dst.is_single_phys_reg()) {
 948       // In theory these overlap but the ordering is such that this is likely a nop
 949       if ( src.first() != dst.first()) {
 950         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 951       }
 952     } else {
 953       assert(dst.is_single_reg(), "not a stack pair");
 954       movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 955     }
 956   } else if (dst.is_single_phys_reg()) {
 957     assert(src.is_single_reg(),  "not a stack pair");
 958     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 959   } else {
 960     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 961     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 962     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 963   }
 964 }
 965 
 966 
 967 // A float arg may have to do a float reg <-> int reg conversion
 968 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 969   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 970 
 971   // The calling convention assures us that each VMRegPair is either
 972   // really one physical register or a pair of adjacent stack slots.
 973 
 974   if (src.first()->is_stack()) {
 975     if (dst.first()->is_stack()) {
 976       movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 977       movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 978     } else {
 979       // stack to reg
 980       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 981       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 982     }
 983   } else if (dst.first()->is_stack()) {
 984     // reg to stack
 985     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 986     movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 987   } else {
 988     // reg to reg
 989     // In theory these overlap but the ordering is such that this is likely a nop
 990     if ( src.first() != dst.first()) {
 991       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
 992     }
 993   }
 994 }
 995 
 996 // On 64 bit we will store integer-like items to the stack as
 997 // 64-bit items (x86_32/64 ABI) even though Java would only store
 998 // 32 bits for a parameter. On 32 bit it will simply be 32 bits.
 999 // So this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
1000 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
1001   if (src.first()->is_stack()) {
1002     if (dst.first()->is_stack()) {
1003       // stack to stack
1004       movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
1005       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
1006     } else {
1007       // stack to reg
1008       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
1009     }
1010   } else if (dst.first()->is_stack()) {
1011     // reg to stack
1012     // Do we really have to sign extend???
1013     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1014     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
1015   } else {
1016     // Do we really have to sign extend???
1017     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1018     if (dst.first() != src.first()) {
1019       movq(dst.first()->as_Register(), src.first()->as_Register());
1020     }
1021   }
1022 }
1023 
1024 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1025   if (src.first()->is_stack()) {
1026     if (dst.first()->is_stack()) {
1027       // stack to stack
1028       movq(rax, Address(rbp, reg2offset_in(src.first())));
1029       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1030     } else {
1031       // stack to reg
1032       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1033     }
1034   } else if (dst.first()->is_stack()) {
1035     // reg to stack
1036     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1037   } else {
1038     if (dst.first() != src.first()) {
1039       movq(dst.first()->as_Register(), src.first()->as_Register());
1040     }
1041   }
1042 }
1043 
1044 // An oop arg. Must pass a handle not the oop itself
1045 void MacroAssembler::object_move(OopMap* map,
1046                         int oop_handle_offset,
1047                         int framesize_in_slots,
1048                         VMRegPair src,
1049                         VMRegPair dst,
1050                         bool is_receiver,
1051                         int* receiver_offset) {
1052 
1053   // must pass a handle. First figure out the location we use as a handle
1054 
1055   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1056 
1057   // See if oop is NULL; if it is we need no handle
1058 
1059   if (src.first()->is_stack()) {
1060 
1061     // Oop is already on the stack as an argument
1062     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1063     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1064     if (is_receiver) {
1065       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1066     }
1067 
1068     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1069     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1070     // conditionally move a NULL
1071     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1072   } else {
1073 
1074     // Oop is in a register; we must store it to the space we reserve
1075     // on the stack for oop_handles and pass a handle if oop is non-NULL
1076 
1077     const Register rOop = src.first()->as_Register();
1078     int oop_slot;
1079     if (rOop == j_rarg0)
1080       oop_slot = 0;
1081     else if (rOop == j_rarg1)
1082       oop_slot = 1;
1083     else if (rOop == j_rarg2)
1084       oop_slot = 2;
1085     else if (rOop == j_rarg3)
1086       oop_slot = 3;
1087     else if (rOop == j_rarg4)
1088       oop_slot = 4;
1089     else {
1090       assert(rOop == j_rarg5, "wrong register");
1091       oop_slot = 5;
1092     }
1093 
1094     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1095     int offset = oop_slot*VMRegImpl::stack_slot_size;
1096 
1097     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1098     // Store oop in handle area, may be NULL
1099     movptr(Address(rsp, offset), rOop);
1100     if (is_receiver) {
1101       *receiver_offset = offset;
1102     }
1103 
1104     cmpptr(rOop, (int32_t)NULL_WORD);
1105     lea(rHandle, Address(rsp, offset));
1106     // conditionally move a NULL from the handle area where it was just stored
1107     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1108   }
1109 
1110   // If arg is on the stack then place it, otherwise it is already in the correct reg.
1111   if (dst.first()->is_stack()) {
1112     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1113   }
1114 }
1115 
1116 #endif // _LP64
1117 
1118 // Now versions that are common to 32/64 bit
1119 
1120 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1121   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1122 }
1123 
1124 void MacroAssembler::addptr(Register dst, Register src) {
1125   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1126 }
1127 
1128 void MacroAssembler::addptr(Address dst, Register src) {
1129   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1130 }
1131 
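     // Many of the AddressLiteral flavours below share the same shape: if the literal is
     // reachable (rip-relative on 64-bit) it is used directly, otherwise the address is
     // first materialized into a scratch register and the access goes through that register.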
1132 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1133   if (reachable(src)) {
1134     Assembler::addsd(dst, as_Address(src));
1135   } else {
1136     lea(rscratch1, src);
1137     Assembler::addsd(dst, Address(rscratch1, 0));
1138   }
1139 }
1140 
1141 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1142   if (reachable(src)) {
1143     addss(dst, as_Address(src));
1144   } else {
1145     lea(rscratch1, src);
1146     addss(dst, Address(rscratch1, 0));
1147   }
1148 }
1149 
1150 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1151   if (reachable(src)) {
1152     Assembler::addpd(dst, as_Address(src));
1153   } else {
1154     lea(rscratch1, src);
1155     Assembler::addpd(dst, Address(rscratch1, 0));
1156   }
1157 }
1158 
1159 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1160 // Stub code is generated once and never copied.
1161 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1162 void MacroAssembler::align64() {
1163   align(64, (unsigned long long) pc());
1164 }
1165 
1166 void MacroAssembler::align32() {
1167   align(32, (unsigned long long) pc());
1168 }
1169 
1170 void MacroAssembler::align(int modulus) {
1171   // 8273459: Ensure alignment is possible with current segment alignment
1172   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1173   align(modulus, offset());
1174 }
1175 
1176 void MacroAssembler::align(int modulus, int target) {
1177   if (target % modulus != 0) {
1178     nop(modulus - (target % modulus));
1179   }
1180 }
1181 
1182 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1183   // Used in sign-masking with aligned address.
1184   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1185   if (reachable(src)) {
1186     Assembler::andpd(dst, as_Address(src));
1187   } else {
1188     lea(scratch_reg, src);
1189     Assembler::andpd(dst, Address(scratch_reg, 0));
1190   }
1191 }
1192 
1193 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1194   // Used in sign-masking with aligned address.
1195   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1196   if (reachable(src)) {
1197     Assembler::andps(dst, as_Address(src));
1198   } else {
1199     lea(scratch_reg, src);
1200     Assembler::andps(dst, Address(scratch_reg, 0));
1201   }
1202 }
1203 
1204 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1205   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1206 }
1207 
1208 void MacroAssembler::atomic_incl(Address counter_addr) {
1209   lock();
1210   incrementl(counter_addr);
1211 }
1212 
1213 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1214   if (reachable(counter_addr)) {
1215     atomic_incl(as_Address(counter_addr));
1216   } else {
1217     lea(scr, counter_addr);
1218     atomic_incl(Address(scr, 0));
1219   }
1220 }
1221 
1222 #ifdef _LP64
1223 void MacroAssembler::atomic_incq(Address counter_addr) {
1224   lock();
1225   incrementq(counter_addr);
1226 }
1227 
1228 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1229   if (reachable(counter_addr)) {
1230     atomic_incq(as_Address(counter_addr));
1231   } else {
1232     lea(scr, counter_addr);
1233     atomic_incq(Address(scr, 0));
1234   }
1235 }
1236 #endif
1237 
1238 // Writes to successive stack pages until the given offset is reached, to check for
1239 // stack overflow + shadow pages.  This clobbers tmp.
1240 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1241   movptr(tmp, rsp);
1242   // Bang stack for total size given plus shadow page size.
1243   // Bang one page at a time because large size can bang beyond yellow and
1244   // red zones.
1245   Label loop;
1246   bind(loop);
1247   movl(Address(tmp, (-os::vm_page_size())), size );
1248   subptr(tmp, os::vm_page_size());
1249   subl(size, os::vm_page_size());
1250   jcc(Assembler::greater, loop);
1251 
1252   // Bang down shadow pages too.
1253   // At this point, (tmp-0) is the last address touched, so don't
1254   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1255   // was post-decremented.)  Skip this address by starting at i=1, and
1256   // touch a few more pages below.  N.B.  It is important to touch all
1257   // the way down including all pages in the shadow zone.
1258   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1259     // this could be any sized move but this can be a debugging crumb
1260     // so the bigger the better.
1261     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1262   }
1263 }
1264 
1265 void MacroAssembler::reserved_stack_check() {
1266     // testing if reserved zone needs to be enabled
1267     Label no_reserved_zone_enabling;
1268     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1269     NOT_LP64(get_thread(rsi);)
1270 
1271     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1272     jcc(Assembler::below, no_reserved_zone_enabling);
1273 
1274     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1275     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1276     should_not_reach_here();
1277 
1278     bind(no_reserved_zone_enabling);
1279 }
1280 
1281 void MacroAssembler::c2bool(Register x) {
1282   // implements x == 0 ? 0 : 1
1283   // note: must only look at least-significant byte of x
1284   //       since C-style booleans are stored in one byte
1285   //       only! (was bug)
1286   andl(x, 0xFF);
1287   setb(Assembler::notZero, x);
1288 }
1289 
1290 // Wouldn't need if AddressLiteral version had new name
1291 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1292   Assembler::call(L, rtype);
1293 }
1294 
1295 void MacroAssembler::call(Register entry) {
1296   Assembler::call(entry);
1297 }
1298 
1299 void MacroAssembler::call(AddressLiteral entry) {
1300   if (reachable(entry)) {
1301     Assembler::call_literal(entry.target(), entry.rspec());
1302   } else {
1303     lea(rscratch1, entry);
1304     Assembler::call(rscratch1);
1305   }
1306 }
1307 
1308 void MacroAssembler::ic_call(address entry, jint method_index) {
1309   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1310   movptr(rax, (intptr_t)Universe::non_oop_word());
1311   call(AddressLiteral(entry, rh));
1312 }
1313 
1314 // Implementation of call_VM versions
1315 
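     // Each overload below emits a small local stub: 'call C' pushes the address of the
     // following 'jmp E' as a return address identifying this call site, the stub passes
     // the arguments and invokes call_VM_helper, and its 'ret' lands back on the jump
     // that skips over the stub.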
1316 void MacroAssembler::call_VM(Register oop_result,
1317                              address entry_point,
1318                              bool check_exceptions) {
1319   Label C, E;
1320   call(C, relocInfo::none);
1321   jmp(E);
1322 
1323   bind(C);
1324   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1325   ret(0);
1326 
1327   bind(E);
1328 }
1329 
1330 void MacroAssembler::call_VM(Register oop_result,
1331                              address entry_point,
1332                              Register arg_1,
1333                              bool check_exceptions) {
1334   Label C, E;
1335   call(C, relocInfo::none);
1336   jmp(E);
1337 
1338   bind(C);
1339   pass_arg1(this, arg_1);
1340   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1341   ret(0);
1342 
1343   bind(E);
1344 }
1345 
1346 void MacroAssembler::call_VM(Register oop_result,
1347                              address entry_point,
1348                              Register arg_1,
1349                              Register arg_2,
1350                              bool check_exceptions) {
1351   Label C, E;
1352   call(C, relocInfo::none);
1353   jmp(E);
1354 
1355   bind(C);
1356 
1357   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1358 
1359   pass_arg2(this, arg_2);
1360   pass_arg1(this, arg_1);
1361   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1362   ret(0);
1363 
1364   bind(E);
1365 }
1366 
1367 void MacroAssembler::call_VM(Register oop_result,
1368                              address entry_point,
1369                              Register arg_1,
1370                              Register arg_2,
1371                              Register arg_3,
1372                              bool check_exceptions) {
1373   Label C, E;
1374   call(C, relocInfo::none);
1375   jmp(E);
1376 
1377   bind(C);
1378 
1379   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1380   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1381   pass_arg3(this, arg_3);
1382 
1383   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1384   pass_arg2(this, arg_2);
1385 
1386   pass_arg1(this, arg_1);
1387   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1388   ret(0);
1389 
1390   bind(E);
1391 }
1392 
1393 void MacroAssembler::call_VM(Register oop_result,
1394                              Register last_java_sp,
1395                              address entry_point,
1396                              int number_of_arguments,
1397                              bool check_exceptions) {
1398   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1399   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1400 }
1401 
1402 void MacroAssembler::call_VM(Register oop_result,
1403                              Register last_java_sp,
1404                              address entry_point,
1405                              Register arg_1,
1406                              bool check_exceptions) {
1407   pass_arg1(this, arg_1);
1408   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1409 }
1410 
1411 void MacroAssembler::call_VM(Register oop_result,
1412                              Register last_java_sp,
1413                              address entry_point,
1414                              Register arg_1,
1415                              Register arg_2,
1416                              bool check_exceptions) {
1417 
1418   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1419   pass_arg2(this, arg_2);
1420   pass_arg1(this, arg_1);
1421   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1422 }
1423 
1424 void MacroAssembler::call_VM(Register oop_result,
1425                              Register last_java_sp,
1426                              address entry_point,
1427                              Register arg_1,
1428                              Register arg_2,
1429                              Register arg_3,
1430                              bool check_exceptions) {
1431   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1432   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1433   pass_arg3(this, arg_3);
1434   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1435   pass_arg2(this, arg_2);
1436   pass_arg1(this, arg_1);
1437   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1438 }
1439 
1440 void MacroAssembler::super_call_VM(Register oop_result,
1441                                    Register last_java_sp,
1442                                    address entry_point,
1443                                    int number_of_arguments,
1444                                    bool check_exceptions) {
1445   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1446   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1447 }
1448 
1449 void MacroAssembler::super_call_VM(Register oop_result,
1450                                    Register last_java_sp,
1451                                    address entry_point,
1452                                    Register arg_1,
1453                                    bool check_exceptions) {
1454   pass_arg1(this, arg_1);
1455   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1456 }
1457 
1458 void MacroAssembler::super_call_VM(Register oop_result,
1459                                    Register last_java_sp,
1460                                    address entry_point,
1461                                    Register arg_1,
1462                                    Register arg_2,
1463                                    bool check_exceptions) {
1464 
1465   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1466   pass_arg2(this, arg_2);
1467   pass_arg1(this, arg_1);
1468   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1469 }
1470 
1471 void MacroAssembler::super_call_VM(Register oop_result,
1472                                    Register last_java_sp,
1473                                    address entry_point,
1474                                    Register arg_1,
1475                                    Register arg_2,
1476                                    Register arg_3,
1477                                    bool check_exceptions) {
1478   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1479   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1480   pass_arg3(this, arg_3);
1481   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1482   pass_arg2(this, arg_2);
1483   pass_arg1(this, arg_1);
1484   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1485 }
1486 
1487 void MacroAssembler::call_VM_base(Register oop_result,
1488                                   Register java_thread,
1489                                   Register last_java_sp,
1490                                   address  entry_point,
1491                                   int      number_of_arguments,
1492                                   bool     check_exceptions) {
1493   // determine java_thread register
1494   if (!java_thread->is_valid()) {
1495 #ifdef _LP64
1496     java_thread = r15_thread;
1497 #else
1498     java_thread = rdi;
1499     get_thread(java_thread);
1500 #endif // LP64
1501   }
1502   // determine last_java_sp register
1503   if (!last_java_sp->is_valid()) {
1504     last_java_sp = rsp;
1505   }
1506   // debugging support
1507   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1508   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1509 #ifdef ASSERT
1510   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1511   // r12 is the heapbase.
1512   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1513 #endif // ASSERT
1514 
1515   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1516   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1517 
1518   // push java thread (becomes first argument of C function)
1519 
1520   NOT_LP64(push(java_thread); number_of_arguments++);
1521   LP64_ONLY(mov(c_rarg0, r15_thread));
1522 
1523   // set last Java frame before call
1524   assert(last_java_sp != rbp, "can't use ebp/rbp");
1525 
1526   // Only interpreter should have to set fp
1527   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1528 
1529   // do the call, remove parameters
1530   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1531 
1532   // restore the thread (we cannot use the pushed argument since arguments
1533   // may be overwritten by C code generated by an optimizing compiler);
1534   // however, we can use the register value directly if it is callee-saved.
1535   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1536     // rdi & rsi (also r15) are callee saved -> nothing to do
1537 #ifdef ASSERT
1538     guarantee(java_thread != rax, "change this code");
1539     push(rax);
1540     { Label L;
1541       get_thread(rax);
1542       cmpptr(java_thread, rax);
1543       jcc(Assembler::equal, L);
1544       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1545       bind(L);
1546     }
1547     pop(rax);
1548 #endif
1549   } else {
1550     get_thread(java_thread);
1551   }
1552   // reset last Java frame
1553   // Only interpreter should have to clear fp
1554   reset_last_Java_frame(java_thread, true);
1555 
1556    // C++ interp handles this in the interpreter
1557   check_and_handle_popframe(java_thread);
1558   check_and_handle_earlyret(java_thread);
1559 
1560   if (check_exceptions) {
1561     // check for pending exceptions (java_thread is set upon return)
1562     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1563 #ifndef _LP64
1564     jump_cc(Assembler::notEqual,
1565             RuntimeAddress(StubRoutines::forward_exception_entry()));
1566 #else
1567     // This used to jump conditionally to forward_exception; however, if the
1568     // code is relocated it is possible that the branch will not reach. So we
1569     // jump around an unconditional jump, which can always reach the target.
1570 
1571     Label ok;
1572     jcc(Assembler::equal, ok);
1573     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1574     bind(ok);
1575 #endif // LP64
1576   }
1577 
1578   // get oop result if there is one and reset the value in the thread
1579   if (oop_result->is_valid()) {
1580     get_vm_result(oop_result, java_thread);
1581   }
1582 }
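     // In outline, call_VM_base above:
     //   1. resolves the thread register (r15_thread on 64-bit, get_thread on
     //      32-bit) and defaults last_java_sp to rsp,
     //   2. passes the thread as the first C argument (pushed on 32-bit, moved
     //      into c_rarg0 on 64-bit),
     //   3. brackets call_VM_leaf_base(entry_point, ...) with
     //      set_last_Java_frame / reset_last_Java_frame,
     //   4. forwards a pending exception if check_exceptions is set, and
     //   5. fetches the oop result from JavaThread::vm_result when requested.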
1583 
1584 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1585 
1586   // Calculating the value for last_Java_sp is somewhat subtle.
1587   // call_VM does an intermediate call which places a return address on
1588   // the stack just under the stack pointer as the caller left it.
1589   // This allows us to retrieve last_Java_pc from last_Java_sp[-1].
1590   // On 32-bit we then also have to push the additional arguments on the
1591   // stack to accomplish the actual requested call. On 64-bit, call_VM can
1592   // only use register arguments, so the only extra stack space is the
1593   // return address that call_VM created.
1594   // This hopefully explains the calculations here.
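       // Illustrative stack sketch for the 64-bit case at this point
       // (assuming the intermediate call described above):
       //
       //   rsp + wordSize -> SP the Java caller had      == last_Java_sp
       //   rsp            -> return address of that call == last_Java_sp[-1]
       //
       // On 32-bit the number_of_arguments pushed argument words are skipped
       // as well: last_Java_sp = rsp + (1 + number_of_arguments) * wordSize.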
1595 
1596 #ifdef _LP64
1597   // We've pushed one address, correct last_Java_sp
1598   lea(rax, Address(rsp, wordSize));
1599 #else
1600   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1601 #endif // LP64
1602 
1603   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1604 
1605 }
1606 
1607 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1608 void MacroAssembler::call_VM_leaf0(address entry_point) {
1609   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1610 }
1611 
1612 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1613   call_VM_leaf_base(entry_point, number_of_arguments);
1614 }
1615 
1616 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1617   pass_arg0(this, arg_0);
1618   call_VM_leaf(entry_point, 1);
1619 }
1620 
1621 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1622 
1623   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1624   pass_arg1(this, arg_1);
1625   pass_arg0(this, arg_0);
1626   call_VM_leaf(entry_point, 2);
1627 }
1628 
1629 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1630   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1631   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1632   pass_arg2(this, arg_2);
1633   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1634   pass_arg1(this, arg_1);
1635   pass_arg0(this, arg_0);
1636   call_VM_leaf(entry_point, 3);
1637 }
1638 
1639 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1640   pass_arg0(this, arg_0);
1641   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1642 }
1643 
1644 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1645 
1646   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1647   pass_arg1(this, arg_1);
1648   pass_arg0(this, arg_0);
1649   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1650 }
1651 
1652 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1653   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1654   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1655   pass_arg2(this, arg_2);
1656   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1657   pass_arg1(this, arg_1);
1658   pass_arg0(this, arg_0);
1659   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1660 }
1661 
1662 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1663   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1664   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1665   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1666   pass_arg3(this, arg_3);
1667   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1668   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1669   pass_arg2(this, arg_2);
1670   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1671   pass_arg1(this, arg_1);
1672   pass_arg0(this, arg_0);
1673   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1674 }
1675 
1676 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1677   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1678   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1679   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1680 }
1681 
1682 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1683   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1684   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1685 }
1686 
1687 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1688 }
1689 
1690 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1691 }
1692 
1693 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1694   if (reachable(src1)) {
1695     cmpl(as_Address(src1), imm);
1696   } else {
1697     lea(rscratch1, src1);
1698     cmpl(Address(rscratch1, 0), imm);
1699   }
1700 }
1701 
1702 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1703   assert(!src2.is_lval(), "use cmpptr");
1704   if (reachable(src2)) {
1705     cmpl(src1, as_Address(src2));
1706   } else {
1707     lea(rscratch1, src2);
1708     cmpl(src1, Address(rscratch1, 0));
1709   }
1710 }
1711 
1712 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1713   Assembler::cmpl(src1, imm);
1714 }
1715 
1716 void MacroAssembler::cmp32(Register src1, Address src2) {
1717   Assembler::cmpl(src1, src2);
1718 }
1719 
1720 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1721   ucomisd(opr1, opr2);
1722 
1723   Label L;
1724   if (unordered_is_less) {
1725     movl(dst, -1);
1726     jcc(Assembler::parity, L);
1727     jcc(Assembler::below , L);
1728     movl(dst, 0);
1729     jcc(Assembler::equal , L);
1730     increment(dst);
1731   } else { // unordered is greater
1732     movl(dst, 1);
1733     jcc(Assembler::parity, L);
1734     jcc(Assembler::above , L);
1735     movl(dst, 0);
1736     jcc(Assembler::equal , L);
1737     decrementl(dst);
1738   }
1739   bind(L);
1740 }
1741 
1742 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1743   ucomiss(opr1, opr2);
1744 
1745   Label L;
1746   if (unordered_is_less) {
1747     movl(dst, -1);
1748     jcc(Assembler::parity, L);
1749     jcc(Assembler::below , L);
1750     movl(dst, 0);
1751     jcc(Assembler::equal , L);
1752     increment(dst);
1753   } else { // unordered is greater
1754     movl(dst, 1);
1755     jcc(Assembler::parity, L);
1756     jcc(Assembler::above , L);
1757     movl(dst, 0);
1758     jcc(Assembler::equal , L);
1759     decrementl(dst);
1760   }
1761   bind(L);
1762 }
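     // The two helpers above follow the Java fcmp/dcmp result convention
     // (as for the fcmpl/fcmpg and dcmpl/dcmpg bytecodes): dst becomes -1, 0
     // or +1 for less-than, equal and greater-than. ucomisd/ucomiss set the
     // parity flag for an unordered (NaN) operand, so the early parity
     // branches make NaN produce -1 when unordered_is_less and +1 otherwise;
     // e.g. with unordered_is_less == false, comparing 1.0 with NaN leaves
     // dst == 1.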
1763 
1764 
1765 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1766   if (reachable(src1)) {
1767     cmpb(as_Address(src1), imm);
1768   } else {
1769     lea(rscratch1, src1);
1770     cmpb(Address(rscratch1, 0), imm);
1771   }
1772 }
1773 
1774 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1775 #ifdef _LP64
1776   if (src2.is_lval()) {
1777     movptr(rscratch1, src2);
1778     Assembler::cmpq(src1, rscratch1);
1779   } else if (reachable(src2)) {
1780     cmpq(src1, as_Address(src2));
1781   } else {
1782     lea(rscratch1, src2);
1783     Assembler::cmpq(src1, Address(rscratch1, 0));
1784   }
1785 #else
1786   if (src2.is_lval()) {
1787     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1788   } else {
1789     cmpl(src1, as_Address(src2));
1790   }
1791 #endif // _LP64
1792 }
1793 
1794 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1795   assert(src2.is_lval(), "not a mem-mem compare");
1796 #ifdef _LP64
1797   // moves src2's literal address
1798   movptr(rscratch1, src2);
1799   Assembler::cmpq(src1, rscratch1);
1800 #else
1801   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1802 #endif // _LP64
1803 }
1804 
1805 void MacroAssembler::cmpoop(Register src1, Register src2) {
1806   cmpptr(src1, src2);
1807 }
1808 
1809 void MacroAssembler::cmpoop(Register src1, Address src2) {
1810   cmpptr(src1, src2);
1811 }
1812 
1813 #ifdef _LP64
1814 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1815   movoop(rscratch1, src2);
1816   cmpptr(src1, rscratch1);
1817 }
1818 #endif
1819 
1820 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1821   if (reachable(adr)) {
1822     lock();
1823     cmpxchgptr(reg, as_Address(adr));
1824   } else {
1825     lea(rscratch1, adr);
1826     lock();
1827     cmpxchgptr(reg, Address(rscratch1, 0));
1828   }
1829 }
1830 
1831 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1832   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1833 }
1834 
1835 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1836   if (reachable(src)) {
1837     Assembler::comisd(dst, as_Address(src));
1838   } else {
1839     lea(rscratch1, src);
1840     Assembler::comisd(dst, Address(rscratch1, 0));
1841   }
1842 }
1843 
1844 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1845   if (reachable(src)) {
1846     Assembler::comiss(dst, as_Address(src));
1847   } else {
1848     lea(rscratch1, src);
1849     Assembler::comiss(dst, Address(rscratch1, 0));
1850   }
1851 }
1852 
1853 
1854 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1855   Condition negated_cond = negate_condition(cond);
1856   Label L;
1857   jcc(negated_cond, L);
1858   pushf(); // Preserve flags
1859   atomic_incl(counter_addr);
1860   popf();
1861   bind(L);
1862 }
1863 
1864 int MacroAssembler::corrected_idivl(Register reg) {
1865   // Full implementation of Java idiv and irem; checks for
1866   // special case as described in JVM spec., p.243 & p.271.
1867   // The function returns the (pc) offset of the idivl
1868   // instruction - may be needed for implicit exceptions.
1869   //
1870   //         normal case                           special case
1871   //
1872   // input : rax: dividend                          min_int
1873   //         reg: divisor   (may not be rax/rdx)    -1
1874   //
1875   // output: rax: quotient  (= rax idiv reg)        min_int
1876   //         rdx: remainder (= rax irem reg)        0
1877   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1878   const int min_int = 0x80000000;
1879   Label normal_case, special_case;
1880 
1881   // check for special case
1882   cmpl(rax, min_int);
1883   jcc(Assembler::notEqual, normal_case);
1884   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1885   cmpl(reg, -1);
1886   jcc(Assembler::equal, special_case);
1887 
1888   // handle normal case
1889   bind(normal_case);
1890   cdql();
1891   int idivl_offset = offset();
1892   idivl(reg);
1893 
1894   // normal and special case exit
1895   bind(special_case);
1896 
1897   return idivl_offset;
1898 }
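     // Worked example of the special case handled above: on x86, idiv raises a
     // divide error (#DE) when the quotient does not fit, which is what would
     // happen for min_int / -1 (+2147483648 is not representable in 32 bits).
     // The JVM spec instead requires min_int / -1 == min_int and
     // min_int % -1 == 0, so the code leaves rax == min_int, zeroes rdx, and
     // skips the idivl entirely.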
1899 
1900 
1901 
1902 void MacroAssembler::decrementl(Register reg, int value) {
1903   if (value == min_jint) {subl(reg, value) ; return; }
1904   if (value <  0) { incrementl(reg, -value); return; }
1905   if (value == 0) {                        ; return; }
1906   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1907   /* else */      { subl(reg, value)       ; return; }
1908 }
1909 
1910 void MacroAssembler::decrementl(Address dst, int value) {
1911   if (value == min_jint) {subl(dst, value) ; return; }
1912   if (value <  0) { incrementl(dst, -value); return; }
1913   if (value == 0) {                        ; return; }
1914   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1915   /* else */      { subl(dst, value)       ; return; }
1916 }
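     // Note on the min_jint checks in decrementl above and incrementl further
     // below: negating min_jint (0x80000000) wraps around to min_jint again,
     // so rewriting decrementl(reg, min_jint) as incrementl(reg, -min_jint)
     // (or vice versa) would recurse forever; the explicit subl/addl of the
     // value handles that one case directly.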
1917 
1918 void MacroAssembler::division_with_shift(Register reg, int shift_value) {
1919   assert(shift_value > 0, "illegal shift value");
1920   Label _is_positive;
1921   testl(reg, reg);
1922   jcc(Assembler::positive, _is_positive);
1923   int offset = (1 << shift_value) - 1;
1924 
1925   if (offset == 1) {
1926     incrementl(reg);
1927   } else {
1928     addl(reg, offset);
1929   }
1930 
1931   bind(_is_positive);
1932   sarl(reg, shift_value);
1933 }
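     // Worked example for division_with_shift above: an arithmetic right shift
     // rounds toward negative infinity while Java's idiv rounds toward zero,
     // so negative dividends are biased by (2^shift_value - 1) first. For
     // reg == -7 and shift_value == 2: plain sar gives -7 >> 2 == -2, whereas
     // the biased form gives (-7 + 3) >> 2 == -1, matching -7 / 4 in Java.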
1934 
1935 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1936   if (reachable(src)) {
1937     Assembler::divsd(dst, as_Address(src));
1938   } else {
1939     lea(rscratch1, src);
1940     Assembler::divsd(dst, Address(rscratch1, 0));
1941   }
1942 }
1943 
1944 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1945   if (reachable(src)) {
1946     Assembler::divss(dst, as_Address(src));
1947   } else {
1948     lea(rscratch1, src);
1949     Assembler::divss(dst, Address(rscratch1, 0));
1950   }
1951 }
1952 
1953 void MacroAssembler::enter() {
1954   push(rbp);
1955   mov(rbp, rsp);
1956 }
1957 
1958 // A 5 byte nop that is safe for patching (see patch_verified_entry)
1959 void MacroAssembler::fat_nop() {
1960   if (UseAddressNop) {
1961     addr_nop_5();
1962   } else {
1963     emit_int8(0x26); // es:
1964     emit_int8(0x2e); // cs:
1965     emit_int8(0x64); // fs:
1966     emit_int8(0x65); // gs:
1967     emit_int8((unsigned char)0x90);
1968   }
1969 }
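     // The fallback above emits four segment-override prefixes (es, cs, fs,
     // gs) followed by a 0x90 nop; the five bytes decode as one instruction,
     // so the verified entry can later be patched with a single 5-byte
     // instruction (see patch_verified_entry) without splitting an
     // instruction in the middle.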
1970 
1971 #ifndef _LP64
1972 void MacroAssembler::fcmp(Register tmp) {
1973   fcmp(tmp, 1, true, true);
1974 }
1975 
1976 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
1977   assert(!pop_right || pop_left, "usage error");
1978   if (VM_Version::supports_cmov()) {
1979     assert(tmp == noreg, "unneeded temp");
1980     if (pop_left) {
1981       fucomip(index);
1982     } else {
1983       fucomi(index);
1984     }
1985     if (pop_right) {
1986       fpop();
1987     }
1988   } else {
1989     assert(tmp != noreg, "need temp");
1990     if (pop_left) {
1991       if (pop_right) {
1992         fcompp();
1993       } else {
1994         fcomp(index);
1995       }
1996     } else {
1997       fcom(index);
1998     }
1999     // convert FPU condition into eflags condition via rax
2000     save_rax(tmp);
2001     fwait(); fnstsw_ax();
2002     sahf();
2003     restore_rax(tmp);
2004   }
2005   // condition codes set as follows:
2006   //
2007   // CF (corresponds to C0) if x < y
2008   // PF (corresponds to C2) if unordered
2009   // ZF (corresponds to C3) if x = y
2010 }
2011 
2012 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2013   fcmp2int(dst, unordered_is_less, 1, true, true);
2014 }
2015 
2016 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2017   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2018   Label L;
2019   if (unordered_is_less) {
2020     movl(dst, -1);
2021     jcc(Assembler::parity, L);
2022     jcc(Assembler::below , L);
2023     movl(dst, 0);
2024     jcc(Assembler::equal , L);
2025     increment(dst);
2026   } else { // unordered is greater
2027     movl(dst, 1);
2028     jcc(Assembler::parity, L);
2029     jcc(Assembler::above , L);
2030     movl(dst, 0);
2031     jcc(Assembler::equal , L);
2032     decrementl(dst);
2033   }
2034   bind(L);
2035 }
2036 
2037 void MacroAssembler::fld_d(AddressLiteral src) {
2038   fld_d(as_Address(src));
2039 }
2040 
2041 void MacroAssembler::fld_s(AddressLiteral src) {
2042   fld_s(as_Address(src));
2043 }
2044 
2045 void MacroAssembler::fldcw(AddressLiteral src) {
2046   Assembler::fldcw(as_Address(src));
2047 }
2048 
2049 void MacroAssembler::fpop() {
2050   ffree();
2051   fincstp();
2052 }
2053 
2054 void MacroAssembler::fremr(Register tmp) {
2055   save_rax(tmp);
2056   { Label L;
2057     bind(L);
2058     fprem();
2059     fwait(); fnstsw_ax();
2060     sahf();
2061     jcc(Assembler::parity, L);
2062   }
2063   restore_rax(tmp);
2064   // Result is in ST0.
2065   // Note: fxch & fpop to get rid of ST1
2066   // (otherwise FPU stack could overflow eventually)
2067   fxch(1);
2068   fpop();
2069 }
2070 
2071 void MacroAssembler::empty_FPU_stack() {
2072   if (VM_Version::supports_mmx()) {
2073     emms();
2074   } else {
2075     for (int i = 8; i-- > 0; ) ffree(i);
2076   }
2077 }
2078 #endif // !LP64
2079 
2080 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2081   if (reachable(src)) {
2082     Assembler::mulpd(dst, as_Address(src));
2083   } else {
2084     lea(rscratch1, src);
2085     Assembler::mulpd(dst, Address(rscratch1, 0));
2086   }
2087 }
2088 
2089 void MacroAssembler::load_float(Address src) {
2090 #ifdef _LP64
2091   movflt(xmm0, src);
2092 #else
2093   if (UseSSE >= 1) {
2094     movflt(xmm0, src);
2095   } else {
2096     fld_s(src);
2097   }
2098 #endif // LP64
2099 }
2100 
2101 void MacroAssembler::store_float(Address dst) {
2102 #ifdef _LP64
2103   movflt(dst, xmm0);
2104 #else
2105   if (UseSSE >= 1) {
2106     movflt(dst, xmm0);
2107   } else {
2108     fstp_s(dst);
2109   }
2110 #endif // LP64
2111 }
2112 
2113 void MacroAssembler::load_double(Address src) {
2114 #ifdef _LP64
2115   movdbl(xmm0, src);
2116 #else
2117   if (UseSSE >= 2) {
2118     movdbl(xmm0, src);
2119   } else {
2120     fld_d(src);
2121   }
2122 #endif // LP64
2123 }
2124 
2125 void MacroAssembler::store_double(Address dst) {
2126 #ifdef _LP64
2127   movdbl(dst, xmm0);
2128 #else
2129   if (UseSSE >= 2) {
2130     movdbl(dst, xmm0);
2131   } else {
2132     fstp_d(dst);
2133   }
2134 #endif // LP64
2135 }
2136 
2137 // dst = c = a * b + c
2138 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2139   Assembler::vfmadd231sd(c, a, b);
2140   if (dst != c) {
2141     movdbl(dst, c);
2142   }
2143 }
2144 
2145 // dst = c = a * b + c
2146 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2147   Assembler::vfmadd231ss(c, a, b);
2148   if (dst != c) {
2149     movflt(dst, c);
2150   }
2151 }
2152 
2153 // dst = c = a * b + c
2154 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2155   Assembler::vfmadd231pd(c, a, b, vector_len);
2156   if (dst != c) {
2157     vmovdqu(dst, c);
2158   }
2159 }
2160 
2161 // dst = c = a * b + c
2162 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2163   Assembler::vfmadd231ps(c, a, b, vector_len);
2164   if (dst != c) {
2165     vmovdqu(dst, c);
2166   }
2167 }
2168 
2169 // dst = c = a * b + c
2170 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2171   Assembler::vfmadd231pd(c, a, b, vector_len);
2172   if (dst != c) {
2173     vmovdqu(dst, c);
2174   }
2175 }
2176 
2177 // dst = c = a * b + c
2178 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2179   Assembler::vfmadd231ps(c, a, b, vector_len);
2180   if (dst != c) {
2181     vmovdqu(dst, c);
2182   }
2183 }
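     // In the vfmadd231xx mnemonics used above, "231" names the operand roles:
     // dst = src2 * src3 + dst. Written as vfmadd231xx(c, a, b) this computes
     // c = a * b + c, matching the "dst = c = a * b + c" comments; the
     // trailing mov/vmovdqu only copies the result when dst and c differ.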
2184 
2185 void MacroAssembler::incrementl(AddressLiteral dst) {
2186   if (reachable(dst)) {
2187     incrementl(as_Address(dst));
2188   } else {
2189     lea(rscratch1, dst);
2190     incrementl(Address(rscratch1, 0));
2191   }
2192 }
2193 
2194 void MacroAssembler::incrementl(ArrayAddress dst) {
2195   incrementl(as_Address(dst));
2196 }
2197 
2198 void MacroAssembler::incrementl(Register reg, int value) {
2199   if (value == min_jint) {addl(reg, value) ; return; }
2200   if (value <  0) { decrementl(reg, -value); return; }
2201   if (value == 0) {                        ; return; }
2202   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2203   /* else */      { addl(reg, value)       ; return; }
2204 }
2205 
2206 void MacroAssembler::incrementl(Address dst, int value) {
2207   if (value == min_jint) {addl(dst, value) ; return; }
2208   if (value <  0) { decrementl(dst, -value); return; }
2209   if (value == 0) {                        ; return; }
2210   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2211   /* else */      { addl(dst, value)       ; return; }
2212 }
2213 
2214 void MacroAssembler::jump(AddressLiteral dst) {
2215   if (reachable(dst)) {
2216     jmp_literal(dst.target(), dst.rspec());
2217   } else {
2218     lea(rscratch1, dst);
2219     jmp(rscratch1);
2220   }
2221 }
2222 
2223 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2224   if (reachable(dst)) {
2225     InstructionMark im(this);
2226     relocate(dst.reloc());
2227     const int short_size = 2;
2228     const int long_size = 6;
2229     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2230     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2231       // 0111 tttn #8-bit disp
2232       emit_int8(0x70 | cc);
2233       emit_int8((offs - short_size) & 0xFF);
2234     } else {
2235       // 0000 1111 1000 tttn #32-bit disp
2236       emit_int8(0x0F);
2237       emit_int8((unsigned char)(0x80 | cc));
2238       emit_int32(offs - long_size);
2239     }
2240   } else {
2241 #ifdef ASSERT
2242     warning("reversing conditional branch");
2243 #endif /* ASSERT */
2244     Label skip;
2245     jccb(reverse[cc], skip);
2246     lea(rscratch1, dst);
2247     Assembler::jmp(rscratch1);
2248     bind(skip);
2249   }
2250 }
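     // Illustrative expansion of the unreachable case above for cc == equal:
     //
     //   jne  skip          ; reverse[equal] == notZero
     //   lea  rscratch1, dst
     //   jmp  rscratch1     ; an indirect jump always reaches
     //   skip:
     //
     // i.e. the condition is inverted so a short branch can hop over an
     // indirect jump to the far target.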
2251 
2252 void MacroAssembler::fld_x(AddressLiteral src) {
2253   Assembler::fld_x(as_Address(src));
2254 }
2255 
2256 void MacroAssembler::ldmxcsr(AddressLiteral src, Register scratchReg) {
2257   if (reachable(src)) {
2258     Assembler::ldmxcsr(as_Address(src));
2259   } else {
2260     lea(scratchReg, src);
2261     Assembler::ldmxcsr(Address(scratchReg, 0));
2262   }
2263 }
2264 
2265 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2266   int off;
2267   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2268     off = offset();
2269     movsbl(dst, src); // movsxb
2270   } else {
2271     off = load_unsigned_byte(dst, src);
2272     shll(dst, 24);
2273     sarl(dst, 24);
2274   }
2275   return off;
2276 }
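     // Worked example of the non-P6 path above: load_unsigned_byte
     // zero-extends, so a stored byte of 0x80 first becomes 0x00000080;
     // shll(dst, 24) gives 0x80000000 and sarl(dst, 24) gives 0xFFFFFF80,
     // i.e. -128, reproducing the sign extension that movsbl performs
     // directly.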
2277 
2278 // Note: load_signed_short used to be called load_signed_word.
2279 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2280 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2281 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2282 int MacroAssembler::load_signed_short(Register dst, Address src) {
2283   int off;
2284   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2285     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
2286     // sign extension, but this is what 64-bit has always done. This seems to
2287     // imply that callers only use 32 bits' worth of the result.
2288     off = offset();
2289     movswl(dst, src); // movsxw
2290   } else {
2291     off = load_unsigned_short(dst, src);
2292     shll(dst, 16);
2293     sarl(dst, 16);
2294   }
2295   return off;
2296 }
2297 
2298 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2299   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
2300   // and "3.9 Partial Register Penalties", p. 22).
2301   int off;
2302   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2303     off = offset();
2304     movzbl(dst, src); // movzxb
2305   } else {
2306     xorl(dst, dst);
2307     off = offset();
2308     movb(dst, src);
2309   }
2310   return off;
2311 }
2312 
2313 // Note: load_unsigned_short used to be called load_unsigned_word.
2314 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2315   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
2316   // and "3.9 Partial Register Penalties", p. 22).
2317   int off;
2318   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2319     off = offset();
2320     movzwl(dst, src); // movzxw
2321   } else {
2322     xorl(dst, dst);
2323     off = offset();
2324     movw(dst, src);
2325   }
2326   return off;
2327 }
2328 
2329 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2330   switch (size_in_bytes) {
2331 #ifndef _LP64
2332   case  8:
2333     assert(dst2 != noreg, "second dest register required");
2334     movl(dst,  src);
2335     movl(dst2, src.plus_disp(BytesPerInt));
2336     break;
2337 #else
2338   case  8:  movq(dst, src); break;
2339 #endif
2340   case  4:  movl(dst, src); break;
2341   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2342   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2343   default:  ShouldNotReachHere();
2344   }
2345 }
2346 
2347 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2348   switch (size_in_bytes) {
2349 #ifndef _LP64
2350   case  8:
2351     assert(src2 != noreg, "second source register required");
2352     movl(dst,                        src);
2353     movl(dst.plus_disp(BytesPerInt), src2);
2354     break;
2355 #else
2356   case  8:  movq(dst, src); break;
2357 #endif
2358   case  4:  movl(dst, src); break;
2359   case  2:  movw(dst, src); break;
2360   case  1:  movb(dst, src); break;
2361   default:  ShouldNotReachHere();
2362   }
2363 }
2364 
2365 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2366   if (reachable(dst)) {
2367     movl(as_Address(dst), src);
2368   } else {
2369     lea(rscratch1, dst);
2370     movl(Address(rscratch1, 0), src);
2371   }
2372 }
2373 
2374 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2375   if (reachable(src)) {
2376     movl(dst, as_Address(src));
2377   } else {
2378     lea(rscratch1, src);
2379     movl(dst, Address(rscratch1, 0));
2380   }
2381 }
2382 
2383 // C++ bool manipulation
2384 
2385 void MacroAssembler::movbool(Register dst, Address src) {
2386   if (sizeof(bool) == 1)
2387     movb(dst, src);
2388   else if (sizeof(bool) == 2)
2389     movw(dst, src);
2390   else if (sizeof(bool) == 4)
2391     movl(dst, src);
2392   else
2393     // unsupported
2394     ShouldNotReachHere();
2395 }
2396 
2397 void MacroAssembler::movbool(Address dst, bool boolconst) {
2398   if (sizeof(bool) == 1)
2399     movb(dst, (int) boolconst);
2400   else if (sizeof(bool) == 2)
2401     movw(dst, (int) boolconst);
2402   else if (sizeof(bool) == 4)
2403     movl(dst, (int) boolconst);
2404   else
2405     // unsupported
2406     ShouldNotReachHere();
2407 }
2408 
2409 void MacroAssembler::movbool(Address dst, Register src) {
2410   if (sizeof(bool) == 1)
2411     movb(dst, src);
2412   else if (sizeof(bool) == 2)
2413     movw(dst, src);
2414   else if (sizeof(bool) == 4)
2415     movl(dst, src);
2416   else
2417     // unsupported
2418     ShouldNotReachHere();
2419 }
2420 
2421 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2422   movb(as_Address(dst), src);
2423 }
2424 
2425 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2426   if (reachable(src)) {
2427     movdl(dst, as_Address(src));
2428   } else {
2429     lea(rscratch1, src);
2430     movdl(dst, Address(rscratch1, 0));
2431   }
2432 }
2433 
2434 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2435   if (reachable(src)) {
2436     movq(dst, as_Address(src));
2437   } else {
2438     lea(rscratch1, src);
2439     movq(dst, Address(rscratch1, 0));
2440   }
2441 }
2442 
2443 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2444   if (reachable(src)) {
2445     if (UseXmmLoadAndClearUpper) {
2446       movsd (dst, as_Address(src));
2447     } else {
2448       movlpd(dst, as_Address(src));
2449     }
2450   } else {
2451     lea(rscratch1, src);
2452     if (UseXmmLoadAndClearUpper) {
2453       movsd (dst, Address(rscratch1, 0));
2454     } else {
2455       movlpd(dst, Address(rscratch1, 0));
2456     }
2457   }
2458 }
2459 
2460 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2461   if (reachable(src)) {
2462     movss(dst, as_Address(src));
2463   } else {
2464     lea(rscratch1, src);
2465     movss(dst, Address(rscratch1, 0));
2466   }
2467 }
2468 
2469 void MacroAssembler::movptr(Register dst, Register src) {
2470   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2471 }
2472 
2473 void MacroAssembler::movptr(Register dst, Address src) {
2474   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2475 }
2476 
2477 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2478 void MacroAssembler::movptr(Register dst, intptr_t src) {
2479   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2480 }
2481 
2482 void MacroAssembler::movptr(Address dst, Register src) {
2483   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2484 }
2485 
2486 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2487     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2488     Assembler::movdqu(dst, src);
2489 }
2490 
2491 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2492     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2493     Assembler::movdqu(dst, src);
2494 }
2495 
2496 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2497     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2498     Assembler::movdqu(dst, src);
2499 }
2500 
2501 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2502   if (reachable(src)) {
2503     movdqu(dst, as_Address(src));
2504   } else {
2505     lea(scratchReg, src);
2506     movdqu(dst, Address(scratchReg, 0));
2507   }
2508 }
2509 
2510 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2511     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2512     Assembler::vmovdqu(dst, src);
2513 }
2514 
2515 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2516     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2517     Assembler::vmovdqu(dst, src);
2518 }
2519 
2520 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2521     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2522     Assembler::vmovdqu(dst, src);
2523 }
2524 
2525 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2526   if (reachable(src)) {
2527     vmovdqu(dst, as_Address(src));
2528   }
2529   else {
2530     lea(scratch_reg, src);
2531     vmovdqu(dst, Address(scratch_reg, 0));
2532   }
2533 }
2534 
2535 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) {
2536   assert(vector_len <= AVX_256bit, "AVX2 vector length");
2537   if (vector_len == AVX_256bit) {
2538     vmovdqu(dst, src, scratch_reg);
2539   } else {
2540     movdqu(dst, src, scratch_reg);
2541   }
2542 }
2543 
2544 void MacroAssembler::kmov(KRegister dst, Address src) {
2545   if (VM_Version::supports_avx512bw()) {
2546     kmovql(dst, src);
2547   } else {
2548     assert(VM_Version::supports_evex(), "");
2549     kmovwl(dst, src);
2550   }
2551 }
2552 
2553 void MacroAssembler::kmov(Address dst, KRegister src) {
2554   if (VM_Version::supports_avx512bw()) {
2555     kmovql(dst, src);
2556   } else {
2557     assert(VM_Version::supports_evex(), "");
2558     kmovwl(dst, src);
2559   }
2560 }
2561 
2562 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2563   if (VM_Version::supports_avx512bw()) {
2564     kmovql(dst, src);
2565   } else {
2566     assert(VM_Version::supports_evex(), "");
2567     kmovwl(dst, src);
2568   }
2569 }
2570 
2571 void MacroAssembler::kmov(Register dst, KRegister src) {
2572   if (VM_Version::supports_avx512bw()) {
2573     kmovql(dst, src);
2574   } else {
2575     assert(VM_Version::supports_evex(), "");
2576     kmovwl(dst, src);
2577   }
2578 }
2579 
2580 void MacroAssembler::kmov(KRegister dst, Register src) {
2581   if (VM_Version::supports_avx512bw()) {
2582     kmovql(dst, src);
2583   } else {
2584     assert(VM_Version::supports_evex(), "");
2585     kmovwl(dst, src);
2586   }
2587 }
2588 
2589 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2590   if (reachable(src)) {
2591     kmovql(dst, as_Address(src));
2592   } else {
2593     lea(scratch_reg, src);
2594     kmovql(dst, Address(scratch_reg, 0));
2595   }
2596 }
2597 
2598 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2599   if (reachable(src)) {
2600     kmovwl(dst, as_Address(src));
2601   } else {
2602     lea(scratch_reg, src);
2603     kmovwl(dst, Address(scratch_reg, 0));
2604   }
2605 }
2606 
2607 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2608                                int vector_len, Register scratch_reg) {
2609   if (reachable(src)) {
2610     Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2611   } else {
2612     lea(scratch_reg, src);
2613     Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2614   }
2615 }
2616 
2617 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2618                                int vector_len, Register scratch_reg) {
2619   if (reachable(src)) {
2620     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2621   } else {
2622     lea(scratch_reg, src);
2623     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2624   }
2625 }
2626 
2627 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2628                                int vector_len, Register scratch_reg) {
2629   if (reachable(src)) {
2630     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2631   } else {
2632     lea(scratch_reg, src);
2633     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2634   }
2635 }
2636 
2637 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2638                                int vector_len, Register scratch_reg) {
2639   if (reachable(src)) {
2640     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2641   } else {
2642     lea(scratch_reg, src);
2643     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2644   }
2645 }
2646 
2647 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2648   if (reachable(src)) {
2649     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2650   } else {
2651     lea(rscratch, src);
2652     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2653   }
2654 }
2655 
2656 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2657   if (reachable(src)) {
2658     Assembler::movdqa(dst, as_Address(src));
2659   } else {
2660     lea(rscratch1, src);
2661     Assembler::movdqa(dst, Address(rscratch1, 0));
2662   }
2663 }
2664 
2665 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2666   if (reachable(src)) {
2667     Assembler::movsd(dst, as_Address(src));
2668   } else {
2669     lea(rscratch1, src);
2670     Assembler::movsd(dst, Address(rscratch1, 0));
2671   }
2672 }
2673 
2674 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2675   if (reachable(src)) {
2676     Assembler::movss(dst, as_Address(src));
2677   } else {
2678     lea(rscratch1, src);
2679     Assembler::movss(dst, Address(rscratch1, 0));
2680   }
2681 }
2682 
2683 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2684   if (reachable(src)) {
2685     Assembler::vmovddup(dst, as_Address(src), vector_len);
2686   } else {
2687     lea(rscratch, src);
2688     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2689   }
2690 }
2691 
2692 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2693   if (reachable(src)) {
2694     Assembler::mulsd(dst, as_Address(src));
2695   } else {
2696     lea(rscratch1, src);
2697     Assembler::mulsd(dst, Address(rscratch1, 0));
2698   }
2699 }
2700 
2701 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2702   if (reachable(src)) {
2703     Assembler::mulss(dst, as_Address(src));
2704   } else {
2705     lea(rscratch1, src);
2706     Assembler::mulss(dst, Address(rscratch1, 0));
2707   }
2708 }
2709 
2710 void MacroAssembler::null_check(Register reg, int offset) {
2711   if (needs_explicit_null_check(offset)) {
2712     // provoke OS NULL exception if reg = NULL by
2713     // accessing M[reg] w/o changing any (non-CC) registers
2714     // NOTE: cmpl is plenty here to provoke a segv
2715     cmpptr(rax, Address(reg, 0));
2716     // Note: should probably use testl(rax, Address(reg, 0));
2717     //       may be shorter code (however, this version of
2718     //       testl needs to be implemented first)
2719   } else {
2720     // nothing to do, (later) access of M[reg + offset]
2721     // will provoke OS NULL exception if reg = NULL
2722   }
2723 }
2724 
2725 void MacroAssembler::os_breakpoint() {
2726   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2727   // (e.g., MSVC can't call ps() otherwise)
2728   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2729 }
2730 
2731 void MacroAssembler::unimplemented(const char* what) {
2732   const char* buf = NULL;
2733   {
2734     ResourceMark rm;
2735     stringStream ss;
2736     ss.print("unimplemented: %s", what);
2737     buf = code_string(ss.as_string());
2738   }
2739   stop(buf);
2740 }
2741 
2742 #ifdef _LP64
2743 #define XSTATE_BV 0x200
2744 #endif
2745 
2746 void MacroAssembler::pop_CPU_state() {
2747   pop_FPU_state();
2748   pop_IU_state();
2749 }
2750 
2751 void MacroAssembler::pop_FPU_state() {
2752 #ifndef _LP64
2753   frstor(Address(rsp, 0));
2754 #else
2755   fxrstor(Address(rsp, 0));
2756 #endif
2757   addptr(rsp, FPUStateSizeInWords * wordSize);
2758 }
2759 
2760 void MacroAssembler::pop_IU_state() {
2761   popa();
2762   LP64_ONLY(addq(rsp, 8));
2763   popf();
2764 }
2765 
2766 // Save Integer and Float state
2767 // Warning: stack must be 16-byte aligned (64-bit)
2768 void MacroAssembler::push_CPU_state() {
2769   push_IU_state();
2770   push_FPU_state();
2771 }
2772 
2773 void MacroAssembler::push_FPU_state() {
2774   subptr(rsp, FPUStateSizeInWords * wordSize);
2775 #ifndef _LP64
2776   fnsave(Address(rsp, 0));
2777   fwait();
2778 #else
2779   fxsave(Address(rsp, 0));
2780 #endif // LP64
2781 }
2782 
2783 void MacroAssembler::push_IU_state() {
2784   // Push flags first because pusha kills them
2785   pushf();
2786   // Make sure rsp stays 16-byte aligned
2787   LP64_ONLY(subq(rsp, 8));
2788   pusha();
2789 }
2790 
2791 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
2792   if (!java_thread->is_valid()) {
2793     java_thread = rdi;
2794     get_thread(java_thread);
2795   }
2796   // we must set sp to zero to clear frame
2797   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2798   // must clear fp, so that compiled frames are not confused; it is
2799   // possible that we need it only for debugging
2800   if (clear_fp) {
2801     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2802   }
2803   // Always clear the pc because it could have been set by make_walkable()
2804   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2805   vzeroupper();
2806 }
2807 
2808 void MacroAssembler::restore_rax(Register tmp) {
2809   if (tmp == noreg) pop(rax);
2810   else if (tmp != rax) mov(rax, tmp);
2811 }
2812 
2813 void MacroAssembler::round_to(Register reg, int modulus) {
2814   addptr(reg, modulus - 1);
2815   andptr(reg, -modulus);
2816 }
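     // round_to above rounds reg up to the next multiple of modulus, assuming
     // modulus is a power of two (so that -modulus is the corresponding mask).
     // For example, round_to(reg, 8) with reg == 13: 13 + 7 == 20 and
     // 20 & -8 == 16.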
2817 
2818 void MacroAssembler::save_rax(Register tmp) {
2819   if (tmp == noreg) push(rax);
2820   else if (tmp != rax) mov(tmp, rax);
2821 }
2822 
2823 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
2824   if (at_return) {
2825     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2826     // we may safely use rsp instead to perform the stack watermark check.
2827     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
2828     jcc(Assembler::above, slow_path);
2829     return;
2830   }
2831   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2832   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2833 }
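     // safepoint_poll above has two forms: at a return, rsp (or rbp when not
     // in an nmethod) is compared against the per-thread polling word, which
     // also encodes the stack watermark, and the slow path is taken when the
     // stack pointer is above it; elsewhere only the poll bit of the polling
     // word is tested, and a set bit means a safepoint or handshake is
     // pending.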
2834 
2835 // Calls to C land
2836 //
2837 // When entering C land, the rbp & rsp of the last Java frame have to be recorded
2838 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2839 // has to be reset to 0. This is required to allow proper stack traversal.
2840 void MacroAssembler::set_last_Java_frame(Register java_thread,
2841                                          Register last_java_sp,
2842                                          Register last_java_fp,
2843                                          address  last_java_pc) {
2844   vzeroupper();
2845   // determine java_thread register
2846   if (!java_thread->is_valid()) {
2847     java_thread = rdi;
2848     get_thread(java_thread);
2849   }
2850   // determine last_java_sp register
2851   if (!last_java_sp->is_valid()) {
2852     last_java_sp = rsp;
2853   }
2854 
2855   // last_java_fp is optional
2856 
2857   if (last_java_fp->is_valid()) {
2858     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2859   }
2860 
2861   // last_java_pc is optional
2862 
2863   if (last_java_pc != NULL) {
2864     lea(Address(java_thread,
2865                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
2866         InternalAddress(last_java_pc));
2867 
2868   }
2869   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2870 }
2871 
2872 void MacroAssembler::shlptr(Register dst, int imm8) {
2873   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
2874 }
2875 
2876 void MacroAssembler::shrptr(Register dst, int imm8) {
2877   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
2878 }
2879 
2880 void MacroAssembler::sign_extend_byte(Register reg) {
2881   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
2882     movsbl(reg, reg); // movsxb
2883   } else {
2884     shll(reg, 24);
2885     sarl(reg, 24);
2886   }
2887 }
2888 
2889 void MacroAssembler::sign_extend_short(Register reg) {
2890   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2891     movswl(reg, reg); // movsxw
2892   } else {
2893     shll(reg, 16);
2894     sarl(reg, 16);
2895   }
2896 }
2897 
2898 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2899   assert(reachable(src), "Address should be reachable");
2900   testl(dst, as_Address(src));
2901 }
2902 
2903 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2904   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2905   Assembler::pcmpeqb(dst, src);
2906 }
2907 
2908 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2909   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2910   Assembler::pcmpeqw(dst, src);
2911 }
2912 
2913 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2914   assert((dst->encoding() < 16),"XMM register should be 0-15");
2915   Assembler::pcmpestri(dst, src, imm8);
2916 }
2917 
2918 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2919   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2920   Assembler::pcmpestri(dst, src, imm8);
2921 }
2922 
2923 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2924   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2925   Assembler::pmovzxbw(dst, src);
2926 }
2927 
2928 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2929   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2930   Assembler::pmovzxbw(dst, src);
2931 }
2932 
2933 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2934   assert((src->encoding() < 16),"XMM register should be 0-15");
2935   Assembler::pmovmskb(dst, src);
2936 }
2937 
2938 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2939   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2940   Assembler::ptest(dst, src);
2941 }
2942 
2943 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
2944   if (reachable(src)) {
2945     Assembler::sqrtsd(dst, as_Address(src));
2946   } else {
2947     lea(rscratch1, src);
2948     Assembler::sqrtsd(dst, Address(rscratch1, 0));
2949   }
2950 }
2951 
2952 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
2953   if (reachable(src)) {
2954     Assembler::sqrtss(dst, as_Address(src));
2955   } else {
2956     lea(rscratch1, src);
2957     Assembler::sqrtss(dst, Address(rscratch1, 0));
2958   }
2959 }
2960 
2961 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
2962   if (reachable(src)) {
2963     Assembler::subsd(dst, as_Address(src));
2964   } else {
2965     lea(rscratch1, src);
2966     Assembler::subsd(dst, Address(rscratch1, 0));
2967   }
2968 }
2969 
2970 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
2971   if (reachable(src)) {
2972     Assembler::roundsd(dst, as_Address(src), rmode);
2973   } else {
2974     lea(scratch_reg, src);
2975     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
2976   }
2977 }
2978 
2979 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
2980   if (reachable(src)) {
2981     Assembler::subss(dst, as_Address(src));
2982   } else {
2983     lea(rscratch1, src);
2984     Assembler::subss(dst, Address(rscratch1, 0));
2985   }
2986 }
2987 
2988 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
2989   if (reachable(src)) {
2990     Assembler::ucomisd(dst, as_Address(src));
2991   } else {
2992     lea(rscratch1, src);
2993     Assembler::ucomisd(dst, Address(rscratch1, 0));
2994   }
2995 }
2996 
2997 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
2998   if (reachable(src)) {
2999     Assembler::ucomiss(dst, as_Address(src));
3000   } else {
3001     lea(rscratch1, src);
3002     Assembler::ucomiss(dst, Address(rscratch1, 0));
3003   }
3004 }
3005 
3006 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3007   // Used in sign-bit flipping with aligned address.
3008   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3009   if (reachable(src)) {
3010     Assembler::xorpd(dst, as_Address(src));
3011   } else {
3012     lea(scratch_reg, src);
3013     Assembler::xorpd(dst, Address(scratch_reg, 0));
3014   }
3015 }
3016 
3017 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3018   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3019     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3020   }
3021   else {
3022     Assembler::xorpd(dst, src);
3023   }
3024 }
3025 
3026 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3027   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3028     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3029   } else {
3030     Assembler::xorps(dst, src);
3031   }
3032 }
3033 
3034 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3035   // Used in sign-bit flipping with aligned address.
3036   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3037   if (reachable(src)) {
3038     Assembler::xorps(dst, as_Address(src));
3039   } else {
3040     lea(scratch_reg, src);
3041     Assembler::xorps(dst, Address(scratch_reg, 0));
3042   }
3043 }
3044 
3045 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3046   // Used in sign-bit flipping with aligned address.
3047   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3048   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3049   if (reachable(src)) {
3050     Assembler::pshufb(dst, as_Address(src));
3051   } else {
3052     lea(rscratch1, src);
3053     Assembler::pshufb(dst, Address(rscratch1, 0));
3054   }
3055 }
3056 
3057 // AVX 3-operands instructions
3058 
3059 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3060   if (reachable(src)) {
3061     vaddsd(dst, nds, as_Address(src));
3062   } else {
3063     lea(rscratch1, src);
3064     vaddsd(dst, nds, Address(rscratch1, 0));
3065   }
3066 }
3067 
3068 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3069   if (reachable(src)) {
3070     vaddss(dst, nds, as_Address(src));
3071   } else {
3072     lea(rscratch1, src);
3073     vaddss(dst, nds, Address(rscratch1, 0));
3074   }
3075 }
3076 
3077 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3078   assert(UseAVX > 0, "requires some form of AVX");
3079   if (reachable(src)) {
3080     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3081   } else {
3082     lea(rscratch, src);
3083     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3084   }
3085 }
3086 
3087 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3088   assert(UseAVX > 0, "requires some form of AVX");
3089   if (reachable(src)) {
3090     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3091   } else {
3092     lea(rscratch, src);
3093     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3094   }
3095 }
3096 
3097 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3098   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3099   vandps(dst, nds, negate_field, vector_len);
3100 }
3101 
3102 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3103   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3104   vandpd(dst, nds, negate_field, vector_len);
3105 }
3106 
3107 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3108   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3109   Assembler::vpaddb(dst, nds, src, vector_len);
3110 }
3111 
3112 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3113   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3114   Assembler::vpaddb(dst, nds, src, vector_len);
3115 }
3116 
3117 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3118   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3119   Assembler::vpaddw(dst, nds, src, vector_len);
3120 }
3121 
3122 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3123   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3124   Assembler::vpaddw(dst, nds, src, vector_len);
3125 }
3126 
3127 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3128   if (reachable(src)) {
3129     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3130   } else {
3131     lea(scratch_reg, src);
3132     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3133   }
3134 }
3135 
3136 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3137   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3138   Assembler::vpbroadcastw(dst, src, vector_len);
3139 }
3140 
3141 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3142   if (reachable(src)) {
3143     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3144   } else {
3145     lea(rscratch, src);
3146     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3147   }
3148 }
3149 
3150 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3151   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3152   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3153 }
3154 
3155 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3156   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3157   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3158 }
3159 
3160 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3161                                AddressLiteral src, int vector_len, Register scratch_reg) {
3162   if (reachable(src)) {
3163     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3164   } else {
3165     lea(scratch_reg, src);
3166     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3167   }
3168 }
3169 
3170 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3171                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3172   if (reachable(src)) {
3173     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3174   } else {
3175     lea(scratch_reg, src);
3176     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3177   }
3178 }
3179 
3180 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3181                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3182   if (reachable(src)) {
3183     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3184   } else {
3185     lea(scratch_reg, src);
3186     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3187   }
3188 }
3189 
3190 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3191                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3192   if (reachable(src)) {
3193     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3194   } else {
3195     lea(scratch_reg, src);
3196     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3197   }
3198 }
3199 
3200 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3201                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3202   if (reachable(src)) {
3203     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3204   } else {
3205     lea(scratch_reg, src);
3206     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3207   }
3208 }
3209 
3210 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3211   if (width == Assembler::Q) {
3212     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3213   } else {
3214     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3215   }
3216 }
3217 
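// Packed integer compare with an arbitrary predicate for targets without AVX-512
// compare-into-mask. eq and gt map directly onto pcmpeq*/pcmpgt* (opcodes 0x29/0x37
// for quadwords, 0x74+width/0x64+width for bytes/words/dwords). neq, le and nlt are
// synthesized by emitting the opposite predicate and inverting the result with an
// all-ones XOR; lt is gt with the operands swapped, and nle is gt as-is.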
3218 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3219   int eq_cond_enc = 0x29;
3220   int gt_cond_enc = 0x37;
3221   if (width != Assembler::Q) {
3222     eq_cond_enc = 0x74 + width;
3223     gt_cond_enc = 0x64 + width;
3224   }
3225   switch (cond) {
3226   case eq:
3227     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3228     break;
3229   case neq:
3230     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3231     vallones(xtmp, vector_len);
3232     vpxor(dst, xtmp, dst, vector_len);
3233     break;
3234   case le:
3235     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3236     vallones(xtmp, vector_len);
3237     vpxor(dst, xtmp, dst, vector_len);
3238     break;
3239   case nlt:
3240     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3241     vallones(xtmp, vector_len);
3242     vpxor(dst, xtmp, dst, vector_len);
3243     break;
3244   case lt:
3245     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3246     break;
3247   case nle:
3248     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3249     break;
3250   default:
3251     assert(false, "Should not reach here");
3252   }
3253 }
3254 
3255 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3256   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3257   Assembler::vpmovzxbw(dst, src, vector_len);
3258 }
3259 
3260 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3261   assert((src->encoding() < 16),"XMM register should be 0-15");
3262   Assembler::vpmovmskb(dst, src, vector_len);
3263 }
3264 
3265 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3266   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3267   Assembler::vpmullw(dst, nds, src, vector_len);
3268 }
3269 
3270 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3271   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3272   Assembler::vpmullw(dst, nds, src, vector_len);
3273 }
3274 
3275 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3276   assert((UseAVX > 0), "AVX support is needed");
3277   if (reachable(src)) {
3278     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3279   } else {
3280     lea(scratch_reg, src);
3281     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3282   }
3283 }
3284 
3285 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3286   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3287   Assembler::vpsubb(dst, nds, src, vector_len);
3288 }
3289 
3290 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3291   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3292   Assembler::vpsubb(dst, nds, src, vector_len);
3293 }
3294 
3295 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3296   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3297   Assembler::vpsubw(dst, nds, src, vector_len);
3298 }
3299 
3300 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3301   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3302   Assembler::vpsubw(dst, nds, src, vector_len);
3303 }
3304 
3305 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3306   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3307   Assembler::vpsraw(dst, nds, shift, vector_len);
3308 }
3309 
3310 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3311   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3312   Assembler::vpsraw(dst, nds, shift, vector_len);
3313 }
3314 
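// EVEX-encoded arithmetic right shift of quadwords. Without AVX-512VL only the
// 512-bit form is legal, so narrower requests are widened to AVX_512bit.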
3315 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3316   assert(UseAVX > 2, "requires AVX-512");
3317   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3318      vector_len = 2;
3319   }
3320   Assembler::evpsraq(dst, nds, shift, vector_len);
3321 }
3322 
3323 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3324   assert(UseAVX > 2, "requires AVX-512");
3325   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3326      vector_len = 2;
3327   }
3328   Assembler::evpsraq(dst, nds, shift, vector_len);
3329 }
3330 
3331 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3332   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3333   Assembler::vpsrlw(dst, nds, shift, vector_len);
3334 }
3335 
3336 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3337   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3338   Assembler::vpsrlw(dst, nds, shift, vector_len);
3339 }
3340 
3341 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3342   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3343   Assembler::vpsllw(dst, nds, shift, vector_len);
3344 }
3345 
3346 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3347   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3348   Assembler::vpsllw(dst, nds, shift, vector_len);
3349 }
3350 
3351 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3352   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3353   Assembler::vptest(dst, src);
3354 }
3355 
3356 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3357   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3358   Assembler::punpcklbw(dst, src);
3359 }
3360 
3361 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3362   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3363   Assembler::pshufd(dst, src, mode);
3364 }
3365 
3366 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3367   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3368   Assembler::pshuflw(dst, src, mode);
3369 }
3370 
3371 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3372   if (reachable(src)) {
3373     vandpd(dst, nds, as_Address(src), vector_len);
3374   } else {
3375     lea(scratch_reg, src);
3376     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3377   }
3378 }
3379 
3380 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3381   if (reachable(src)) {
3382     vandps(dst, nds, as_Address(src), vector_len);
3383   } else {
3384     lea(scratch_reg, src);
3385     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3386   }
3387 }
3388 
3389 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3390                             bool merge, int vector_len, Register scratch_reg) {
3391   if (reachable(src)) {
3392     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3393   } else {
3394     lea(scratch_reg, src);
3395     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3396   }
3397 }
3398 
3399 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3400   if (reachable(src)) {
3401     vdivsd(dst, nds, as_Address(src));
3402   } else {
3403     lea(rscratch1, src);
3404     vdivsd(dst, nds, Address(rscratch1, 0));
3405   }
3406 }
3407 
3408 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3409   if (reachable(src)) {
3410     vdivss(dst, nds, as_Address(src));
3411   } else {
3412     lea(rscratch1, src);
3413     vdivss(dst, nds, Address(rscratch1, 0));
3414   }
3415 }
3416 
3417 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3418   if (reachable(src)) {
3419     vmulsd(dst, nds, as_Address(src));
3420   } else {
3421     lea(rscratch1, src);
3422     vmulsd(dst, nds, Address(rscratch1, 0));
3423   }
3424 }
3425 
3426 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3427   if (reachable(src)) {
3428     vmulss(dst, nds, as_Address(src));
3429   } else {
3430     lea(rscratch1, src);
3431     vmulss(dst, nds, Address(rscratch1, 0));
3432   }
3433 }
3434 
3435 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3436   if (reachable(src)) {
3437     vsubsd(dst, nds, as_Address(src));
3438   } else {
3439     lea(rscratch1, src);
3440     vsubsd(dst, nds, Address(rscratch1, 0));
3441   }
3442 }
3443 
3444 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3445   if (reachable(src)) {
3446     vsubss(dst, nds, as_Address(src));
3447   } else {
3448     lea(rscratch1, src);
3449     vsubss(dst, nds, Address(rscratch1, 0));
3450   }
3451 }
3452 
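// Negate a single float/double element: src is expected to point at a sign-flip mask
// (only the sign bit set), so the XOR below toggles just the sign.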
3453 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3454   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3455   vxorps(dst, nds, src, Assembler::AVX_128bit);
3456 }
3457 
3458 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3459   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3460   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3461 }
3462 
3463 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3464   if (reachable(src)) {
3465     vxorpd(dst, nds, as_Address(src), vector_len);
3466   } else {
3467     lea(scratch_reg, src);
3468     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3469   }
3470 }
3471 
3472 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3473   if (reachable(src)) {
3474     vxorps(dst, nds, as_Address(src), vector_len);
3475   } else {
3476     lea(scratch_reg, src);
3477     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3478   }
3479 }
3480 
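// 256-bit integer vpxor requires AVX2; with plain AVX fall back to vxorpd, which
// produces the same bit pattern for a full-width XOR.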
3481 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3482   if (UseAVX > 1 || (vector_len < 1)) {
3483     if (reachable(src)) {
3484       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3485     } else {
3486       lea(scratch_reg, src);
3487       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3488     }
3489   }
3490   else {
3491     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3492   }
3493 }
3494 
3495 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3496   if (reachable(src)) {
3497     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3498   } else {
3499     lea(scratch_reg, src);
3500     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3501   }
3502 }
3503 
3504 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3505   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3506   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3507   // The inverted mask is sign-extended
3508   andptr(possibly_jweak, inverted_jweak_mask);
3509 }
3510 
3511 void MacroAssembler::resolve_jobject(Register value,
3512                                      Register thread,
3513                                      Register tmp) {
3514   assert_different_registers(value, thread, tmp);
3515   Label done, not_weak;
3516   testptr(value, value);
3517   jcc(Assembler::zero, done);                // Use NULL as-is.
3518   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3519   jcc(Assembler::zero, not_weak);
3520   // Resolve jweak.
3521   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3522                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3523   verify_oop(value);
3524   jmp(done);
3525   bind(not_weak);
3526   // Resolve (untagged) jobject.
3527   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3528   verify_oop(value);
3529   bind(done);
3530 }
3531 
3532 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3533   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3534 }
3535 
3536 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3537 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3538   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3539 }
3540 
3541 void MacroAssembler::subptr(Register dst, Register src) {
3542   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3543 }
3544 
3545 // C++ bool manipulation
3546 void MacroAssembler::testbool(Register dst) {
3547   if(sizeof(bool) == 1)
3548     testb(dst, 0xff);
3549   else if(sizeof(bool) == 2) {
3550     // testw implementation needed for two byte bools
3551     ShouldNotReachHere();
3552   } else if(sizeof(bool) == 4)
3553     testl(dst, dst);
3554   else
3555     // unsupported
3556     ShouldNotReachHere();
3557 }
3558 
3559 void MacroAssembler::testptr(Register dst, Register src) {
3560   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3561 }
3562 
3563 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3564 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3565                                    Register var_size_in_bytes,
3566                                    int con_size_in_bytes,
3567                                    Register t1,
3568                                    Register t2,
3569                                    Label& slow_case) {
3570   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3571   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3572 }
3573 
3574 RegSet MacroAssembler::call_clobbered_gp_registers() {
3575   RegSet regs;
3576 #ifdef _LP64
3577   regs += RegSet::of(rax, rcx, rdx);
3578 #ifndef WINDOWS
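  // rsi and rdi are callee-saved in the Windows x64 ABI, so they are only
  // call-clobbered on non-Windows targets.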
3579   regs += RegSet::of(rsi, rdi);
3580 #endif
3581   regs += RegSet::range(r8, r11);
3582 #else
3583   regs += RegSet::of(rax, rcx, rdx);
3584 #endif
3585   return regs;
3586 }
3587 
3588 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3589   int num_xmm_registers = XMMRegisterImpl::available_xmm_registers();
3590 #if defined(WINDOWS) && defined(_LP64)
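  // The Windows x64 ABI treats only xmm0-xmm5 as volatile (xmm6-xmm15 are
  // callee-saved); xmm16 and above, when present, are not preserved across calls.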
3591   XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3592   if (num_xmm_registers > 16) {
3593      result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3594   }
3595   return result;
3596 #else
3597   return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3598 #endif
3599 }
3600 
3601 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
3602 
3603 #ifndef _LP64
3604 static bool use_x87_registers() { return UseSSE < 2; }
3605 #endif
3606 static bool use_xmm_registers() { return UseSSE >= 1; }
3607 
3608 // C1 only ever uses the first double/float of the XMM register.
3609 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }
3610 
3611 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3612   if (UseSSE == 1) {
3613     masm->movflt(Address(rsp, offset), reg);
3614   } else {
3615     masm->movdbl(Address(rsp, offset), reg);
3616   }
3617 }
3618 
3619 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3620   if (UseSSE == 1) {
3621     masm->movflt(reg, Address(rsp, offset));
3622   } else {
3623     masm->movdbl(reg, Address(rsp, offset));
3624   }
3625 }
3626 
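// Computes the byte sizes of the GP, x87 and XMM spill sections (returned through the
// reference parameters) and returns the total save-area size.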
3627 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu,
3628                            int& gp_area_size, int& fp_area_size, int& xmm_area_size) {
3629 
3630   gp_area_size = align_up(gp_registers.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size,
3631                          StackAlignmentInBytes);
3632 #ifdef _LP64
3633   fp_area_size = 0;
3634 #else
3635   fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
3636 #endif
3637   xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;
3638 
3639   return gp_area_size + fp_area_size + xmm_area_size;
3640 }
3641 
3642 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3643   block_comment("push_call_clobbered_registers start");
3644   // Regular registers
3645   RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3646 
3647   int gp_area_size;
3648   int fp_area_size;
3649   int xmm_area_size;
3650   int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3651                                                gp_area_size, fp_area_size, xmm_area_size);
3652   subptr(rsp, total_save_size);
3653 
3654   push_set(gp_registers_to_push, 0);
3655 
3656 #ifndef _LP64
3657   if (save_fpu && use_x87_registers()) {
3658     fnsave(Address(rsp, gp_area_size));
3659     fwait();
3660   }
3661 #endif
3662   if (save_fpu && use_xmm_registers()) {
3663     push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
3664   }
3665 
3666   block_comment("push_call_clobbered_registers end");
3667 }
3668 
3669 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3670   block_comment("pop_call_clobbered_registers start");
3671 
3672   RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3673 
3674   int gp_area_size;
3675   int fp_area_size;
3676   int xmm_area_size;
3677   int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3678                                                gp_area_size, fp_area_size, xmm_area_size);
3679 
3680   if (restore_fpu && use_xmm_registers()) {
3681     pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
3682   }
3683 #ifndef _LP64
3684   if (restore_fpu && use_x87_registers()) {
3685     frstor(Address(rsp, gp_area_size));
3686   }
3687 #endif
3688 
3689   pop_set(gp_registers_to_pop, 0);
3690 
3691   addptr(rsp, total_save_size);
3692 
3693   vzeroupper();
3694 
3695   block_comment("pop_call_clobbered_registers end");
3696 }
3697 
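// Spills each XMM register in the set into the already-reserved area at [rsp + offset],
// advancing by xmm_save_size() per register.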
3698 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3699   assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3700   int spill_offset = offset;
3701 
3702   for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3703     save_xmm_register(this, spill_offset, *it);
3704     spill_offset += xmm_save_size();
3705   }
3706 }
3707 
3708 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3709   int restore_size = set.size() * xmm_save_size();
3710   assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3711 
3712   int restore_offset = offset + restore_size - xmm_save_size();
3713 
3714   for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3715     restore_xmm_register(this, restore_offset, *it);
3716     restore_offset -= xmm_save_size();
3717   }
3718 }
3719 
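// Spills each GP register in the set. With offset == -1 the required stack-aligned
// space is allocated here; otherwise the registers are stored into pre-reserved space
// starting at [rsp + offset].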
3720 void MacroAssembler::push_set(RegSet set, int offset) {
3721   int spill_offset;
3722   if (offset == -1) {
3723     int register_push_size = set.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3724     int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3725     subptr(rsp, aligned_size);
3726     spill_offset = 0;
3727   } else {
3728     spill_offset = offset;
3729   }
3730 
3731   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3732     movptr(Address(rsp, spill_offset), *it);
3733     spill_offset += RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3734   }
3735 }
3736 
3737 void MacroAssembler::pop_set(RegSet set, int offset) {
3738 
3739   int gp_reg_size = RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3740   int restore_size = set.size() * gp_reg_size;
3741   int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3742 
3743   int restore_offset;
3744   if (offset == -1) {
3745     restore_offset = restore_size - gp_reg_size;
3746   } else {
3747     restore_offset = offset + restore_size - gp_reg_size;
3748   }
3749   for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3750     movptr(*it, Address(rsp, restore_offset));
3751     restore_offset -= gp_reg_size;
3752   }
3753 
3754   if (offset == -1) {
3755     addptr(rsp, aligned_size);
3756   }
3757 }
3758 
3759 // Defines obj, preserves var_size_in_bytes
3760 void MacroAssembler::eden_allocate(Register thread, Register obj,
3761                                    Register var_size_in_bytes,
3762                                    int con_size_in_bytes,
3763                                    Register t1,
3764                                    Label& slow_case) {
3765   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3766   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3767 }
3768 
3769 // Preserves the contents of address; destroys the contents of length_in_bytes and temp.
3770 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3771   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3772   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3773   Label done;
3774 
3775   testptr(length_in_bytes, length_in_bytes);
3776   jcc(Assembler::zero, done);
3777 
3778   // initialize topmost word, divide index by 2, check if odd and test if zero
3779   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3780 #ifdef ASSERT
3781   {
3782     Label L;
3783     testptr(length_in_bytes, BytesPerWord - 1);
3784     jcc(Assembler::zero, L);
3785     stop("length must be a multiple of BytesPerWord");
3786     bind(L);
3787   }
3788 #endif
3789   Register index = length_in_bytes;
3790   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3791   if (UseIncDec) {
3792     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
3793   } else {
3794     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3795     shrptr(index, 1);
3796   }
3797 #ifndef _LP64
3798   // index might not have been a multiple of 8 (i.e., bit 2 was set)
3799   {
3800     Label even;
3801     // note: if index was a multiple of 8, then it cannot
3802     //       be 0 now otherwise it must have been 0 before
3803     //       => if it is even, we don't need to check for 0 again
3804     jcc(Assembler::carryClear, even);
3805     // clear topmost word (no jump would be needed if conditional assignment worked here)
3806     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3807     // index could be 0 now, must check again
3808     jcc(Assembler::zero, done);
3809     bind(even);
3810   }
3811 #endif // !_LP64
3812   // initialize remaining object fields: index is a multiple of 2 now
3813   {
3814     Label loop;
3815     bind(loop);
3816     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3817     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3818     decrement(index);
3819     jcc(Assembler::notZero, loop);
3820   }
3821 
3822   bind(done);
3823 }
3824 
3825 // Look up the method for a megamorphic invokeinterface call.
3826 // The target method is determined by <intf_klass, itable_index>.
3827 // The receiver klass is in recv_klass.
3828 // On success, the result will be in method_result, and execution falls through.
3829 // On failure, execution transfers to the given label.
3830 void MacroAssembler::lookup_interface_method(Register recv_klass,
3831                                              Register intf_klass,
3832                                              RegisterOrConstant itable_index,
3833                                              Register method_result,
3834                                              Register scan_temp,
3835                                              Label& L_no_such_interface,
3836                                              bool return_method) {
3837   assert_different_registers(recv_klass, intf_klass, scan_temp);
3838   assert_different_registers(method_result, intf_klass, scan_temp);
3839   assert(recv_klass != method_result || !return_method,
3840          "recv_klass can be destroyed when method isn't needed");
3841 
3842   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3843          "caller must use same register for non-constant itable index as for method");
3844 
3845   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3846   int vtable_base = in_bytes(Klass::vtable_start_offset());
3847   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3848   int scan_step   = itableOffsetEntry::size() * wordSize;
3849   int vte_size    = vtableEntry::size_in_bytes();
3850   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3851   assert(vte_size == wordSize, "else adjust times_vte_scale");
3852 
3853   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3854 
3855   // %%% Could store the aligned, prescaled offset in the klassoop.
3856   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3857 
3858   if (return_method) {
3859     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3860     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3861     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3862   }
3863 
3864   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3865   //   if (scan->interface() == intf) {
3866   //     result = (klass + scan->offset() + itable_index);
3867   //   }
3868   // }
3869   Label search, found_method;
3870 
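  // The loop body is emitted twice: the first copy (peel == 1) handles the initial
  // itable entry and branches straight to found_method on a hit; the second copy
  // forms the actual scan loop.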
3871   for (int peel = 1; peel >= 0; peel--) {
3872     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3873     cmpptr(intf_klass, method_result);
3874 
3875     if (peel) {
3876       jccb(Assembler::equal, found_method);
3877     } else {
3878       jccb(Assembler::notEqual, search);
3879       // (invert the test to fall through to found_method...)
3880     }
3881 
3882     if (!peel)  break;
3883 
3884     bind(search);
3885 
3886     // Check that the previous entry is non-null.  A null entry means that
3887     // the receiver class doesn't implement the interface, and wasn't the
3888     // same as when the caller was compiled.
3889     testptr(method_result, method_result);
3890     jcc(Assembler::zero, L_no_such_interface);
3891     addptr(scan_temp, scan_step);
3892   }
3893 
3894   bind(found_method);
3895 
3896   if (return_method) {
3897     // Got a hit.
3898     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3899     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3900   }
3901 }
3902 
3903 
3904 // virtual method calling
3905 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3906                                            RegisterOrConstant vtable_index,
3907                                            Register method_result) {
3908   const int base = in_bytes(Klass::vtable_start_offset());
3909   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3910   Address vtable_entry_addr(recv_klass,
3911                             vtable_index, Address::times_ptr,
3912                             base + vtableEntry::method_offset_in_bytes());
3913   movptr(method_result, vtable_entry_addr);
3914 }
3915 
3916 
3917 void MacroAssembler::check_klass_subtype(Register sub_klass,
3918                            Register super_klass,
3919                            Register temp_reg,
3920                            Label& L_success) {
3921   Label L_failure;
3922   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3923   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3924   bind(L_failure);
3925 }
3926 
3927 
3928 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3929                                                    Register super_klass,
3930                                                    Register temp_reg,
3931                                                    Label* L_success,
3932                                                    Label* L_failure,
3933                                                    Label* L_slow_path,
3934                                         RegisterOrConstant super_check_offset) {
3935   assert_different_registers(sub_klass, super_klass, temp_reg);
3936   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3937   if (super_check_offset.is_register()) {
3938     assert_different_registers(sub_klass, super_klass,
3939                                super_check_offset.as_register());
3940   } else if (must_load_sco) {
3941     assert(temp_reg != noreg, "supply either a temp or a register offset");
3942   }
3943 
3944   Label L_fallthrough;
3945   int label_nulls = 0;
3946   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3947   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3948   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3949   assert(label_nulls <= 1, "at most one NULL in the batch");
3950 
3951   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3952   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3953   Address super_check_offset_addr(super_klass, sco_offset);
3954 
3955   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3956   // range of a jccb.  If this routine grows larger, reconsider at
3957   // least some of these.
3958 #define local_jcc(assembler_cond, label)                                \
3959   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3960   else                             jcc( assembler_cond, label) /*omit semi*/
3961 
3962   // Hacked jmp, which may only be used just before L_fallthrough.
3963 #define final_jmp(label)                                                \
3964   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3965   else                            jmp(label)                /*omit semi*/
3966 
3967   // If the pointers are equal, we are done (e.g., String[] elements).
3968   // This self-check enables sharing of secondary supertype arrays among
3969   // non-primary types such as array-of-interface.  Otherwise, each such
3970   // type would need its own customized SSA.
3971   // We move this check to the front of the fast path because many
3972   // type checks are in fact trivially successful in this manner,
3973   // so we get a nicely predicted branch right at the start of the check.
3974   cmpptr(sub_klass, super_klass);
3975   local_jcc(Assembler::equal, *L_success);
3976 
3977   // Check the supertype display:
3978   if (must_load_sco) {
3979     // Positive movl does right thing on LP64.
3980     movl(temp_reg, super_check_offset_addr);
3981     super_check_offset = RegisterOrConstant(temp_reg);
3982   }
3983   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3984   cmpptr(super_klass, super_check_addr); // load displayed supertype
3985 
3986   // This check has worked decisively for primary supers.
3987   // Secondary supers are sought in the super_cache ('super_cache_addr').
3988   // (Secondary supers are interfaces and very deeply nested subtypes.)
3989   // This works in the same check above because of a tricky aliasing
3990   // between the super_cache and the primary super display elements.
3991   // (The 'super_check_addr' can address either, as the case requires.)
3992   // Note that the cache is updated below if it does not help us find
3993   // what we need immediately.
3994   // So if it was a primary super, we can just fail immediately.
3995   // Otherwise, it's the slow path for us (no success at this point).
3996 
3997   if (super_check_offset.is_register()) {
3998     local_jcc(Assembler::equal, *L_success);
3999     cmpl(super_check_offset.as_register(), sc_offset);
4000     if (L_failure == &L_fallthrough) {
4001       local_jcc(Assembler::equal, *L_slow_path);
4002     } else {
4003       local_jcc(Assembler::notEqual, *L_failure);
4004       final_jmp(*L_slow_path);
4005     }
4006   } else if (super_check_offset.as_constant() == sc_offset) {
4007     // Need a slow path; fast failure is impossible.
4008     if (L_slow_path == &L_fallthrough) {
4009       local_jcc(Assembler::equal, *L_success);
4010     } else {
4011       local_jcc(Assembler::notEqual, *L_slow_path);
4012       final_jmp(*L_success);
4013     }
4014   } else {
4015     // No slow path; it's a fast decision.
4016     if (L_failure == &L_fallthrough) {
4017       local_jcc(Assembler::equal, *L_success);
4018     } else {
4019       local_jcc(Assembler::notEqual, *L_failure);
4020       final_jmp(*L_success);
4021     }
4022   }
4023 
4024   bind(L_fallthrough);
4025 
4026 #undef local_jcc
4027 #undef final_jmp
4028 }
4029 
4030 
4031 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4032                                                    Register super_klass,
4033                                                    Register temp_reg,
4034                                                    Register temp2_reg,
4035                                                    Label* L_success,
4036                                                    Label* L_failure,
4037                                                    bool set_cond_codes) {
4038   assert_different_registers(sub_klass, super_klass, temp_reg);
4039   if (temp2_reg != noreg)
4040     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4041 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4042 
4043   Label L_fallthrough;
4044   int label_nulls = 0;
4045   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4046   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4047   assert(label_nulls <= 1, "at most one NULL in the batch");
4048 
4049   // a couple of useful fields in sub_klass:
4050   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4051   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4052   Address secondary_supers_addr(sub_klass, ss_offset);
4053   Address super_cache_addr(     sub_klass, sc_offset);
4054 
4055   // Do a linear scan of the secondary super-klass chain.
4056   // This code is rarely used, so simplicity is a virtue here.
4057   // The repne_scan instruction uses fixed registers, which we must spill.
4058   // Don't worry too much about pre-existing connections with the input regs.
4059 
4060   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4061   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4062 
4063   // Get super_klass value into rax (even if it was in rdi or rcx).
4064   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4065   if (super_klass != rax || UseCompressedOops) {
4066     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4067     mov(rax, super_klass);
4068   }
4069   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4070   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4071 
4072 #ifndef PRODUCT
4073   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4074   ExternalAddress pst_counter_addr((address) pst_counter);
4075   NOT_LP64(  incrementl(pst_counter_addr) );
4076   LP64_ONLY( lea(rcx, pst_counter_addr) );
4077   LP64_ONLY( incrementl(Address(rcx, 0)) );
4078 #endif //PRODUCT
4079 
4080   // We will consult the secondary-super array.
4081   movptr(rdi, secondary_supers_addr);
4082   // Load the array length.  (Positive movl does right thing on LP64.)
4083   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4084   // Skip to start of data.
4085   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4086 
4087   // Scan RCX words at [RDI] for an occurrence of RAX.
4088   // Set NZ/Z based on last compare.
4089   // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
4090   // not change flags (only the repeated scas instruction sets them).
4091   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4092 
4093     testptr(rax,rax); // Set Z = 0
4094     repne_scan();
4095 
4096   // Unspill the temp. registers:
4097   if (pushed_rdi)  pop(rdi);
4098   if (pushed_rcx)  pop(rcx);
4099   if (pushed_rax)  pop(rax);
4100 
4101   if (set_cond_codes) {
4102     // Special hack for the AD files:  rdi is guaranteed non-zero.
4103     assert(!pushed_rdi, "rdi must be left non-NULL");
4104     // Also, the condition codes are properly set Z/NZ on success/failure.
4105   }
4106 
4107   if (L_failure == &L_fallthrough)
4108         jccb(Assembler::notEqual, *L_failure);
4109   else  jcc(Assembler::notEqual, *L_failure);
4110 
4111   // Success.  Cache the super we found and proceed in triumph.
4112   movptr(super_cache_addr, super_klass);
4113 
4114   if (L_success != &L_fallthrough) {
4115     jmp(*L_success);
4116   }
4117 
4118 #undef IS_A_TEMP
4119 
4120   bind(L_fallthrough);
4121 }
4122 
4123 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4124   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4125 
4126   Label L_fallthrough;
4127   if (L_fast_path == NULL) {
4128     L_fast_path = &L_fallthrough;
4129   } else if (L_slow_path == NULL) {
4130     L_slow_path = &L_fallthrough;
4131   }
4132 
4133   // Fast path check: class is fully initialized
4134   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4135   jcc(Assembler::equal, *L_fast_path);
4136 
4137   // Fast path check: current thread is initializer thread
4138   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4139   if (L_slow_path == &L_fallthrough) {
4140     jcc(Assembler::equal, *L_fast_path);
4141     bind(*L_slow_path);
4142   } else if (L_fast_path == &L_fallthrough) {
4143     jcc(Assembler::notEqual, *L_slow_path);
4144     bind(*L_fast_path);
4145   } else {
4146     Unimplemented();
4147   }
4148 }
4149 
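// Conditional 32-bit move: uses cmovl when the CPU supports CMOV, otherwise
// synthesizes it with a short branch around a plain move.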
4150 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4151   if (VM_Version::supports_cmov()) {
4152     cmovl(cc, dst, src);
4153   } else {
4154     Label L;
4155     jccb(negate_condition(cc), L);
4156     movl(dst, src);
4157     bind(L);
4158   }
4159 }
4160 
4161 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4162   if (VM_Version::supports_cmov()) {
4163     cmovl(cc, dst, src);
4164   } else {
4165     Label L;
4166     jccb(negate_condition(cc), L);
4167     movl(dst, src);
4168     bind(L);
4169   }
4170 }
4171 
4172 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4173   if (!VerifyOops) return;
4174 
4175   // Pass register number to verify_oop_subroutine
4176   const char* b = NULL;
4177   {
4178     ResourceMark rm;
4179     stringStream ss;
4180     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4181     b = code_string(ss.as_string());
4182   }
4183   BLOCK_COMMENT("verify_oop {");
4184 #ifdef _LP64
4185   push(rscratch1);                    // save r10, trashed by movptr()
4186 #endif
4187   push(rax);                          // save rax
4188   push(reg);                          // pass register argument
4189   ExternalAddress buffer((address) b);
4190   // avoid using pushptr, as it modifies scratch registers
4191   // and our contract is not to modify anything
4192   movptr(rax, buffer.addr());
4193   push(rax);
4194   // call indirectly to solve generation ordering problem
4195   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4196   call(rax);
4197   // Caller pops the arguments (oop, message) and restores rax, r10
4198   BLOCK_COMMENT("} verify_oop");
4199 }
4200 
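// Sets every bit of dst: with AVX-512 (plus VL for sub-512-bit lengths) vpternlogd
// with an all-ones truth table is used, otherwise vpcmpeqb dst,dst,dst.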
4201 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4202   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4203     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4204   } else {
4205     assert(UseAVX > 0, "");
4206     vpcmpeqb(dst, dst, dst, vector_len);
4207   }
4208 }
4209 
4210 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4211                                          int extra_slot_offset) {
4212   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4213   int stackElementSize = Interpreter::stackElementSize;
4214   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4215 #ifdef ASSERT
4216   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4217   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4218 #endif
4219   Register             scale_reg    = noreg;
4220   Address::ScaleFactor scale_factor = Address::no_scale;
4221   if (arg_slot.is_constant()) {
4222     offset += arg_slot.as_constant() * stackElementSize;
4223   } else {
4224     scale_reg    = arg_slot.as_register();
4225     scale_factor = Address::times(stackElementSize);
4226   }
4227   offset += wordSize;           // return PC is on stack
4228   return Address(rsp, scale_reg, scale_factor, offset);
4229 }
4230 
4231 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4232   if (!VerifyOops) return;
4233 
4234   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4235   // Pass register number to verify_oop_subroutine
4236   const char* b = NULL;
4237   {
4238     ResourceMark rm;
4239     stringStream ss;
4240     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4241     b = code_string(ss.as_string());
4242   }
4243 #ifdef _LP64
4244   push(rscratch1);                    // save r10, trashed by movptr()
4245 #endif
4246   push(rax);                          // save rax
4247   // addr may contain rsp so we will have to adjust it based on the push
4248   // we just did (and on 64 bit we do two pushes)
4249   // NOTE: the 64-bit code used to have a bug here: it did movq(addr, rax), which
4250   // stores rax into addr, the reverse of what was intended.
4251   if (addr.uses(rsp)) {
4252     lea(rax, addr);
4253     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4254   } else {
4255     pushptr(addr);
4256   }
4257 
4258   ExternalAddress buffer((address) b);
4259   // pass msg argument
4260   // avoid using pushptr, as it modifies scratch registers
4261   // and our contract is not to modify anything
4262   movptr(rax, buffer.addr());
4263   push(rax);
4264 
4265   // call indirectly to solve generation ordering problem
4266   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4267   call(rax);
4268   // Caller pops the arguments (addr, message) and restores rax, r10.
4269 }
4270 
4271 void MacroAssembler::verify_tlab() {
4272 #ifdef ASSERT
4273   if (UseTLAB && VerifyOops) {
4274     Label next, ok;
4275     Register t1 = rsi;
4276     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4277 
4278     push(t1);
4279     NOT_LP64(push(thread_reg));
4280     NOT_LP64(get_thread(thread_reg));
4281 
4282     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4283     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4284     jcc(Assembler::aboveEqual, next);
4285     STOP("assert(top >= start)");
4286     should_not_reach_here();
4287 
4288     bind(next);
4289     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4290     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4291     jcc(Assembler::aboveEqual, ok);
4292     STOP("assert(top <= end)");
4293     should_not_reach_here();
4294 
4295     bind(ok);
4296     NOT_LP64(pop(thread_reg));
4297     pop(t1);
4298   }
4299 #endif
4300 }
4301 
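// The helper classes below decode the CPU state image handed to _print_CPU_state()
// and _verify_FPU(): the x87 fsave area (control/status/tag words and the register
// stack) plus the pushed integer registers and eflags, so the state can be
// pretty-printed for debugging.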
4302 class ControlWord {
4303  public:
4304   int32_t _value;
4305 
4306   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4307   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4308   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4309   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4310   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4311   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4312   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4313   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4314 
4315   void print() const {
4316     // rounding control
4317     const char* rc;
4318     switch (rounding_control()) {
4319       case 0: rc = "round near"; break;
4320       case 1: rc = "round down"; break;
4321       case 2: rc = "round up  "; break;
4322       case 3: rc = "chop      "; break;
4323       default:
4324         rc = NULL; // silence compiler warnings
4325         fatal("Unknown rounding control: %d", rounding_control());
4326     };
4327     // precision control
4328     const char* pc;
4329     switch (precision_control()) {
4330       case 0: pc = "24 bits "; break;
4331       case 1: pc = "reserved"; break;
4332       case 2: pc = "53 bits "; break;
4333       case 3: pc = "64 bits "; break;
4334       default:
4335         pc = NULL; // silence compiler warnings
4336         fatal("Unknown precision control: %d", precision_control());
4337     };
4338     // flags
4339     char f[9];
4340     f[0] = ' ';
4341     f[1] = ' ';
4342     f[2] = (precision   ()) ? 'P' : 'p';
4343     f[3] = (underflow   ()) ? 'U' : 'u';
4344     f[4] = (overflow    ()) ? 'O' : 'o';
4345     f[5] = (zero_divide ()) ? 'Z' : 'z';
4346     f[6] = (denormalized()) ? 'D' : 'd';
4347     f[7] = (invalid     ()) ? 'I' : 'i';
4348     f[8] = '\x0';
4349     // output
4350     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4351   }
4352 
4353 };
4354 
4355 class StatusWord {
4356  public:
4357   int32_t _value;
4358 
4359   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4360   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4361   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4362   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4363   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4364   int  top() const                     { return  (_value >> 11) & 7      ; }
4365   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4366   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4367   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4368   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4369   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4370   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4371   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4372   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4373 
4374   void print() const {
4375     // condition codes
4376     char c[5];
4377     c[0] = (C3()) ? '3' : '-';
4378     c[1] = (C2()) ? '2' : '-';
4379     c[2] = (C1()) ? '1' : '-';
4380     c[3] = (C0()) ? '0' : '-';
4381     c[4] = '\x0';
4382     // flags
4383     char f[9];
4384     f[0] = (error_status()) ? 'E' : '-';
4385     f[1] = (stack_fault ()) ? 'S' : '-';
4386     f[2] = (precision   ()) ? 'P' : '-';
4387     f[3] = (underflow   ()) ? 'U' : '-';
4388     f[4] = (overflow    ()) ? 'O' : '-';
4389     f[5] = (zero_divide ()) ? 'Z' : '-';
4390     f[6] = (denormalized()) ? 'D' : '-';
4391     f[7] = (invalid     ()) ? 'I' : '-';
4392     f[8] = '\x0';
4393     // output
4394     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4395   }
4396 
4397 };
4398 
4399 class TagWord {
4400  public:
4401   int32_t _value;
4402 
4403   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4404 
4405   void print() const {
4406     printf("%04x", _value & 0xFFFF);
4407   }
4408 
4409 };
4410 
4411 class FPU_Register {
4412  public:
4413   int32_t _m0;
4414   int32_t _m1;
4415   int16_t _ex;
4416 
4417   bool is_indefinite() const           {
4418     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4419   }
4420 
4421   void print() const {
4422     char  sign = (_ex < 0) ? '-' : '+';
4423     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4424     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4425   };
4426 
4427 };
4428 
4429 class FPU_State {
4430  public:
4431   enum {
4432     register_size       = 10,
4433     number_of_registers =  8,
4434     register_mask       =  7
4435   };
4436 
4437   ControlWord  _control_word;
4438   StatusWord   _status_word;
4439   TagWord      _tag_word;
4440   int32_t      _error_offset;
4441   int32_t      _error_selector;
4442   int32_t      _data_offset;
4443   int32_t      _data_selector;
4444   int8_t       _register[register_size * number_of_registers];
4445 
4446   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4447   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4448 
4449   const char* tag_as_string(int tag) const {
4450     switch (tag) {
4451       case 0: return "valid";
4452       case 1: return "zero";
4453       case 2: return "special";
4454       case 3: return "empty";
4455     }
4456     ShouldNotReachHere();
4457     return NULL;
4458   }
4459 
4460   void print() const {
4461     // print computation registers
4462     { int t = _status_word.top();
4463       for (int i = 0; i < number_of_registers; i++) {
4464         int j = (i - t) & register_mask;
4465         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4466         st(j)->print();
4467         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4468       }
4469     }
4470     printf("\n");
4471     // print control registers
4472     printf("ctrl = "); _control_word.print(); printf("\n");
4473     printf("stat = "); _status_word .print(); printf("\n");
4474     printf("tags = "); _tag_word    .print(); printf("\n");
4475   }
4476 
4477 };
4478 
4479 class Flag_Register {
4480  public:
4481   int32_t _value;
4482 
4483   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4484   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4485   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4486   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4487   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4488   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4489   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4490 
4491   void print() const {
4492     // flags
4493     char f[8];
4494     f[0] = (overflow       ()) ? 'O' : '-';
4495     f[1] = (direction      ()) ? 'D' : '-';
4496     f[2] = (sign           ()) ? 'S' : '-';
4497     f[3] = (zero           ()) ? 'Z' : '-';
4498     f[4] = (auxiliary_carry()) ? 'A' : '-';
4499     f[5] = (parity         ()) ? 'P' : '-';
4500     f[6] = (carry          ()) ? 'C' : '-';
4501     f[7] = '\x0';
4502     // output
4503     printf("%08x  flags = %s", _value, f);
4504   }
4505 
4506 };
4507 
4508 class IU_Register {
4509  public:
4510   int32_t _value;
4511 
4512   void print() const {
4513     printf("%08x  %11d", _value, _value);
4514   }
4515 
4516 };
4517 
4518 class IU_State {
4519  public:
4520   Flag_Register _eflags;
4521   IU_Register   _rdi;
4522   IU_Register   _rsi;
4523   IU_Register   _rbp;
4524   IU_Register   _rsp;
4525   IU_Register   _rbx;
4526   IU_Register   _rdx;
4527   IU_Register   _rcx;
4528   IU_Register   _rax;
4529 
4530   void print() const {
4531     // computation registers
4532     printf("rax,  = "); _rax.print(); printf("\n");
4533     printf("rbx,  = "); _rbx.print(); printf("\n");
4534     printf("rcx  = "); _rcx.print(); printf("\n");
4535     printf("rdx  = "); _rdx.print(); printf("\n");
4536     printf("rdi  = "); _rdi.print(); printf("\n");
4537     printf("rsi  = "); _rsi.print(); printf("\n");
4538     printf("rbp,  = "); _rbp.print(); printf("\n");
4539     printf("rsp  = "); _rsp.print(); printf("\n");
4540     printf("\n");
4541     // control registers
4542     printf("flgs = "); _eflags.print(); printf("\n");
4543   }
4544 };
4545 
4546 
4547 class CPU_State {
4548  public:
4549   FPU_State _fpu_state;
4550   IU_State  _iu_state;
4551 
4552   void print() const {
4553     printf("--------------------------------------------------\n");
4554     _iu_state .print();
4555     printf("\n");
4556     _fpu_state.print();
4557     printf("--------------------------------------------------\n");
4558   }
4559 
4560 };
4561 
4562 
4563 static void _print_CPU_state(CPU_State* state) {
4564   state->print();
4565 };
4566 
4567 
4568 void MacroAssembler::print_CPU_state() {
4569   push_CPU_state();
4570   push(rsp);                // pass CPU state
4571   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4572   addptr(rsp, wordSize);       // discard argument
4573   pop_CPU_state();
4574 }
4575 
4576 
4577 #ifndef _LP64
4578 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4579   static int counter = 0;
4580   FPU_State* fs = &state->_fpu_state;
4581   counter++;
4582   // For leaf calls, only verify that the top few elements remain empty.
4583   // We only need 1 empty at the top for C2 code.
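  // (An x87 tag value of 3 means "empty"; see FPU_State::tag_as_string.)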
  if (stack_depth < 0) {
    if (fs->tag_for_st(7) != 3) {
4586       printf("FPR7 not empty\n");
4587       state->print();
4588       assert(false, "error");
4589       return false;
4590     }
4591     return true;                // All other stack states do not matter
4592   }
4593 
4594   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4595          "bad FPU control word");
4596 
4597   // compute stack depth
4598   int i = 0;
4599   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4600   int d = i;
4601   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4602   // verify findings
4603   if (i != FPU_State::number_of_registers) {
4604     // stack not contiguous
4605     printf("%s: stack not contiguous at ST%d\n", s, i);
4606     state->print();
4607     assert(false, "error");
4608     return false;
4609   }
4610   // check if computed stack depth corresponds to expected stack depth
4611   if (stack_depth < 0) {
4612     // expected stack depth is -stack_depth or less
4613     if (d > -stack_depth) {
4614       // too many elements on the stack
4615       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4616       state->print();
4617       assert(false, "error");
4618       return false;
4619     }
4620   } else {
4621     // expected stack depth is stack_depth
4622     if (d != stack_depth) {
4623       // wrong stack depth
4624       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4625       state->print();
4626       assert(false, "error");
4627       return false;
4628     }
4629   }
4630   // everything is cool
4631   return true;
4632 }
4633 
4634 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4635   if (!VerifyFPU) return;
4636   push_CPU_state();
4637   push(rsp);                // pass CPU state
4638   ExternalAddress msg((address) s);
4639   // pass message string s
4640   pushptr(msg.addr());
4641   push(stack_depth);        // pass stack depth
4642   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4643   addptr(rsp, 3 * wordSize);   // discard arguments
4644   // check for error
4645   { Label L;
4646     testl(rax, rax);
4647     jcc(Assembler::notZero, L);
4648     int3();                  // break if error condition
4649     bind(L);
4650   }
4651   pop_CPU_state();
4652 }
#endif // !_LP64
4654 
4655 void MacroAssembler::restore_cpu_control_state_after_jni() {
4656   // Either restore the MXCSR register after returning from the JNI Call
4657   // or verify that it wasn't changed (with -Xcheck:jni flag).
4658   if (VM_Version::supports_sse()) {
4659     if (RestoreMXCSROnJNICalls) {
4660       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4661     } else if (CheckJNICalls) {
4662       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4663     }
4664   }
4665   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4666   vzeroupper();
4667 
4668 #ifndef _LP64
4669   // Either restore the x87 floating pointer control word after returning
4670   // from the JNI call or verify that it wasn't changed.
4671   if (CheckJNICalls) {
4672     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4673   }
#endif // !_LP64
4675 }
4676 
4677 // ((OopHandle)result).resolve();
4678 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4679   assert_different_registers(result, tmp);
4680 
4681   // Only 64 bit platforms support GCs that require a tmp register
4682   // Only IN_HEAP loads require a thread_tmp register
4683   // OopHandle::resolve is an indirection like jobject.
4684   access_load_at(T_OBJECT, IN_NATIVE,
4685                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4686 }
4687 
4688 // ((WeakHandle)result).resolve();
4689 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4690   assert_different_registers(rresult, rtmp);
4691   Label resolved;
4692 
4693   // A null weak handle resolves to null.
4694   cmpptr(rresult, 0);
4695   jcc(Assembler::equal, resolved);
4696 
4697   // Only 64 bit platforms support GCs that require a tmp register
4698   // Only IN_HEAP loads require a thread_tmp register
4699   // WeakHandle::resolve is an indirection like jweak.
4700   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4701                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4702   bind(resolved);
4703 }
4704 
4705 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4706   // get mirror
4707   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4708   load_method_holder(mirror, method);
4709   movptr(mirror, Address(mirror, mirror_offset));
4710   resolve_oop_handle(mirror, tmp);
4711 }
4712 
4713 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4714   load_method_holder(rresult, rmethod);
4715   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4716 }
4717 
4718 void MacroAssembler::load_method_holder(Register holder, Register method) {
4719   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4720   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4721   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4722 }
4723 
4724 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4725   assert_different_registers(src, tmp);
4726   assert_different_registers(dst, tmp);
4727 #ifdef _LP64
4728   if (UseCompressedClassPointers) {
4729     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4730     decode_klass_not_null(dst, tmp);
4731   } else
4732 #endif
4733     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4734 }
4735 
4736 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4737   assert_different_registers(src, tmp);
4738   assert_different_registers(dst, tmp);
4739 #ifdef _LP64
4740   if (UseCompressedClassPointers) {
4741     encode_klass_not_null(src, tmp);
4742     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4743   } else
4744 #endif
4745     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4746 }
4747 
4748 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4749                                     Register tmp1, Register thread_tmp) {
4750   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4751   decorators = AccessInternal::decorator_fixup(decorators);
4752   bool as_raw = (decorators & AS_RAW) != 0;
4753   if (as_raw) {
4754     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4755   } else {
4756     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4757   }
4758 }
4759 
4760 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4761                                      Register tmp1, Register tmp2, Register tmp3) {
4762   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4763   decorators = AccessInternal::decorator_fixup(decorators);
4764   bool as_raw = (decorators & AS_RAW) != 0;
4765   if (as_raw) {
4766     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4767   } else {
4768     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4769   }
4770 }
4771 
4772 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4773                                    Register thread_tmp, DecoratorSet decorators) {
4774   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4775 }
4776 
4777 // Doesn't do verification, generates fixed size code
4778 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4779                                             Register thread_tmp, DecoratorSet decorators) {
4780   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4781 }
4782 
4783 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4784                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4785   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
4786 }
4787 
4788 // Used for storing NULLs.
4789 void MacroAssembler::store_heap_oop_null(Address dst) {
4790   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4791 }
4792 
4793 #ifdef _LP64
4794 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4795   if (UseCompressedClassPointers) {
4796     // Store to klass gap in destination
4797     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4798   }
4799 }
4800 
4801 #ifdef ASSERT
4802 void MacroAssembler::verify_heapbase(const char* msg) {
4803   assert (UseCompressedOops, "should be compressed");
4804   assert (Universe::heap() != NULL, "java heap should be initialized");
4805   if (CheckCompressedOops) {
4806     Label ok;
4807     const auto src2 = ExternalAddress((address)CompressedOops::ptrs_base_addr());
4808     assert(!src2.is_lval(), "should not be lval");
4809     const bool is_src2_reachable = reachable(src2);
4810     if (!is_src2_reachable) {
4811       push(rscratch1);  // cmpptr trashes rscratch1
4812     }
4813     cmpptr(r12_heapbase, src2);
4814     jcc(Assembler::equal, ok);
4815     STOP(msg);
4816     bind(ok);
4817     if (!is_src2_reachable) {
4818       pop(rscratch1);
4819     }
4820   }
4821 }
4822 #endif
4823 
4824 // Algorithm must match oop.inline.hpp encode_heap_oop.
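//
// In pseudocode, with shift == LogMinObjAlignmentInBytes:
//   if (base == NULL)  narrow = oop >> shift;
//   else               narrow = (((oop == NULL) ? base : oop) - base) >> shift;   // NULL encodes to 0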
4825 void MacroAssembler::encode_heap_oop(Register r) {
4826 #ifdef ASSERT
4827   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4828 #endif
4829   verify_oop_msg(r, "broken oop in encode_heap_oop");
4830   if (CompressedOops::base() == NULL) {
4831     if (CompressedOops::shift() != 0) {
4832       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4833       shrq(r, LogMinObjAlignmentInBytes);
4834     }
4835     return;
4836   }
4837   testq(r, r);
4838   cmovq(Assembler::equal, r, r12_heapbase);
4839   subq(r, r12_heapbase);
4840   shrq(r, LogMinObjAlignmentInBytes);
4841 }
4842 
4843 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4844 #ifdef ASSERT
4845   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4846   if (CheckCompressedOops) {
4847     Label ok;
4848     testq(r, r);
4849     jcc(Assembler::notEqual, ok);
4850     STOP("null oop passed to encode_heap_oop_not_null");
4851     bind(ok);
4852   }
4853 #endif
4854   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4855   if (CompressedOops::base() != NULL) {
4856     subq(r, r12_heapbase);
4857   }
4858   if (CompressedOops::shift() != 0) {
4859     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4860     shrq(r, LogMinObjAlignmentInBytes);
4861   }
4862 }
4863 
4864 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4865 #ifdef ASSERT
4866   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4867   if (CheckCompressedOops) {
4868     Label ok;
4869     testq(src, src);
4870     jcc(Assembler::notEqual, ok);
4871     STOP("null oop passed to encode_heap_oop_not_null2");
4872     bind(ok);
4873   }
4874 #endif
4875   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4876   if (dst != src) {
4877     movq(dst, src);
4878   }
4879   if (CompressedOops::base() != NULL) {
4880     subq(dst, r12_heapbase);
4881   }
4882   if (CompressedOops::shift() != 0) {
4883     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4884     shrq(dst, LogMinObjAlignmentInBytes);
4885   }
4886 }
4887 
4888 void  MacroAssembler::decode_heap_oop(Register r) {
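  // Inverse of encode_heap_oop:
  //   if (base == NULL)  oop = narrow << shift;
  //   else               oop = (narrow == 0) ? NULL : (narrow << shift) + base;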
4889 #ifdef ASSERT
4890   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4891 #endif
4892   if (CompressedOops::base() == NULL) {
4893     if (CompressedOops::shift() != 0) {
4894       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4895       shlq(r, LogMinObjAlignmentInBytes);
4896     }
4897   } else {
4898     Label done;
4899     shlq(r, LogMinObjAlignmentInBytes);
4900     jccb(Assembler::equal, done);
4901     addq(r, r12_heapbase);
4902     bind(done);
4903   }
4904   verify_oop_msg(r, "broken oop in decode_heap_oop");
4905 }
4906 
4907 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4908   // Note: it will change flags
4909   assert (UseCompressedOops, "should only be used for compressed headers");
4910   assert (Universe::heap() != NULL, "java heap should be initialized");
4911   // Cannot assert, unverified entry point counts instructions (see .ad file)
4912   // vtableStubs also counts instructions in pd_code_size_limit.
4913   // Also do not verify_oop as this is called by verify_oop.
4914   if (CompressedOops::shift() != 0) {
4915     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4916     shlq(r, LogMinObjAlignmentInBytes);
4917     if (CompressedOops::base() != NULL) {
4918       addq(r, r12_heapbase);
4919     }
4920   } else {
4921     assert (CompressedOops::base() == NULL, "sanity");
4922   }
4923 }
4924 
4925 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4926   // Note: it will change flags
4927   assert (UseCompressedOops, "should only be used for compressed headers");
4928   assert (Universe::heap() != NULL, "java heap should be initialized");
4929   // Cannot assert, unverified entry point counts instructions (see .ad file)
4930   // vtableStubs also counts instructions in pd_code_size_limit.
4931   // Also do not verify_oop as this is called by verify_oop.
4932   if (CompressedOops::shift() != 0) {
4933     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4934     if (LogMinObjAlignmentInBytes == Address::times_8) {
4935       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4936     } else {
4937       if (dst != src) {
4938         movq(dst, src);
4939       }
4940       shlq(dst, LogMinObjAlignmentInBytes);
4941       if (CompressedOops::base() != NULL) {
4942         addq(dst, r12_heapbase);
4943       }
4944     }
4945   } else {
4946     assert (CompressedOops::base() == NULL, "sanity");
4947     if (dst != src) {
4948       movq(dst, src);
4949     }
4950   }
4951 }
4952 
4953 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
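  // narrow_klass = (klass - base) >> shift.  The base is loaded into tmp with
  // mov64 since it need not fit in a 32-bit immediate.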
4954   assert_different_registers(r, tmp);
4955   if (CompressedKlassPointers::base() != NULL) {
4956     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4957     subq(r, tmp);
4958   }
4959   if (CompressedKlassPointers::shift() != 0) {
4960     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4961     shrq(r, LogKlassAlignmentInBytes);
4962   }
4963 }
4964 
4965 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4966   assert_different_registers(src, dst);
4967   if (CompressedKlassPointers::base() != NULL) {
4968     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4969     addq(dst, src);
4970   } else {
4971     movptr(dst, src);
4972   }
4973   if (CompressedKlassPointers::shift() != 0) {
4974     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4975     shrq(dst, LogKlassAlignmentInBytes);
4976   }
4977 }
4978 
4979 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4980   assert_different_registers(r, tmp);
4981   // Note: it will change flags
4982   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4983   // Cannot assert, unverified entry point counts instructions (see .ad file)
4984   // vtableStubs also counts instructions in pd_code_size_limit.
4985   // Also do not verify_oop as this is called by verify_oop.
4986   if (CompressedKlassPointers::shift() != 0) {
4987     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4988     shlq(r, LogKlassAlignmentInBytes);
4989   }
4990   if (CompressedKlassPointers::base() != NULL) {
4991     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4992     addq(r, tmp);
4993   }
4994 }
4995 
4996 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
4997   assert_different_registers(src, dst);
4998   // Note: it will change flags
4999   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5000   // Cannot assert, unverified entry point counts instructions (see .ad file)
5001   // vtableStubs also counts instructions in pd_code_size_limit.
5002   // Also do not verify_oop as this is called by verify_oop.
5003 
5004   if (CompressedKlassPointers::base() == NULL &&
5005       CompressedKlassPointers::shift() == 0) {
5006     // The best case scenario is that there is no base or shift. Then it is already
5007     // a pointer that needs nothing but a register rename.
5008     movl(dst, src);
5009   } else {
5010     if (CompressedKlassPointers::base() != NULL) {
5011       mov64(dst, (int64_t)CompressedKlassPointers::base());
5012     } else {
5013       xorq(dst, dst);
5014     }
5015     if (CompressedKlassPointers::shift() != 0) {
5016       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5017       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5018       leaq(dst, Address(dst, src, Address::times_8, 0));
5019     } else {
5020       addq(dst, src);
5021     }
5022   }
5023 }
5024 
5025 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5026   assert (UseCompressedOops, "should only be used for compressed headers");
5027   assert (Universe::heap() != NULL, "java heap should be initialized");
5028   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5029   int oop_index = oop_recorder()->find_index(obj);
5030   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5031   mov_narrow_oop(dst, oop_index, rspec);
5032 }
5033 
5034 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5035   assert (UseCompressedOops, "should only be used for compressed headers");
5036   assert (Universe::heap() != NULL, "java heap should be initialized");
5037   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5038   int oop_index = oop_recorder()->find_index(obj);
5039   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5040   mov_narrow_oop(dst, oop_index, rspec);
5041 }
5042 
5043 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5044   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5045   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5046   int klass_index = oop_recorder()->find_index(k);
5047   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5048   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5049 }
5050 
5051 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5052   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5053   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5054   int klass_index = oop_recorder()->find_index(k);
5055   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5056   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5057 }
5058 
5059 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5060   assert (UseCompressedOops, "should only be used for compressed headers");
5061   assert (Universe::heap() != NULL, "java heap should be initialized");
5062   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5063   int oop_index = oop_recorder()->find_index(obj);
5064   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5065   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5066 }
5067 
5068 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5069   assert (UseCompressedOops, "should only be used for compressed headers");
5070   assert (Universe::heap() != NULL, "java heap should be initialized");
5071   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5072   int oop_index = oop_recorder()->find_index(obj);
5073   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5074   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5075 }
5076 
5077 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5078   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5079   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5080   int klass_index = oop_recorder()->find_index(k);
5081   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5082   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5083 }
5084 
5085 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5086   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5087   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5088   int klass_index = oop_recorder()->find_index(k);
5089   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5090   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5091 }
5092 
5093 void MacroAssembler::reinit_heapbase() {
5094   if (UseCompressedOops) {
5095     if (Universe::heap() != NULL) {
5096       if (CompressedOops::base() == NULL) {
5097         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5098       } else {
5099         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5100       }
5101     } else {
5102       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5103     }
5104   }
5105 }
5106 
5107 #endif // _LP64
5108 
5109 // C2 compiled method's prolog code.
5110 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5111 
5112   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5113   // NativeJump::patch_verified_entry will be able to patch out the entry
5114   // code safely. The push to verify stack depth is ok at 5 bytes,
5115   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5116   // stack bang then we must use the 6 byte frame allocation even if
5117   // we have no frame. :-(
5118   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5119 
5120   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5121   // Remove word for return addr
5122   framesize -= wordSize;
5123   stack_bang_size -= wordSize;
5124 
5125   // Calls to C2R adapters often do not accept exceptional returns.
5126   // We require that their callers must bang for them.  But be careful, because
5127   // some VM calls (such as call site linkage) can use several kilobytes of
5128   // stack.  But the stack safety zone should account for that.
5129   // See bugs 4446381, 4468289, 4497237.
5130   if (stack_bang_size > 0) {
5131     generate_stack_overflow_check(stack_bang_size);
5132 
5133     // We always push rbp, so that on return to interpreter rbp, will be
5134     // restored correctly and we can correct the stack.
5135     push(rbp);
5136     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5137     if (PreserveFramePointer) {
5138       mov(rbp, rsp);
5139     }
5140     // Remove word for ebp
5141     framesize -= wordSize;
5142 
5143     // Create frame
5144     if (framesize) {
5145       subptr(rsp, framesize);
5146     }
5147   } else {
5148     // Create frame (force generation of a 4 byte immediate value)
5149     subptr_imm32(rsp, framesize);
5150 
5151     // Save RBP register now.
5152     framesize -= wordSize;
5153     movptr(Address(rsp, framesize), rbp);
5154     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5155     if (PreserveFramePointer) {
5156       movptr(rbp, rsp);
5157       if (framesize > 0) {
5158         addptr(rbp, framesize);
5159       }
5160     }
5161   }
5162 
5163   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5164     framesize -= wordSize;
5165     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5166   }
5167 
5168 #ifndef _LP64
5169   // If method sets FPU control word do it now
5170   if (fp_mode_24b) {
5171     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5172   }
5173   if (UseSSE >= 2 && VerifyFPU) {
5174     verify_FPU(0, "FPU stack must be clean on entry");
5175   }
5176 #endif
5177 
5178 #ifdef ASSERT
5179   if (VerifyStackAtCalls) {
5180     Label L;
5181     push(rax);
5182     mov(rax, rsp);
5183     andptr(rax, StackAlignmentInBytes-1);
5184     cmpptr(rax, StackAlignmentInBytes-wordSize);
5185     pop(rax);
5186     jcc(Assembler::equal, L);
5187     STOP("Stack is not properly aligned!");
5188     bind(L);
5189   }
5190 #endif
5191 
5192   if (!is_stub) {
5193     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5194     bs->nmethod_entry_barrier(this);
5195   }
5196 }
5197 
5198 #if COMPILER2_OR_JVMCI
5199 
5200 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5201 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5202   // cnt - number of qwords (8-byte words).
5203   // base - start address, qword aligned.
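  // Strategy: zero 64 bytes (8 qwords) per L_loop iteration, then clear the
  // remaining 0..7 qwords either with masked vector stores (AVX-512) or with a
  // 32-byte store followed by the per-qword L_sloop.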
5204   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5205   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5206   if (use64byteVector) {
5207     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5208   } else if (MaxVectorSize >= 32) {
5209     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5210   } else {
5211     pxor(xtmp, xtmp);
5212   }
5213   jmp(L_zero_64_bytes);
5214 
5215   BIND(L_loop);
5216   if (MaxVectorSize >= 32) {
5217     fill64(base, 0, xtmp, use64byteVector);
5218   } else {
5219     movdqu(Address(base,  0), xtmp);
5220     movdqu(Address(base, 16), xtmp);
5221     movdqu(Address(base, 32), xtmp);
5222     movdqu(Address(base, 48), xtmp);
5223   }
5224   addptr(base, 64);
5225 
5226   BIND(L_zero_64_bytes);
5227   subptr(cnt, 8);
5228   jccb(Assembler::greaterEqual, L_loop);
5229 
5230   // Copy trailing 64 bytes
5231   if (use64byteVector) {
5232     addptr(cnt, 8);
5233     jccb(Assembler::equal, L_end);
5234     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5235     jmp(L_end);
5236   } else {
5237     addptr(cnt, 4);
5238     jccb(Assembler::less, L_tail);
5239     if (MaxVectorSize >= 32) {
5240       vmovdqu(Address(base, 0), xtmp);
5241     } else {
5242       movdqu(Address(base,  0), xtmp);
5243       movdqu(Address(base, 16), xtmp);
5244     }
5245   }
5246   addptr(base, 32);
5247   subptr(cnt, 4);
5248 
5249   BIND(L_tail);
5250   addptr(cnt, 4);
5251   jccb(Assembler::lessEqual, L_end);
5252   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5253     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5254   } else {
5255     decrement(cnt);
5256 
5257     BIND(L_sloop);
5258     movq(Address(base, 0), xtmp);
5259     addptr(base, 8);
5260     decrement(cnt);
5261     jccb(Assembler::greaterEqual, L_sloop);
5262   }
5263   BIND(L_end);
5264 }
5265 
5266 // Clearing constant sized memory using YMM/ZMM registers.
5267 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5268   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5269   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5270 
5271   int vector64_count = (cnt & (~0x7)) >> 3;
5272   cnt = cnt & 0x7;
5273   const int fill64_per_loop = 4;
5274   const int max_unrolled_fill64 = 8;
5275 
5276   // 64 byte initialization loop.
5277   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5278   int start64 = 0;
5279   if (vector64_count > max_unrolled_fill64) {
5280     Label LOOP;
5281     Register index = rtmp;
5282 
5283     start64 = vector64_count - (vector64_count % fill64_per_loop);
5284 
5285     movl(index, 0);
5286     BIND(LOOP);
5287     for (int i = 0; i < fill64_per_loop; i++) {
5288       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5289     }
5290     addl(index, fill64_per_loop * 64);
5291     cmpl(index, start64 * 64);
5292     jccb(Assembler::less, LOOP);
5293   }
5294   for (int i = start64; i < vector64_count; i++) {
5295     fill64(base, i * 64, xtmp, use64byteVector);
5296   }
5297 
5298   // Clear remaining 64 byte tail.
5299   int disp = vector64_count * 64;
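  // cnt now holds the number of trailing qwords (0..7); the k-mask immediates
  // below (0x7, 0x1F, 0x3F, 0x7F) enable exactly that many 8-byte lanes for
  // the masked stores.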
5300   if (cnt) {
5301     switch (cnt) {
5302       case 1:
5303         movq(Address(base, disp), xtmp);
5304         break;
5305       case 2:
5306         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
5307         break;
5308       case 3:
5309         movl(rtmp, 0x7);
5310         kmovwl(mask, rtmp);
5311         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
5312         break;
5313       case 4:
5314         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5315         break;
5316       case 5:
5317         if (use64byteVector) {
5318           movl(rtmp, 0x1F);
5319           kmovwl(mask, rtmp);
5320           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5321         } else {
5322           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5323           movq(Address(base, disp + 32), xtmp);
5324         }
5325         break;
5326       case 6:
5327         if (use64byteVector) {
5328           movl(rtmp, 0x3F);
5329           kmovwl(mask, rtmp);
5330           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5331         } else {
5332           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5333           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
5334         }
5335         break;
5336       case 7:
5337         if (use64byteVector) {
5338           movl(rtmp, 0x7F);
5339           kmovwl(mask, rtmp);
5340           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5341         } else {
5342           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5343           movl(rtmp, 0x7);
5344           kmovwl(mask, rtmp);
5345           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
5346         }
5347         break;
5348       default:
5349         fatal("Unexpected length : %d\n",cnt);
5350         break;
5351     }
5352   }
5353 }
5354 
5355 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5356                                bool is_large, KRegister mask) {
5357   // cnt      - number of qwords (8-byte words).
5358   // base     - start address, qword aligned.
5359   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5360   assert(base==rdi, "base register must be edi for rep stos");
5361   assert(tmp==rax,   "tmp register must be eax for rep stos");
5362   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5363   assert(InitArrayShortSize % BytesPerLong == 0,
5364     "InitArrayShortSize should be the multiple of BytesPerLong");
5365 
5366   Label DONE;
5367   if (!is_large || !UseXMMForObjInit) {
5368     xorptr(tmp, tmp);
5369   }
5370 
5371   if (!is_large) {
5372     Label LOOP, LONG;
5373     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5374     jccb(Assembler::greater, LONG);
5375 
5376     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5377 
5378     decrement(cnt);
5379     jccb(Assembler::negative, DONE); // Zero length
5380 
5381     // Use individual pointer-sized stores for small counts:
5382     BIND(LOOP);
5383     movptr(Address(base, cnt, Address::times_ptr), tmp);
5384     decrement(cnt);
5385     jccb(Assembler::greaterEqual, LOOP);
5386     jmpb(DONE);
5387 
5388     BIND(LONG);
5389   }
5390 
5391   // Use longer rep-prefixed ops for non-small counts:
5392   if (UseFastStosb) {
5393     shlptr(cnt, 3); // convert to number of bytes
5394     rep_stosb();
5395   } else if (UseXMMForObjInit) {
5396     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5397   } else {
5398     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5399     rep_stos();
5400   }
5401 
5402   BIND(DONE);
5403 }
5404 
5405 #endif //COMPILER2_OR_JVMCI
5406 
5407 
5408 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5409                                    Register to, Register value, Register count,
5410                                    Register rtmp, XMMRegister xtmp) {
5411   ShortBranchVerifier sbv(this);
5412   assert_different_registers(to, value, count, rtmp);
5413   Label L_exit;
5414   Label L_fill_2_bytes, L_fill_4_bytes;
5415 
5416 #if defined(COMPILER2) && defined(_LP64)
  if (MaxVectorSize >= 32 &&
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {
5420     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5421     return;
5422   }
5423 #endif
5424 
5425   int shift = -1;
5426   switch (t) {
5427     case T_BYTE:
5428       shift = 2;
5429       break;
5430     case T_SHORT:
5431       shift = 1;
5432       break;
5433     case T_INT:
5434       shift = 0;
5435       break;
5436     default: ShouldNotReachHere();
5437   }
5438 
5439   if (t == T_BYTE) {
5440     andl(value, 0xff);
5441     movl(rtmp, value);
5442     shll(rtmp, 8);
5443     orl(value, rtmp);
5444   }
5445   if (t == T_SHORT) {
5446     andl(value, 0xffff);
5447   }
5448   if (t == T_BYTE || t == T_SHORT) {
5449     movl(rtmp, value);
5450     shll(rtmp, 16);
5451     orl(value, rtmp);
5452   }
5453 
5454   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5455   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5456   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5457     Label L_skip_align2;
5458     // align source address at 4 bytes address boundary
5459     if (t == T_BYTE) {
5460       Label L_skip_align1;
5461       // One byte misalignment happens only for byte arrays
5462       testptr(to, 1);
5463       jccb(Assembler::zero, L_skip_align1);
5464       movb(Address(to, 0), value);
5465       increment(to);
5466       decrement(count);
5467       BIND(L_skip_align1);
5468     }
5469     // Two bytes misalignment happens only for byte and short (char) arrays
5470     testptr(to, 2);
5471     jccb(Assembler::zero, L_skip_align2);
5472     movw(Address(to, 0), value);
5473     addptr(to, 2);
5474     subl(count, 1<<(shift-1));
5475     BIND(L_skip_align2);
5476   }
5477   if (UseSSE < 2) {
5478     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5479     // Fill 32-byte chunks
5480     subl(count, 8 << shift);
5481     jcc(Assembler::less, L_check_fill_8_bytes);
5482     align(16);
5483 
5484     BIND(L_fill_32_bytes_loop);
5485 
5486     for (int i = 0; i < 32; i += 4) {
5487       movl(Address(to, i), value);
5488     }
5489 
5490     addptr(to, 32);
5491     subl(count, 8 << shift);
5492     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5493     BIND(L_check_fill_8_bytes);
5494     addl(count, 8 << shift);
5495     jccb(Assembler::zero, L_exit);
5496     jmpb(L_fill_8_bytes);
5497 
5498     //
5499     // length is too short, just fill qwords
5500     //
5501     BIND(L_fill_8_bytes_loop);
5502     movl(Address(to, 0), value);
5503     movl(Address(to, 4), value);
5504     addptr(to, 8);
5505     BIND(L_fill_8_bytes);
5506     subl(count, 1 << (shift + 1));
5507     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5508     // fall through to fill 4 bytes
5509   } else {
5510     Label L_fill_32_bytes;
5511     if (!UseUnalignedLoadStores) {
5512       // align to 8 bytes, we know we are 4 byte aligned to start
5513       testptr(to, 4);
5514       jccb(Assembler::zero, L_fill_32_bytes);
5515       movl(Address(to, 0), value);
5516       addptr(to, 4);
5517       subl(count, 1<<shift);
5518     }
5519     BIND(L_fill_32_bytes);
5520     {
5521       assert( UseSSE >= 2, "supported cpu only" );
5522       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5523       movdl(xtmp, value);
5524       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5525         Label L_check_fill_32_bytes;
5526         if (UseAVX > 2) {
5527           // Fill 64-byte chunks
5528           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5529 
5530           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
5531           cmpl(count, VM_Version::avx3_threshold());
5532           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5533 
5534           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5535 
5536           subl(count, 16 << shift);
5537           jccb(Assembler::less, L_check_fill_32_bytes);
5538           align(16);
5539 
5540           BIND(L_fill_64_bytes_loop_avx3);
5541           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5542           addptr(to, 64);
5543           subl(count, 16 << shift);
5544           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5545           jmpb(L_check_fill_32_bytes);
5546 
5547           BIND(L_check_fill_64_bytes_avx2);
5548         }
5549         // Fill 64-byte chunks
5550         Label L_fill_64_bytes_loop;
5551         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5552 
5553         subl(count, 16 << shift);
5554         jcc(Assembler::less, L_check_fill_32_bytes);
5555         align(16);
5556 
5557         BIND(L_fill_64_bytes_loop);
5558         vmovdqu(Address(to, 0), xtmp);
5559         vmovdqu(Address(to, 32), xtmp);
5560         addptr(to, 64);
5561         subl(count, 16 << shift);
5562         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5563 
5564         BIND(L_check_fill_32_bytes);
5565         addl(count, 8 << shift);
5566         jccb(Assembler::less, L_check_fill_8_bytes);
5567         vmovdqu(Address(to, 0), xtmp);
5568         addptr(to, 32);
5569         subl(count, 8 << shift);
5570 
5571         BIND(L_check_fill_8_bytes);
5572         // clean upper bits of YMM registers
5573         movdl(xtmp, value);
5574         pshufd(xtmp, xtmp, 0);
5575       } else {
5576         // Fill 32-byte chunks
5577         pshufd(xtmp, xtmp, 0);
5578 
5579         subl(count, 8 << shift);
5580         jcc(Assembler::less, L_check_fill_8_bytes);
5581         align(16);
5582 
5583         BIND(L_fill_32_bytes_loop);
5584 
5585         if (UseUnalignedLoadStores) {
5586           movdqu(Address(to, 0), xtmp);
5587           movdqu(Address(to, 16), xtmp);
5588         } else {
5589           movq(Address(to, 0), xtmp);
5590           movq(Address(to, 8), xtmp);
5591           movq(Address(to, 16), xtmp);
5592           movq(Address(to, 24), xtmp);
5593         }
5594 
5595         addptr(to, 32);
5596         subl(count, 8 << shift);
5597         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5598 
5599         BIND(L_check_fill_8_bytes);
5600       }
5601       addl(count, 8 << shift);
5602       jccb(Assembler::zero, L_exit);
5603       jmpb(L_fill_8_bytes);
5604 
5605       //
5606       // length is too short, just fill qwords
5607       //
5608       BIND(L_fill_8_bytes_loop);
5609       movq(Address(to, 0), xtmp);
5610       addptr(to, 8);
5611       BIND(L_fill_8_bytes);
5612       subl(count, 1 << (shift + 1));
5613       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5614     }
5615   }
5616   // fill trailing 4 bytes
5617   BIND(L_fill_4_bytes);
5618   testl(count, 1<<shift);
5619   jccb(Assembler::zero, L_fill_2_bytes);
5620   movl(Address(to, 0), value);
5621   if (t == T_BYTE || t == T_SHORT) {
5622     Label L_fill_byte;
5623     addptr(to, 4);
5624     BIND(L_fill_2_bytes);
5625     // fill trailing 2 bytes
5626     testl(count, 1<<(shift-1));
5627     jccb(Assembler::zero, L_fill_byte);
5628     movw(Address(to, 0), value);
5629     if (t == T_BYTE) {
5630       addptr(to, 2);
5631       BIND(L_fill_byte);
5632       // fill trailing byte
5633       testl(count, 1);
5634       jccb(Assembler::zero, L_exit);
5635       movb(Address(to, 0), value);
5636     } else {
5637       BIND(L_fill_byte);
5638     }
5639   } else {
5640     BIND(L_fill_2_bytes);
5641   }
5642   BIND(L_exit);
5643 }
5644 
5645 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
5646   switch(type) {
5647     case T_BYTE:
5648     case T_BOOLEAN:
5649       evpbroadcastb(dst, src, vector_len);
5650       break;
5651     case T_SHORT:
5652     case T_CHAR:
5653       evpbroadcastw(dst, src, vector_len);
5654       break;
5655     case T_INT:
5656     case T_FLOAT:
5657       evpbroadcastd(dst, src, vector_len);
5658       break;
5659     case T_LONG:
5660     case T_DOUBLE:
5661       evpbroadcastq(dst, src, vector_len);
5662       break;
5663     default:
5664       fatal("Unhandled type : %s", type2name(type));
5665       break;
5666   }
5667 }
5668 
5669 // encode char[] to byte[] in ISO_8859_1 or ASCII
//   @IntrinsicCandidate
//   private static int implEncodeISOArray(byte[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = StringUTF16.getChar(sa, sp++);
//       if (c > '\u00FF')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
//
//   @IntrinsicCandidate
//   private static int implEncodeAsciiArray(char[] sa, int sp,
//                                           byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = sa[sp++];
//       if (c >= '\u0080')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
5695 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5696   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5697   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5698   Register tmp5, Register result, bool ascii) {
5699 
5700   // rsi: src
5701   // rdi: dst
5702   // rdx: len
5703   // rcx: tmp5
5704   // rax: result
5705   ShortBranchVerifier sbv(this);
5706   assert_different_registers(src, dst, len, tmp5, result);
5707   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5708 
5709   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5710   int short_mask = ascii ? 0xff80 : 0xff00;
5711 
5712   // set result
5713   xorl(result, result);
5714   // check for zero length
5715   testl(len, len);
5716   jcc(Assembler::zero, L_done);
5717 
5718   movl(result, len);
5719 
5720   // Setup pointers
5721   lea(src, Address(src, len, Address::times_2)); // char[]
5722   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5723   negptr(len);
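  // src/dst now point one element past the end of their data and len holds the
  // negative element count, so Address(src, len, Address::times_2) starts at the
  // first char and the loops count len up towards zero.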
5724 
5725   if (UseSSE42Intrinsics || UseAVX >= 2) {
5726     Label L_copy_8_chars, L_copy_8_chars_exit;
5727     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5728 
5729     if (UseAVX >= 2) {
5730       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5731       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5732       movdl(tmp1Reg, tmp5);
5733       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5734       jmp(L_chars_32_check);
5735 
5736       bind(L_copy_32_chars);
5737       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5738       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5739       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5740       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5741       jccb(Assembler::notZero, L_copy_32_chars_exit);
5742       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5743       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5744       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5745 
5746       bind(L_chars_32_check);
5747       addptr(len, 32);
5748       jcc(Assembler::lessEqual, L_copy_32_chars);
5749 
5750       bind(L_copy_32_chars_exit);
5751       subptr(len, 16);
5752       jccb(Assembler::greater, L_copy_16_chars_exit);
5753 
5754     } else if (UseSSE42Intrinsics) {
5755       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5756       movdl(tmp1Reg, tmp5);
5757       pshufd(tmp1Reg, tmp1Reg, 0);
5758       jmpb(L_chars_16_check);
5759     }
5760 
5761     bind(L_copy_16_chars);
5762     if (UseAVX >= 2) {
5763       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5764       vptest(tmp2Reg, tmp1Reg);
5765       jcc(Assembler::notZero, L_copy_16_chars_exit);
5766       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5767       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5768     } else {
5769       if (UseAVX > 0) {
5770         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5771         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5772         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5773       } else {
5774         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5775         por(tmp2Reg, tmp3Reg);
5776         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5777         por(tmp2Reg, tmp4Reg);
5778       }
5779       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5780       jccb(Assembler::notZero, L_copy_16_chars_exit);
5781       packuswb(tmp3Reg, tmp4Reg);
5782     }
5783     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5784 
5785     bind(L_chars_16_check);
5786     addptr(len, 16);
5787     jcc(Assembler::lessEqual, L_copy_16_chars);
5788 
5789     bind(L_copy_16_chars_exit);
5790     if (UseAVX >= 2) {
5791       // clean upper bits of YMM registers
5792       vpxor(tmp2Reg, tmp2Reg);
5793       vpxor(tmp3Reg, tmp3Reg);
5794       vpxor(tmp4Reg, tmp4Reg);
5795       movdl(tmp1Reg, tmp5);
5796       pshufd(tmp1Reg, tmp1Reg, 0);
5797     }
5798     subptr(len, 8);
5799     jccb(Assembler::greater, L_copy_8_chars_exit);
5800 
5801     bind(L_copy_8_chars);
5802     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5803     ptest(tmp3Reg, tmp1Reg);
5804     jccb(Assembler::notZero, L_copy_8_chars_exit);
5805     packuswb(tmp3Reg, tmp1Reg);
5806     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5807     addptr(len, 8);
5808     jccb(Assembler::lessEqual, L_copy_8_chars);
5809 
5810     bind(L_copy_8_chars_exit);
5811     subptr(len, 8);
5812     jccb(Assembler::zero, L_done);
5813   }
5814 
5815   bind(L_copy_1_char);
5816   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5817   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
5818   jccb(Assembler::notZero, L_copy_1_char_exit);
5819   movb(Address(dst, len, Address::times_1, 0), tmp5);
5820   addptr(len, 1);
5821   jccb(Assembler::less, L_copy_1_char);
5822 
5823   bind(L_copy_1_char_exit);
5824   addptr(result, len); // len is negative count of not processed elements
5825 
5826   bind(L_done);
5827 }
5828 
5829 #ifdef _LP64
5830 /**
5831  * Helper for multiply_to_len().
5832  */
5833 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5834   addq(dest_lo, src1);
5835   adcq(dest_hi, 0);
5836   addq(dest_lo, src2);
5837   adcq(dest_hi, 0);
5838 }
5839 
5840 /**
5841  * Multiply 64 bit by 64 bit first loop.
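 *
 * x, y and z are addressed with times_4 scaling (32-bit limbs, most significant
 * limb first); adjacent limb pairs are loaded as one quadword and rotated by 32
 * bits so that the more significant limb lands in the upper half of the register.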
5842  */
5843 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5844                                            Register y, Register y_idx, Register z,
5845                                            Register carry, Register product,
5846                                            Register idx, Register kdx) {
5847   //
5848   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5850   //    huge_128 product = y[idx] * x[xstart] + carry;
5851   //    z[kdx] = (jlong)product;
5852   //    carry  = (jlong)(product >>> 64);
5853   //  }
5854   //  z[xstart] = carry;
5855   //
5856 
5857   Label L_first_loop, L_first_loop_exit;
5858   Label L_one_x, L_one_y, L_multiply;
5859 
5860   decrementl(xstart);
5861   jcc(Assembler::negative, L_one_x);
5862 
5863   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5864   rorq(x_xstart, 32); // convert big-endian to little-endian
5865 
5866   bind(L_first_loop);
5867   decrementl(idx);
5868   jcc(Assembler::negative, L_first_loop_exit);
5869   decrementl(idx);
5870   jcc(Assembler::negative, L_one_y);
5871   movq(y_idx, Address(y, idx, Address::times_4,  0));
5872   rorq(y_idx, 32); // convert big-endian to little-endian
5873   bind(L_multiply);
5874   movq(product, x_xstart);
5875   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5876   addq(product, carry);
5877   adcq(rdx, 0);
5878   subl(kdx, 2);
5879   movl(Address(z, kdx, Address::times_4,  4), product);
5880   shrq(product, 32);
5881   movl(Address(z, kdx, Address::times_4,  0), product);
5882   movq(carry, rdx);
5883   jmp(L_first_loop);
5884 
5885   bind(L_one_y);
5886   movl(y_idx, Address(y,  0));
5887   jmp(L_multiply);
5888 
5889   bind(L_one_x);
5890   movl(x_xstart, Address(x,  0));
5891   jmp(L_first_loop);
5892 
5893   bind(L_first_loop_exit);
5894 }
5895 
5896 /**
5897  * Multiply 64 bit by 64 bit and add 128 bit.
5898  */
5899 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5900                                             Register yz_idx, Register idx,
5901                                             Register carry, Register product, int offset) {
5902   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5903   //     z[kdx] = (jlong)product;
5904 
5905   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5906   rorq(yz_idx, 32); // convert big-endian to little-endian
5907   movq(product, x_xstart);
5908   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5909   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5910   rorq(yz_idx, 32); // convert big-endian to little-endian
5911 
5912   add2_with_carry(rdx, product, carry, yz_idx);
5913 
5914   movl(Address(z, idx, Address::times_4,  offset+4), product);
5915   shrq(product, 32);
5916   movl(Address(z, idx, Address::times_4,  offset), product);
5917 
5918 }
5919 
5920 /**
5921  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5922  */
5923 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5924                                              Register yz_idx, Register idx, Register jdx,
5925                                              Register carry, Register product,
5926                                              Register carry2) {
5927   //   jlong carry, x[], y[], z[];
5928   //   int kdx = ystart+1;
5929   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5930   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5931   //     z[kdx+idx+1] = (jlong)product;
5932   //     jlong carry2  = (jlong)(product >>> 64);
5933   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5934   //     z[kdx+idx] = (jlong)product;
5935   //     carry  = (jlong)(product >>> 64);
5936   //   }
5937   //   idx += 2;
5938   //   if (idx > 0) {
5939   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5940   //     z[kdx+idx] = (jlong)product;
5941   //     carry  = (jlong)(product >>> 64);
5942   //   }
5943   //
5944 
5945   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5946 
5947   movl(jdx, idx);
5948   andl(jdx, 0xFFFFFFFC);
5949   shrl(jdx, 2);
5950 
5951   bind(L_third_loop);
5952   subl(jdx, 1);
5953   jcc(Assembler::negative, L_third_loop_exit);
5954   subl(idx, 4);
5955 
5956   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5957   movq(carry2, rdx);
5958 
5959   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5960   movq(carry, rdx);
5961   jmp(L_third_loop);
5962 
5963   bind (L_third_loop_exit);
5964 
5965   andl (idx, 0x3);
5966   jcc(Assembler::zero, L_post_third_loop_done);
5967 
5968   Label L_check_1;
5969   subl(idx, 2);
5970   jcc(Assembler::negative, L_check_1);
5971 
5972   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5973   movq(carry, rdx);
5974 
5975   bind (L_check_1);
5976   addl (idx, 0x2);
5977   andl (idx, 0x1);
5978   subl(idx, 1);
5979   jcc(Assembler::negative, L_post_third_loop_done);
5980 
5981   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5982   movq(product, x_xstart);
5983   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5984   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5985 
5986   add2_with_carry(rdx, product, yz_idx, carry);
5987 
5988   movl(Address(z, idx, Address::times_4,  0), product);
5989   shrq(product, 32);
5990 
5991   shlq(rdx, 32);
5992   orq(product, rdx);
5993   movq(carry, product);
5994 
5995   bind(L_post_third_loop_done);
5996 }
5997 
5998 /**
5999  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6000  *
6001  */
6002 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6003                                                   Register carry, Register carry2,
6004                                                   Register idx, Register jdx,
6005                                                   Register yz_idx1, Register yz_idx2,
6006                                                   Register tmp, Register tmp3, Register tmp4) {
6007   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6008 
6009   //   jlong carry, x[], y[], z[];
6010   //   int kdx = ystart+1;
6011   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6012   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6013   //     jlong carry2  = (jlong)(tmp3 >>> 64);
6014   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6015   //     carry  = (jlong)(tmp4 >>> 64);
6016   //     z[kdx+idx+1] = (jlong)tmp3;
6017   //     z[kdx+idx] = (jlong)tmp4;
6018   //   }
6019   //   idx += 2;
6020   //   if (idx > 0) {
6021   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6022   //     z[kdx+idx] = (jlong)yz_idx1;
6023   //     carry  = (jlong)(yz_idx1 >>> 64);
6024   //   }
6025   //
6026 
6027   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6028 
6029   movl(jdx, idx);
6030   andl(jdx, 0xFFFFFFFC);
6031   shrl(jdx, 2);
6032 
6033   bind(L_third_loop);
6034   subl(jdx, 1);
6035   jcc(Assembler::negative, L_third_loop_exit);
6036   subl(idx, 4);
6037 
6038   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6039   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6040   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6041   rorxq(yz_idx2, yz_idx2, 32);
6042 
6043   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6044   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6045 
6046   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6047   rorxq(yz_idx1, yz_idx1, 32);
6048   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6049   rorxq(yz_idx2, yz_idx2, 32);
6050 
6051   if (VM_Version::supports_adx()) {
6052     adcxq(tmp3, carry);
6053     adoxq(tmp3, yz_idx1);
6054 
6055     adcxq(tmp4, tmp);
6056     adoxq(tmp4, yz_idx2);
6057 
6058     movl(carry, 0); // does not affect flags
6059     adcxq(carry2, carry);
6060     adoxq(carry2, carry);
6061   } else {
6062     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6063     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6064   }
6065   movq(carry, carry2);
6066 
6067   movl(Address(z, idx, Address::times_4, 12), tmp3);
6068   shrq(tmp3, 32);
6069   movl(Address(z, idx, Address::times_4,  8), tmp3);
6070 
6071   movl(Address(z, idx, Address::times_4,  4), tmp4);
6072   shrq(tmp4, 32);
6073   movl(Address(z, idx, Address::times_4,  0), tmp4);
6074 
6075   jmp(L_third_loop);
6076 
6077   bind (L_third_loop_exit);
6078 
6079   andl (idx, 0x3);
6080   jcc(Assembler::zero, L_post_third_loop_done);
6081 
6082   Label L_check_1;
6083   subl(idx, 2);
6084   jcc(Assembler::negative, L_check_1);
6085 
6086   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6087   rorxq(yz_idx1, yz_idx1, 32);
6088   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6089   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6090   rorxq(yz_idx2, yz_idx2, 32);
6091 
6092   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6093 
6094   movl(Address(z, idx, Address::times_4,  4), tmp3);
6095   shrq(tmp3, 32);
6096   movl(Address(z, idx, Address::times_4,  0), tmp3);
6097   movq(carry, tmp4);
6098 
6099   bind (L_check_1);
6100   addl (idx, 0x2);
6101   andl (idx, 0x1);
6102   subl(idx, 1);
6103   jcc(Assembler::negative, L_post_third_loop_done);
6104   movl(tmp4, Address(y, idx, Address::times_4,  0));
6105   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6106   movl(tmp4, Address(z, idx, Address::times_4,  0));
6107 
6108   add2_with_carry(carry2, tmp3, tmp4, carry);
6109 
6110   movl(Address(z, idx, Address::times_4,  0), tmp3);
6111   shrq(tmp3, 32);
6112 
6113   shlq(carry2, 32);
6114   orq(tmp3, carry2);
6115   movq(carry, tmp3);
6116 
6117   bind(L_post_third_loop_done);
6118 }
6119 
6120 /**
6121  * Code for BigInteger::multiplyToLen() intrinsic.
6122  *
6123  * rdi: x
6124  * rax: xlen
6125  * rsi: y
6126  * rcx: ylen
6127  * r8:  z
6128  * r11: zlen
6129  * r12: tmp1
6130  * r13: tmp2
6131  * r14: tmp3
6132  * r15: tmp4
6133  * rbx: tmp5
6134  *
6135  */
6136 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6137                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6138   ShortBranchVerifier sbv(this);
6139   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6140 
6141   push(tmp1);
6142   push(tmp2);
6143   push(tmp3);
6144   push(tmp4);
6145   push(tmp5);
6146 
6147   push(xlen);
6148   push(zlen);
6149 
6150   const Register idx = tmp1;
6151   const Register kdx = tmp2;
6152   const Register xstart = tmp3;
6153 
6154   const Register y_idx = tmp4;
6155   const Register carry = tmp5;
6156   const Register product  = xlen;
6157   const Register x_xstart = zlen;  // reuse register
6158 
6159   // First Loop.
6160   //
6161   //  final static long LONG_MASK = 0xffffffffL;
6162   //  int xstart = xlen - 1;
6163   //  int ystart = ylen - 1;
6164   //  long carry = 0;
6165   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6166   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6167   //    z[kdx] = (int)product;
6168   //    carry = product >>> 32;
6169   //  }
6170   //  z[xstart] = (int)carry;
6171   //
6172 
6173   movl(idx, ylen);      // idx = ylen;
6174   movl(kdx, zlen);      // kdx = xlen+ylen;
6175   xorq(carry, carry);   // carry = 0;
6176 
6177   Label L_done;
6178 
6179   movl(xstart, xlen);
6180   decrementl(xstart);
6181   jcc(Assembler::negative, L_done);
6182 
6183   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6184 
6185   Label L_second_loop;
6186   testl(kdx, kdx);
6187   jcc(Assembler::zero, L_second_loop);
6188 
6189   Label L_carry;
6190   subl(kdx, 1);
6191   jcc(Assembler::zero, L_carry);
6192 
6193   movl(Address(z, kdx, Address::times_4,  0), carry);
6194   shrq(carry, 32);
6195   subl(kdx, 1);
6196 
6197   bind(L_carry);
6198   movl(Address(z, kdx, Address::times_4,  0), carry);
6199 
6200   // Second and third (nested) loops.
6201   //
6202   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6203   //   carry = 0;
6204   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6205   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6206   //                    (z[k] & LONG_MASK) + carry;
6207   //     z[k] = (int)product;
6208   //     carry = product >>> 32;
6209   //   }
6210   //   z[i] = (int)carry;
6211   // }
6212   //
6213   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6214 
6215   const Register jdx = tmp1;
6216 
6217   bind(L_second_loop);
6218   xorl(carry, carry);    // carry = 0;
6219   movl(jdx, ylen);       // j = ystart+1
6220 
6221   subl(xstart, 1);       // i = xstart-1;
6222   jcc(Assembler::negative, L_done);
6223 
6224   push (z);
6225 
6226   Label L_last_x;
6227   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6228   subl(xstart, 1);       // i = xstart-1;
6229   jcc(Assembler::negative, L_last_x);
6230 
6231   if (UseBMI2Instructions) {
6232     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6233     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6234   } else {
6235     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6236     rorq(x_xstart, 32);  // convert big-endian to little-endian
6237   }
6238 
6239   Label L_third_loop_prologue;
6240   bind(L_third_loop_prologue);
6241 
6242   push (x);
6243   push (xstart);
6244   push (ylen);
6245 
6246 
6247   if (UseBMI2Instructions) {
6248     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6249   } else { // !UseBMI2Instructions
6250     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6251   }
6252 
6253   pop(ylen);
6254   pop(xlen);
6255   pop(x);
6256   pop(z);
6257 
6258   movl(tmp3, xlen);
6259   addl(tmp3, 1);
6260   movl(Address(z, tmp3, Address::times_4,  0), carry);
6261   subl(tmp3, 1);
6262   jccb(Assembler::negative, L_done);
6263 
6264   shrq(carry, 32);
6265   movl(Address(z, tmp3, Address::times_4,  0), carry);
6266   jmp(L_second_loop);
6267 
6268   // Next infrequent code is moved outside loops.
6269   bind(L_last_x);
6270   if (UseBMI2Instructions) {
6271     movl(rdx, Address(x,  0));
6272   } else {
6273     movl(x_xstart, Address(x,  0));
6274   }
6275   jmp(L_third_loop_prologue);
6276 
6277   bind(L_done);
6278 
6279   pop(zlen);
6280   pop(xlen);
6281 
6282   pop(tmp5);
6283   pop(tmp4);
6284   pop(tmp3);
6285   pop(tmp2);
6286   pop(tmp1);
6287 }
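
// A hedged, word-level reference of the whole intrinsic: a scalar schoolbook multiply
// of 32-bit words mirroring the First/Second/Third loop pseudocode above. Names are
// illustrative; x, y and z are int arrays in the same most-significant-word-first
// order used by java.math.BigInteger, and z is assumed to start zeroed (which makes
// the First loop just the i == xlen-1 case of the Second loop).
//
//   static void multiply_to_len_ref(const uint32_t* x, int xlen,
//                                   const uint32_t* y, int ylen, uint32_t* z) {
//     for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
//     for (int i = xlen - 1; i >= 0; i--) {                       // Second loop
//       uint64_t carry = 0;
//       for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {  // Third loop
//         uint64_t product = (uint64_t)y[j] * x[i] + z[k] + carry;
//         z[k] = (uint32_t)product;
//         carry = product >> 32;
//       }
//       z[i] = (uint32_t)carry;
//     }
//   }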
6288 
6289 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6290   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6291   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6292   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6293   Label VECTOR8_TAIL, VECTOR4_TAIL;
6294   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6295   Label SAME_TILL_END, DONE;
6296   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6297 
6298   //scale is in rcx in both Win64 and Unix
6299   ShortBranchVerifier sbv(this);
6300 
6301   shlq(length);
6302   xorq(result, result);
6303 
6304   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6305       VM_Version::supports_avx512vlbw()) {
6306     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6307 
6308     cmpq(length, 64);
6309     jcc(Assembler::less, VECTOR32_TAIL);
6310 
6311     movq(tmp1, length);
6312     andq(tmp1, 0x3F);      // tail count
6313     andq(length, ~(0x3F)); //vector count
6314 
6315     bind(VECTOR64_LOOP);
6316     // AVX512 code to compare 64 byte vectors.
6317     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
6318     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6319     kortestql(k7, k7);
6320     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6321     addq(result, 64);
6322     subq(length, 64);
6323     jccb(Assembler::notZero, VECTOR64_LOOP);
6324 
6326     testq(tmp1, tmp1);
6327     jcc(Assembler::zero, SAME_TILL_END);
6328 
6329     //bind(VECTOR64_TAIL);
6330     // AVX512 code to compare up to 63 byte vectors.
6331     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6332     shlxq(tmp2, tmp2, tmp1);
6333     notq(tmp2);
6334     kmovql(k3, tmp2);
6335 
6336     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6337     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6338 
6339     ktestql(k7, k3);
6340     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6341 
6342     bind(VECTOR64_NOT_EQUAL);
6343     kmovql(tmp1, k7);
6344     notq(tmp1);
6345     tzcntq(tmp1, tmp1);
6346     addq(result, tmp1);
6347     shrq(result);
6348     jmp(DONE);
6349     bind(VECTOR32_TAIL);
6350   }
6351 
6352   cmpq(length, 8);
6353   jcc(Assembler::equal, VECTOR8_LOOP);
6354   jcc(Assembler::less, VECTOR4_TAIL);
6355 
6356   if (UseAVX >= 2) {
6357     Label VECTOR16_TAIL, VECTOR32_LOOP;
6358 
6359     cmpq(length, 16);
6360     jcc(Assembler::equal, VECTOR16_LOOP);
6361     jcc(Assembler::less, VECTOR8_LOOP);
6362 
6363     cmpq(length, 32);
6364     jccb(Assembler::less, VECTOR16_TAIL);
6365 
6366     subq(length, 32);
6367     bind(VECTOR32_LOOP);
6368     vmovdqu(rymm0, Address(obja, result));
6369     vmovdqu(rymm1, Address(objb, result));
6370     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6371     vptest(rymm2, rymm2);
6372     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6373     addq(result, 32);
6374     subq(length, 32);
6375     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6376     addq(length, 32);
6377     jcc(Assembler::equal, SAME_TILL_END);
6378     //falling through if less than 32 bytes left //close the branch here.
6379 
6380     bind(VECTOR16_TAIL);
6381     cmpq(length, 16);
6382     jccb(Assembler::less, VECTOR8_TAIL);
6383     bind(VECTOR16_LOOP);
6384     movdqu(rymm0, Address(obja, result));
6385     movdqu(rymm1, Address(objb, result));
6386     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6387     ptest(rymm2, rymm2);
6388     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6389     addq(result, 16);
6390     subq(length, 16);
6391     jcc(Assembler::equal, SAME_TILL_END);
6392     //falling through if less than 16 bytes left
6393   } else {//regular intrinsics
6394 
6395     cmpq(length, 16);
6396     jccb(Assembler::less, VECTOR8_TAIL);
6397 
6398     subq(length, 16);
6399     bind(VECTOR16_LOOP);
6400     movdqu(rymm0, Address(obja, result));
6401     movdqu(rymm1, Address(objb, result));
6402     pxor(rymm0, rymm1);
6403     ptest(rymm0, rymm0);
6404     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6405     addq(result, 16);
6406     subq(length, 16);
6407     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6408     addq(length, 16);
6409     jcc(Assembler::equal, SAME_TILL_END);
6410     //falling through if less than 16 bytes left
6411   }
6412 
6413   bind(VECTOR8_TAIL);
6414   cmpq(length, 8);
6415   jccb(Assembler::less, VECTOR4_TAIL);
6416   bind(VECTOR8_LOOP);
6417   movq(tmp1, Address(obja, result));
6418   movq(tmp2, Address(objb, result));
6419   xorq(tmp1, tmp2);
6420   testq(tmp1, tmp1);
6421   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6422   addq(result, 8);
6423   subq(length, 8);
6424   jcc(Assembler::equal, SAME_TILL_END);
6425   //falling through if less than 8 bytes left
6426 
6427   bind(VECTOR4_TAIL);
6428   cmpq(length, 4);
6429   jccb(Assembler::less, BYTES_TAIL);
6430   bind(VECTOR4_LOOP);
6431   movl(tmp1, Address(obja, result));
6432   xorl(tmp1, Address(objb, result));
6433   testl(tmp1, tmp1);
6434   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6435   addq(result, 4);
6436   subq(length, 4);
6437   jcc(Assembler::equal, SAME_TILL_END);
6438   //falling through if less than 4 bytes left
6439 
6440   bind(BYTES_TAIL);
6441   bind(BYTES_LOOP);
6442   load_unsigned_byte(tmp1, Address(obja, result));
6443   load_unsigned_byte(tmp2, Address(objb, result));
6444   xorl(tmp1, tmp2);
6445   testl(tmp1, tmp1);
6446   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6447   decq(length);
6448   jcc(Assembler::zero, SAME_TILL_END);
6449   incq(result);
6450   load_unsigned_byte(tmp1, Address(obja, result));
6451   load_unsigned_byte(tmp2, Address(objb, result));
6452   xorl(tmp1, tmp2);
6453   testl(tmp1, tmp1);
6454   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6455   decq(length);
6456   jcc(Assembler::zero, SAME_TILL_END);
6457   incq(result);
6458   load_unsigned_byte(tmp1, Address(obja, result));
6459   load_unsigned_byte(tmp2, Address(objb, result));
6460   xorl(tmp1, tmp2);
6461   testl(tmp1, tmp1);
6462   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6463   jmp(SAME_TILL_END);
6464 
6465   if (UseAVX >= 2) {
6466     bind(VECTOR32_NOT_EQUAL);
6467     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6468     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6469     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6470     vpmovmskb(tmp1, rymm0);
6471     bsfq(tmp1, tmp1);
6472     addq(result, tmp1);
6473     shrq(result);
6474     jmp(DONE);
6475   }
6476 
6477   bind(VECTOR16_NOT_EQUAL);
6478   if (UseAVX >= 2) {
6479     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6480     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6481     pxor(rymm0, rymm2);
6482   } else {
6483     pcmpeqb(rymm2, rymm2);
6484     pxor(rymm0, rymm1);
6485     pcmpeqb(rymm0, rymm1);
6486     pxor(rymm0, rymm2);
6487   }
6488   pmovmskb(tmp1, rymm0);
6489   bsfq(tmp1, tmp1);
6490   addq(result, tmp1);
6491   shrq(result);
6492   jmpb(DONE);
6493 
6494   bind(VECTOR8_NOT_EQUAL);
6495   bind(VECTOR4_NOT_EQUAL);
6496   bsfq(tmp1, tmp1);
6497   shrq(tmp1, 3);
6498   addq(result, tmp1);
6499   bind(BYTES_NOT_EQUAL);
6500   shrq(result);
6501   jmpb(DONE);
6502 
6503   bind(SAME_TILL_END);
6504   mov64(result, -1);
6505 
6506   bind(DONE);
6507 }
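
// A hedged scalar sketch of the result this stub computes (illustrative names only).
// length is an element count and log2_scale the element size as a log2 byte count;
// this stub scans the whole range, so "all equal" comes back as -1, otherwise the
// element index of the first mismatch.
//
//   static long vectorized_mismatch_ref(const uint8_t* a, const uint8_t* b,
//                                       long length, int log2_scale) {
//     long nbytes = length << log2_scale;
//     for (long i = 0; i < nbytes; i++) {
//       if (a[i] != b[i]) return i >> log2_scale;  // element index of first mismatch
//     }
//     return -1;                                   // ranges are identical
//   }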
6508 
6509 //Helper functions for square_to_len()
6510 
6511 /**
6512  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6513  * Preserves x and z and modifies rest of the registers.
6514  */
6515 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6516   // Perform square and right shift by 1
6517   // Handle odd xlen case first, then for even xlen do the following
6518   // jlong carry = 0;
6519   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6520   //     huge_128 product = x[j:j+1] * x[j:j+1];
6521   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6522   //     z[i+2:i+3] = (jlong)(product >>> 1);
6523   //     carry = (jlong)product;
6524   // }
6525 
6526   xorq(tmp5, tmp5);     // carry
6527   xorq(rdxReg, rdxReg);
6528   xorl(tmp1, tmp1);     // index for x
6529   xorl(tmp4, tmp4);     // index for z
6530 
6531   Label L_first_loop, L_first_loop_exit;
6532 
6533   testl(xlen, 1);
6534   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6535 
6536   // Square and right shift by 1 the odd element using 32 bit multiply
6537   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6538   imulq(raxReg, raxReg);
6539   shrq(raxReg, 1);
6540   adcq(tmp5, 0);
6541   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6542   incrementl(tmp1);
6543   addl(tmp4, 2);
6544 
6545   // Square and  right shift by 1 the rest using 64 bit multiply
6546   bind(L_first_loop);
6547   cmpptr(tmp1, xlen);
6548   jccb(Assembler::equal, L_first_loop_exit);
6549 
6550   // Square
6551   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6552   rorq(raxReg, 32);    // convert big-endian to little-endian
6553   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6554 
6555   // Right shift by 1 and save carry
6556   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6557   rcrq(rdxReg, 1);
6558   rcrq(raxReg, 1);
6559   adcq(tmp5, 0);
6560 
6561   // Store result in z
6562   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6563   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6564 
6565   // Update indices for x and z
6566   addl(tmp1, 2);
6567   addl(tmp4, 4);
6568   jmp(L_first_loop);
6569 
6570   bind(L_first_loop_exit);
6571 }
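
// A hedged reference of the even-length part of the loop above. x64[j] denotes the
// j-th pair of ints of x (most significant pair first) combined into one 64-bit
// value by the rorq word swap, and z64[i] addresses z the same way:
//
//   uint64_t carry = 0;                                  // bit shifted out of the previous square
//   for (int j = 0, i = 0; j < xlen / 2; j++, i += 2) {
//     unsigned __int128 p = (unsigned __int128)x64[j] * x64[j];
//     z64[i]     = (carry << 63) | (uint64_t)(p >> 65);  // upper half of p, shifted right by 1
//     z64[i + 1] = (uint64_t)(p >> 1);                   // lower half of p, shifted right by 1
//     carry = (uint64_t)p & 1;                           // low bit carried into the next slot
//   }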
6572 
6573 
6574 /**
6575  * Perform the following multiply add operation using BMI2 instructions
6576  * carry:sum = sum + op1*op2 + carry
6577  * op2 should be in rdx
6578  * op2 is preserved, all other registers are modified
6579  */
6580 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6581   // assert op2 is rdx
6582   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6583   addq(sum, carry);
6584   adcq(tmp2, 0);
6585   addq(sum, op1);
6586   adcq(tmp2, 0);
6587   movq(carry, tmp2);
6588 }
6589 
6590 /**
6591  * Perform the following multiply add operation:
6592  * carry:sum = sum + op1*op2 + carry
6593  * Preserves op1, op2 and modifies rest of registers
6594  */
6595 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6596   // rdx:rax = op1 * op2
6597   movq(raxReg, op2);
6598   mulq(op1);
6599 
6600   //  rdx:rax = sum + carry + rdx:rax
6601   addq(sum, carry);
6602   adcq(rdxReg, 0);
6603   addq(sum, raxReg);
6604   adcq(rdxReg, 0);
6605 
6606   // carry:sum = rdx:sum
6607   movq(carry, rdxReg);
6608 }
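
// Both multiply-add helpers above compute the same word operation; a hedged
// unsigned __int128 reference with illustrative names:
//
//   static inline uint64_t multiply_add_64_ref(uint64_t* sum, uint64_t op1,
//                                              uint64_t op2, uint64_t carry) {
//     unsigned __int128 t = (unsigned __int128)op1 * op2 + *sum + carry;
//     *sum = (uint64_t)t;           // low 64 bits back into sum
//     return (uint64_t)(t >> 64);   // high 64 bits are the new carry
//   }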
6609 
6610 /**
6611  * Add 64 bit long carry into z[] with carry propagation.
6612  * Preserves z and carry register values and modifies rest of registers.
6613  *
6614  */
6615 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6616   Label L_fourth_loop, L_fourth_loop_exit;
6617 
6618   movl(tmp1, 1);
6619   subl(zlen, 2);
6620   addq(Address(z, zlen, Address::times_4, 0), carry);
6621 
6622   bind(L_fourth_loop);
6623   jccb(Assembler::carryClear, L_fourth_loop_exit);
6624   subl(zlen, 2);
6625   jccb(Assembler::negative, L_fourth_loop_exit);
6626   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6627   jmp(L_fourth_loop);
6628   bind(L_fourth_loop_exit);
6629 }
6630 
6631 /**
6632  * Shift z[] left by 1 bit.
6633  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6634  *
6635  */
6636 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6637 
6638   Label L_fifth_loop, L_fifth_loop_exit;
6639 
6640   // Fifth loop
6641   // Perform primitiveLeftShift(z, zlen, 1)
6642 
6643   const Register prev_carry = tmp1;
6644   const Register new_carry = tmp4;
6645   const Register value = tmp2;
6646   const Register zidx = tmp3;
6647 
6648   // int zidx, carry;
6649   // long value;
6650   // carry = 0;
6651   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6652   //    (carry:value) = (z[zidx] << 1) | carry;
6653   //    z[zidx] = value;
6654   // }
6655 
6656   movl(zidx, zlen);
6657   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6658 
6659   bind(L_fifth_loop);
6660   decl(zidx);  // Use decl to preserve carry flag
6661   decl(zidx);
6662   jccb(Assembler::negative, L_fifth_loop_exit);
6663 
6664   if (UseBMI2Instructions) {
6665      movq(value, Address(z, zidx, Address::times_4, 0));
6666      rclq(value, 1);
6667      rorxq(value, value, 32);
6668      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6669   }
6670   else {
6671     // clear new_carry
6672     xorl(new_carry, new_carry);
6673 
6674     // Shift z[i] by 1, or in previous carry and save new carry
6675     movq(value, Address(z, zidx, Address::times_4, 0));
6676     shlq(value, 1);
6677     adcl(new_carry, 0);
6678 
6679     orq(value, prev_carry);
6680     rorq(value, 0x20);
6681     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6682 
6683     // Set previous carry = new carry
6684     movl(prev_carry, new_carry);
6685   }
6686   jmp(L_fifth_loop);
6687 
6688   bind(L_fifth_loop_exit);
6689 }
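
// A hedged reference of the shift above, with z64 standing for the 64-bit view of z
// with the least significant pair of ints at index 0 (the rorq/rclq word swaps in the
// code handle the big-endian int layout):
//
//   uint64_t carry = 0;
//   for (int i = 0; i < zlen / 2; i++) {       // least significant 64-bit word first
//     uint64_t v = z64[i];
//     z64[i] = (v << 1) | carry;               // shift in the bit carried from below
//     carry  = v >> 63;                        // bit shifted out feeds the next word
//   }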
6690 
6691 
6692 /**
6693  * Code for BigInteger::squareToLen() intrinsic
6694  *
6695  * rdi: x
6696  * rsi: len
6697  * r8:  z
6698  * rcx: zlen
6699  * r12: tmp1
6700  * r13: tmp2
6701  * r14: tmp3
6702  * r15: tmp4
6703  * rbx: tmp5
6704  *
6705  */
6706 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6707 
6708   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6709   push(tmp1);
6710   push(tmp2);
6711   push(tmp3);
6712   push(tmp4);
6713   push(tmp5);
6714 
6715   // First loop
6716   // Store the squares, right shifted one bit (i.e., divided by 2).
6717   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6718 
6719   // Add in off-diagonal sums.
6720   //
6721   // Second, third (nested) and fourth loops.
6722   // zlen +=2;
6723   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6724   //    carry = 0;
6725   //    long op2 = x[xidx:xidx+1];
6726   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6727   //       k -= 2;
6728   //       long op1 = x[j:j+1];
6729   //       long sum = z[k:k+1];
6730   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6731   //       z[k:k+1] = sum;
6732   //    }
6733   //    add_one_64(z, k, carry, tmp_regs);
6734   // }
6735 
6736   const Register carry = tmp5;
6737   const Register sum = tmp3;
6738   const Register op1 = tmp4;
6739   Register op2 = tmp2;
6740 
6741   push(zlen);
6742   push(len);
6743   addl(zlen,2);
6744   bind(L_second_loop);
6745   xorq(carry, carry);
6746   subl(zlen, 4);
6747   subl(len, 2);
6748   push(zlen);
6749   push(len);
6750   cmpl(len, 0);
6751   jccb(Assembler::lessEqual, L_second_loop_exit);
6752 
6753   // Multiply an array by one 64 bit long.
6754   if (UseBMI2Instructions) {
6755     op2 = rdxReg;
6756     movq(op2, Address(x, len, Address::times_4,  0));
6757     rorxq(op2, op2, 32);
6758   }
6759   else {
6760     movq(op2, Address(x, len, Address::times_4,  0));
6761     rorq(op2, 32);
6762   }
6763 
6764   bind(L_third_loop);
6765   decrementl(len);
6766   jccb(Assembler::negative, L_third_loop_exit);
6767   decrementl(len);
6768   jccb(Assembler::negative, L_last_x);
6769 
6770   movq(op1, Address(x, len, Address::times_4,  0));
6771   rorq(op1, 32);
6772 
6773   bind(L_multiply);
6774   subl(zlen, 2);
6775   movq(sum, Address(z, zlen, Address::times_4,  0));
6776 
6777   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
6778   if (UseBMI2Instructions) {
6779     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6780   }
6781   else {
6782     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6783   }
6784 
6785   movq(Address(z, zlen, Address::times_4, 0), sum);
6786 
6787   jmp(L_third_loop);
6788   bind(L_third_loop_exit);
6789 
6790   // Fourth loop
6791   // Add 64 bit long carry into z with carry propagation.
6792   // Uses the offset-adjusted zlen.
6793   add_one_64(z, zlen, carry, tmp1);
6794 
6795   pop(len);
6796   pop(zlen);
6797   jmp(L_second_loop);
6798 
6799   // Next infrequent code is moved outside loops.
6800   bind(L_last_x);
6801   movl(op1, Address(x, 0));
6802   jmp(L_multiply);
6803 
6804   bind(L_second_loop_exit);
6805   pop(len);
6806   pop(zlen);
6807   pop(len);
6808   pop(zlen);
6809 
6810   // Fifth loop
6811   // Shift z left 1 bit.
6812   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6813 
6814   // z[zlen-1] |= x[len-1] & 1;
6815   movl(tmp3, Address(x, len, Address::times_4, -4));
6816   andl(tmp3, 1);
6817   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6818 
6819   pop(tmp5);
6820   pop(tmp4);
6821   pop(tmp3);
6822   pop(tmp2);
6823   pop(tmp1);
6824 }
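
// A hedged sketch of why the five loops above produce x*x. With B = 2^32 and
// x = sum_i x_i * B^i,
//
//   x^2 = sum_i x_i^2 * B^(2i)  +  2 * sum_{i>j} x_i * x_j * B^(i+j)
//
// Writing S for the first sum: the first loop stores floor(S / 2) into z (the squares,
// right shifted one bit as a whole), the second/third/fourth loops add the off-diagonal
// products x_i * x_j once each, the fifth loop doubles z with the 1-bit left shift, and
// the final "z[zlen-1] |= x[len-1] & 1" restores the low bit of S that the halving
// dropped, giving S + 2 * sum_{i>j} x_i * x_j * B^(i+j) = x^2.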
6825 
6826 /**
6827  * Helper function for mul_add()
6828  * Multiply the in[] by int k and add to out[] starting at offset offs using
6829  * 128 bit by 32 bit multiply and return the carry in tmp5.
6830  * Only a quad-int-aligned number of elements of in[] (a multiple of four ints) is
6831  * processed by this function. k is in rdxReg when BMI2 instructions are used;
6832  * otherwise it is in tmp2. This function preserves the out, in and k registers.
6833  * len and offset point to the appropriate indices into "in" and "out" respectively.
6834  * tmp5 holds the carry.
6835  * The other registers are temporary and are modified.
6836  *
6837  */
6838 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6839   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6840   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6841 
6842   Label L_first_loop, L_first_loop_exit;
6843 
6844   movl(tmp1, len);
6845   shrl(tmp1, 2);
6846 
6847   bind(L_first_loop);
6848   subl(tmp1, 1);
6849   jccb(Assembler::negative, L_first_loop_exit);
6850 
6851   subl(len, 4);
6852   subl(offset, 4);
6853 
6854   Register op2 = tmp2;
6855   const Register sum = tmp3;
6856   const Register op1 = tmp4;
6857   const Register carry = tmp5;
6858 
6859   if (UseBMI2Instructions) {
6860     op2 = rdxReg;
6861   }
6862 
6863   movq(op1, Address(in, len, Address::times_4,  8));
6864   rorq(op1, 32);
6865   movq(sum, Address(out, offset, Address::times_4,  8));
6866   rorq(sum, 32);
6867   if (UseBMI2Instructions) {
6868     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6869   }
6870   else {
6871     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6872   }
6873   // Store back in big endian from little endian
6874   rorq(sum, 0x20);
6875   movq(Address(out, offset, Address::times_4,  8), sum);
6876 
6877   movq(op1, Address(in, len, Address::times_4,  0));
6878   rorq(op1, 32);
6879   movq(sum, Address(out, offset, Address::times_4,  0));
6880   rorq(sum, 32);
6881   if (UseBMI2Instructions) {
6882     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6883   }
6884   else {
6885     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6886   }
6887   // Store back in big endian from little endian
6888   rorq(sum, 0x20);
6889   movq(Address(out, offset, Address::times_4,  0), sum);
6890 
6891   jmp(L_first_loop);
6892   bind(L_first_loop_exit);
6893 }
6894 
6895 /**
6896  * Code for BigInteger::mulAdd() intrinsic
6897  *
6898  * rdi: out
6899  * rsi: in
6900  * r11: offs (out.length - offset)
6901  * rcx: len
6902  * r8:  k
6903  * r12: tmp1
6904  * r13: tmp2
6905  * r14: tmp3
6906  * r15: tmp4
6907  * rbx: tmp5
6908  * Multiply the in[] by word k and add to out[], return the carry in rax
6909  */
6910 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6911    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6912    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6913 
6914   Label L_carry, L_last_in, L_done;
6915 
6916 // carry = 0;
6917 // for (int j=len-1; j >= 0; j--) {
6918 //    long product = (in[j] & LONG_MASK) * kLong +
6919 //                   (out[offs] & LONG_MASK) + carry;
6920 //    out[offs--] = (int)product;
6921 //    carry = product >>> 32;
6922 // }
6923 //
6924   push(tmp1);
6925   push(tmp2);
6926   push(tmp3);
6927   push(tmp4);
6928   push(tmp5);
6929 
6930   Register op2 = tmp2;
6931   const Register sum = tmp3;
6932   const Register op1 = tmp4;
6933   const Register carry =  tmp5;
6934 
6935   if (UseBMI2Instructions) {
6936     op2 = rdxReg;
6937   }
6938   movl(op2, k);
6942 
6943   xorq(carry, carry);
6944 
6945   //First loop
6946 
6947   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6948   //The carry is in tmp5
6949   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6950 
6951   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6952   decrementl(len);
6953   jccb(Assembler::negative, L_carry);
6954   decrementl(len);
6955   jccb(Assembler::negative, L_last_in);
6956 
6957   movq(op1, Address(in, len, Address::times_4,  0));
6958   rorq(op1, 32);
6959 
6960   subl(offs, 2);
6961   movq(sum, Address(out, offs, Address::times_4,  0));
6962   rorq(sum, 32);
6963 
6964   if (UseBMI2Instructions) {
6965     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6966   }
6967   else {
6968     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6969   }
6970 
6971   // Store back in big endian from little endian
6972   rorq(sum, 0x20);
6973   movq(Address(out, offs, Address::times_4,  0), sum);
6974 
6975   testl(len, len);
6976   jccb(Assembler::zero, L_carry);
6977 
6978   //Multiply the last in[] entry, if any
6979   bind(L_last_in);
6980   movl(op1, Address(in, 0));
6981   movl(sum, Address(out, offs, Address::times_4,  -4));
6982 
6983   movl(raxReg, k);
6984   mull(op1); //tmp4 * eax -> edx:eax
6985   addl(sum, carry);
6986   adcl(rdxReg, 0);
6987   addl(sum, raxReg);
6988   adcl(rdxReg, 0);
6989   movl(carry, rdxReg);
6990 
6991   movl(Address(out, offs, Address::times_4,  -4), sum);
6992 
6993   bind(L_carry);
6994   //return tmp5/carry as carry in rax
6995   movl(rax, carry);
6996 
6997   bind(L_done);
6998   pop(tmp5);
6999   pop(tmp4);
7000   pop(tmp3);
7001   pop(tmp2);
7002   pop(tmp1);
7003 }
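
// A hedged scalar reference of the whole mulAdd operation, mirroring the pseudocode
// above (illustrative names; offs is the index of the least significant out[] word
// that is updated, and it decrements toward the more significant words):
//
//   static uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
//                               int offs, int len, uint32_t k) {
//     uint64_t carry = 0;
//     for (int j = len - 1; j >= 0; j--) {
//       uint64_t product = (uint64_t)in[j] * k + out[offs] + carry;
//       out[offs--] = (uint32_t)product;
//       carry = product >> 32;
//     }
//     return (uint32_t)carry;       // the stub returns this carry in rax
//   }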
7004 #endif
7005 
7006 /**
7007  * Emits code to update CRC-32 with a byte value according to constants in table
7008  *
7009  * @param [in,out]crc   Register containing the crc.
7010  * @param [in]val       Register containing the byte to fold into the CRC.
7011  * @param [in]table     Register containing the table of crc constants.
7012  *
7013  * uint32_t crc;
7014  * val = crc_table[(val ^ crc) & 0xFF];
7015  * crc = val ^ (crc >> 8);
7016  *
7017  */
7018 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7019   xorl(val, crc);
7020   andl(val, 0xFF);
7021   shrl(crc, 8); // unsigned shift
7022   xorl(crc, Address(table, val, Address::times_4, 0));
7023 }
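
// A hedged C reference of applying the single-byte update above across a buffer
// (the table-driven form that the byte loops in kernel_crc32 below rely on):
//
//   static uint32_t crc32_update_bytes_ref(uint32_t crc, const uint8_t* buf,
//                                          size_t len, const uint32_t* table) {
//     for (size_t i = 0; i < len; i++) {
//       crc = table[(buf[i] ^ crc) & 0xFF] ^ (crc >> 8);
//     }
//     return crc;
//   }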
7024 
7025 /**
7026  * Fold 128-bit data chunk
7027  */
7028 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7029   if (UseAVX > 0) {
7030     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7031     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7032     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7033     pxor(xcrc, xtmp);
7034   } else {
7035     movdqa(xtmp, xcrc);
7036     pclmulhdq(xtmp, xK);   // [123:64]
7037     pclmulldq(xcrc, xK);   // [63:0]
7038     pxor(xcrc, xtmp);
7039     movdqu(xtmp, Address(buf, offset));
7040     pxor(xcrc, xtmp);
7041   }
7042 }
7043 
7044 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7045   if (UseAVX > 0) {
7046     vpclmulhdq(xtmp, xK, xcrc);
7047     vpclmulldq(xcrc, xK, xcrc);
7048     pxor(xcrc, xbuf);
7049     pxor(xcrc, xtmp);
7050   } else {
7051     movdqa(xtmp, xcrc);
7052     pclmulhdq(xtmp, xK);
7053     pclmulldq(xcrc, xK);
7054     pxor(xcrc, xbuf);
7055     pxor(xcrc, xtmp);
7056   }
7057 }
7058 
7059 /**
7060  * 8-bit folds to compute 32-bit CRC
7061  *
7062  * uint64_t xcrc;
7063  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7064  */
7065 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7066   movdl(tmp, xcrc);
7067   andl(tmp, 0xFF);
7068   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7069   psrldq(xcrc, 1); // unsigned shift one byte
7070   pxor(xcrc, xtmp);
7071 }
7072 
7073 /**
7074  * uint32_t crc;
7075  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7076  */
7077 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7078   movl(tmp, crc);
7079   andl(tmp, 0xFF);
7080   shrl(crc, 8);
7081   xorl(crc, Address(table, tmp, Address::times_4, 0));
7082 }
7083 
7084 /**
7085  * @param crc   register containing existing CRC (32-bit)
7086  * @param buf   register pointing to input byte buffer (byte*)
7087  * @param len   register containing number of bytes
7088  * @param table register that will contain address of CRC table
7089  * @param tmp   scratch register
7090  */
7091 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7092   assert_different_registers(crc, buf, len, table, tmp, rax);
7093 
7094   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7095   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7096 
7097   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7098   // context for the registers used, where all instructions below are using 128-bit mode
7099   // On EVEX without VL and BW, these instructions will all be AVX.
7100   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7101   notl(crc); // ~crc
7102   cmpl(len, 16);
7103   jcc(Assembler::less, L_tail);
7104 
7105   // Align buffer to 16 bytes
7106   movl(tmp, buf);
7107   andl(tmp, 0xF);
7108   jccb(Assembler::zero, L_aligned);
7109   subl(tmp,  16);
7110   addl(len, tmp);
7111 
7112   align(4);
7113   BIND(L_align_loop);
7114   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7115   update_byte_crc32(crc, rax, table);
7116   increment(buf);
7117   incrementl(tmp);
7118   jccb(Assembler::less, L_align_loop);
7119 
7120   BIND(L_aligned);
7121   movl(tmp, len); // save
7122   shrl(len, 4);
7123   jcc(Assembler::zero, L_tail_restore);
7124 
7125   // Fold crc into first bytes of vector
7126   movdqa(xmm1, Address(buf, 0));
7127   movdl(rax, xmm1);
7128   xorl(crc, rax);
7129   if (VM_Version::supports_sse4_1()) {
7130     pinsrd(xmm1, crc, 0);
7131   } else {
7132     pinsrw(xmm1, crc, 0);
7133     shrl(crc, 16);
7134     pinsrw(xmm1, crc, 1);
7135   }
7136   addptr(buf, 16);
7137   subl(len, 4); // len > 0
7138   jcc(Assembler::less, L_fold_tail);
7139 
7140   movdqa(xmm2, Address(buf,  0));
7141   movdqa(xmm3, Address(buf, 16));
7142   movdqa(xmm4, Address(buf, 32));
7143   addptr(buf, 48);
7144   subl(len, 3);
7145   jcc(Assembler::lessEqual, L_fold_512b);
7146 
7147   // Fold total 512 bits of polynomial on each iteration,
7148   // 128 bits per each of 4 parallel streams.
7149   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7150 
7151   align32();
7152   BIND(L_fold_512b_loop);
7153   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7154   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7155   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7156   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7157   addptr(buf, 64);
7158   subl(len, 4);
7159   jcc(Assembler::greater, L_fold_512b_loop);
7160 
7161   // Fold 512 bits to 128 bits.
7162   BIND(L_fold_512b);
7163   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7164   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7165   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7166   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7167 
7168   // Fold the rest of 128 bits data chunks
7169   BIND(L_fold_tail);
7170   addl(len, 3);
7171   jccb(Assembler::lessEqual, L_fold_128b);
7172   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7173 
7174   BIND(L_fold_tail_loop);
7175   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7176   addptr(buf, 16);
7177   decrementl(len);
7178   jccb(Assembler::greater, L_fold_tail_loop);
7179 
7180   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7181   BIND(L_fold_128b);
7182   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7183   if (UseAVX > 0) {
7184     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7185     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7186     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7187   } else {
7188     movdqa(xmm2, xmm0);
7189     pclmulqdq(xmm2, xmm1, 0x1);
7190     movdqa(xmm3, xmm0);
7191     pand(xmm3, xmm2);
7192     pclmulqdq(xmm0, xmm3, 0x1);
7193   }
7194   psrldq(xmm1, 8);
7195   psrldq(xmm2, 4);
7196   pxor(xmm0, xmm1);
7197   pxor(xmm0, xmm2);
7198 
7199   // 8 8-bit folds to compute 32-bit CRC.
7200   for (int j = 0; j < 4; j++) {
7201     fold_8bit_crc32(xmm0, table, xmm1, rax);
7202   }
7203   movdl(crc, xmm0); // mov 32 bits to general register
7204   for (int j = 0; j < 4; j++) {
7205     fold_8bit_crc32(crc, table, rax);
7206   }
7207 
7208   BIND(L_tail_restore);
7209   movl(len, tmp); // restore
7210   BIND(L_tail);
7211   andl(len, 0xf);
7212   jccb(Assembler::zero, L_exit);
7213 
7214   // Fold the rest of bytes
7215   align(4);
7216   BIND(L_tail_loop);
7217   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7218   update_byte_crc32(crc, rax, table);
7219   increment(buf);
7220   decrementl(len);
7221   jccb(Assembler::greater, L_tail_loop);
7222 
7223   BIND(L_exit);
7224   notl(crc); // ~crc
7225 }
7226 
7227 #ifdef _LP64
7228 // Helper function for AVX 512 CRC32
7229 // Fold 512-bit data chunks
7230 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7231                                              Register pos, int offset) {
7232   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7233   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7234   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7235   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7236   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7237 }
7238 
7239 // Helper function for AVX 512 CRC32
7240 // Compute CRC32 for < 256B buffers
7241 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7242                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7243                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7244 
7245   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7246   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7247   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7248 
7249   // check if there is enough buffer to be able to fold 16B at a time
7250   cmpl(len, 32);
7251   jcc(Assembler::less, L_less_than_32);
7252 
7253   // if there is, load the constants
7254   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7255   movdl(xmm0, crc);                        // get the initial crc value
7256   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7257   pxor(xmm7, xmm0);
7258 
7259   // update the buffer pointer
7260   addl(pos, 16);
7261   // update the counter; subtract 32 instead of 16 to save one instruction from the loop
7262   subl(len, 32);
7263   jmp(L_16B_reduction_loop);
7264 
7265   bind(L_less_than_32);
7266   // move the initial crc to the return value; this is necessary for zero-length buffers.
7267   movl(rax, crc);
7268   testl(len, len);
7269   jcc(Assembler::equal, L_cleanup);
7270 
7271   movdl(xmm0, crc);                        //get the initial crc value
7272 
7273   cmpl(len, 16);
7274   jcc(Assembler::equal, L_exact_16_left);
7275   jcc(Assembler::less, L_less_than_16_left);
7276 
7277   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7278   pxor(xmm7, xmm0);                       //xor the initial crc value
7279   addl(pos, 16);
7280   subl(len, 16);
7281   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7282   jmp(L_get_last_two_xmms);
7283 
7284   bind(L_less_than_16_left);
7285   // use stack space to load data less than 16 bytes; zero out the 16B in memory first.
7286   pxor(xmm1, xmm1);
7287   movptr(tmp1, rsp);
7288   movdqu(Address(tmp1, 0 * 16), xmm1);
7289 
7290   cmpl(len, 4);
7291   jcc(Assembler::less, L_only_less_than_4);
7292 
7293   //backup the counter value
7294   movl(tmp2, len);
7295   cmpl(len, 8);
7296   jcc(Assembler::less, L_less_than_8_left);
7297 
7298   //load 8 Bytes
7299   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7300   movq(Address(tmp1, 0 * 16), rax);
7301   addptr(tmp1, 8);
7302   subl(len, 8);
7303   addl(pos, 8);
7304 
7305   bind(L_less_than_8_left);
7306   cmpl(len, 4);
7307   jcc(Assembler::less, L_less_than_4_left);
7308 
7309   //load 4 Bytes
7310   movl(rax, Address(buf, pos, Address::times_1, 0));
7311   movl(Address(tmp1, 0 * 16), rax);
7312   addptr(tmp1, 4);
7313   subl(len, 4);
7314   addl(pos, 4);
7315 
7316   bind(L_less_than_4_left);
7317   cmpl(len, 2);
7318   jcc(Assembler::less, L_less_than_2_left);
7319 
7320   // load 2 Bytes
7321   movw(rax, Address(buf, pos, Address::times_1, 0));
7322   movl(Address(tmp1, 0 * 16), rax);
7323   addptr(tmp1, 2);
7324   subl(len, 2);
7325   addl(pos, 2);
7326 
7327   bind(L_less_than_2_left);
7328   cmpl(len, 1);
7329   jcc(Assembler::less, L_zero_left);
7330 
7331   // load 1 Byte
7332   movb(rax, Address(buf, pos, Address::times_1, 0));
7333   movb(Address(tmp1, 0 * 16), rax);
7334 
7335   bind(L_zero_left);
7336   movdqu(xmm7, Address(rsp, 0));
7337   pxor(xmm7, xmm0);                       //xor the initial crc value
7338 
7339   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7340   movdqu(xmm0, Address(rax, tmp2));
7341   pshufb(xmm7, xmm0);
7342   jmp(L_128_done);
7343 
7344   bind(L_exact_16_left);
7345   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7346   pxor(xmm7, xmm0);                       //xor the initial crc value
7347   jmp(L_128_done);
7348 
7349   bind(L_only_less_than_4);
7350   cmpl(len, 3);
7351   jcc(Assembler::less, L_only_less_than_3);
7352 
7353   // load 3 Bytes
7354   movb(rax, Address(buf, pos, Address::times_1, 0));
7355   movb(Address(tmp1, 0), rax);
7356 
7357   movb(rax, Address(buf, pos, Address::times_1, 1));
7358   movb(Address(tmp1, 1), rax);
7359 
7360   movb(rax, Address(buf, pos, Address::times_1, 2));
7361   movb(Address(tmp1, 2), rax);
7362 
7363   movdqu(xmm7, Address(rsp, 0));
7364   pxor(xmm7, xmm0);                     //xor the initial crc value
7365 
7366   pslldq(xmm7, 0x5);
7367   jmp(L_barrett);
7368   bind(L_only_less_than_3);
7369   cmpl(len, 2);
7370   jcc(Assembler::less, L_only_less_than_2);
7371 
7372   // load 2 Bytes
7373   movb(rax, Address(buf, pos, Address::times_1, 0));
7374   movb(Address(tmp1, 0), rax);
7375 
7376   movb(rax, Address(buf, pos, Address::times_1, 1));
7377   movb(Address(tmp1, 1), rax);
7378 
7379   movdqu(xmm7, Address(rsp, 0));
7380   pxor(xmm7, xmm0);                     //xor the initial crc value
7381 
7382   pslldq(xmm7, 0x6);
7383   jmp(L_barrett);
7384 
7385   bind(L_only_less_than_2);
7386   //load 1 Byte
7387   movb(rax, Address(buf, pos, Address::times_1, 0));
7388   movb(Address(tmp1, 0), rax);
7389 
7390   movdqu(xmm7, Address(rsp, 0));
7391   pxor(xmm7, xmm0);                     //xor the initial crc value
7392 
7393   pslldq(xmm7, 0x7);
7394 }
7395 
7396 /**
7397  * Compute CRC32 using AVX512 instructions
7398  * @param crc   register containing existing CRC (32-bit)
7399  * @param buf   register pointing to input byte buffer (byte*)
7400  * @param len   register containing number of bytes
7401  * @param table address of crc or crc32c table
7402  * @param tmp1  scratch register
7403  * @param tmp2  scratch register
7404  * @return rax  result register
7405  *
7406  * This routine is identical for crc32c with the exception of the precomputed constant
7407  * table which will be passed as the table argument.  The calculation steps are
7408  * the same for both variants.
7409  */
7410 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7411   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7412 
7413   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7414   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7415   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7416   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7417   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7418 
7419   const Register pos = r12;
7420   push(r12);
7421   subptr(rsp, 16 * 2 + 8);
7422 
7423   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7424   // context for the registers used, where all instructions below are using 128-bit mode
7425   // On EVEX without VL and BW, these instructions will all be AVX.
7426   movl(pos, 0);
7427 
7428   // check if smaller than 256B
7429   cmpl(len, 256);
7430   jcc(Assembler::less, L_less_than_256);
7431 
7432   // load the initial crc value
7433   movdl(xmm10, crc);
7434 
7435   // receive the initial 64B data, xor the initial crc value
7436   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7437   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7438   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7439   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7440 
7441   subl(len, 256);
7442   cmpl(len, 256);
7443   jcc(Assembler::less, L_fold_128_B_loop);
7444 
7445   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7446   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7447   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7448   subl(len, 256);
7449 
7450   bind(L_fold_256_B_loop);
7451   addl(pos, 256);
7452   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7453   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7454   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7455   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7456 
7457   subl(len, 256);
7458   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7459 
7460   // Fold 256 into 128
7461   addl(pos, 256);
7462   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7463   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7464   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7465 
7466   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7467   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7468   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7469 
7470   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7471   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7472 
7473   addl(len, 128);
7474   jmp(L_fold_128_B_register);
7475 
7476   // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
7477   // will fold 128B at a time until we have 128 + y bytes of buffer left.
7478 
7479   // fold 128B at a time. This section of the code folds 8 xmm registers in parallel
7480   bind(L_fold_128_B_loop);
7481   addl(pos, 128);
7482   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7483   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7484 
7485   subl(len, 128);
7486   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7487 
7488   addl(pos, 128);
7489 
7490   // At this point the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128.
7491   // The 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7.
7492   bind(L_fold_128_B_register);
7493   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7494   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7495   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7496   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7497   // save last that has no multiplicand
7498   vextracti64x2(xmm7, xmm4, 3);
7499 
7500   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7501   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7502   // Needed later in reduction loop
7503   movdqu(xmm10, Address(table, 1 * 16));
7504   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7505   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7506 
7507   // Swap 1,0,3,2 - 01 00 11 10
7508   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7509   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7510   vextracti128(xmm5, xmm8, 1);
7511   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7512 
7513   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7514   // instead of a cmp instruction, we use the negative flag with the jl instruction
7515   addl(len, 128 - 16);
7516   jcc(Assembler::less, L_final_reduction_for_128);
7517 
7518   bind(L_16B_reduction_loop);
7519   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7520   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7521   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7522   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7523   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7524   addl(pos, 16);
7525   subl(len, 16);
7526   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7527 
7528   bind(L_final_reduction_for_128);
7529   addl(len, 16);
7530   jcc(Assembler::equal, L_128_done);
7531 
7532   bind(L_get_last_two_xmms);
7533   movdqu(xmm2, xmm7);
7534   addl(pos, len);
7535   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7536   subl(pos, len);
7537 
7538   // get rid of the extra data that was loaded before
7539   // load the shift constant
7540   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7541   movdqu(xmm0, Address(rax, len));
7542   addl(rax, len);
7543 
7544   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7545   //Change mask to 512
7546   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7547   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7548 
7549   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7550   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7551   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7552   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7553   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7554 
7555   bind(L_128_done);
7556   // compute crc of a 128-bit value
7557   movdqu(xmm10, Address(table, 3 * 16));
7558   movdqu(xmm0, xmm7);
7559 
7560   // 64b fold
7561   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7562   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7563   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7564 
7565   // 32b fold
7566   movdqu(xmm0, xmm7);
7567   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7568   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7569   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7570   jmp(L_barrett);
7571 
7572   bind(L_less_than_256);
7573   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7574 
7575   //barrett reduction
7576   bind(L_barrett);
7577   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7578   movdqu(xmm1, xmm7);
7579   movdqu(xmm2, xmm7);
7580   movdqu(xmm10, Address(table, 4 * 16));
7581 
7582   pclmulqdq(xmm7, xmm10, 0x0);
7583   pxor(xmm7, xmm2);
7584   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7585   movdqu(xmm2, xmm7);
7586   pclmulqdq(xmm7, xmm10, 0x10);
7587   pxor(xmm7, xmm2);
7588   pxor(xmm7, xmm1);
7589   pextrd(crc, xmm7, 2);
7590 
7591   bind(L_cleanup);
7592   addptr(rsp, 16 * 2 + 8);
7593   pop(r12);
7594 }
7595 
7596 // S. Gueron / Information Processing Letters 112 (2012) 184
7597 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7598 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7599 // Output: the 64-bit carry-less product of B * CONST
7600 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7601                                      Register tmp1, Register tmp2, Register tmp3) {
7602   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7603   if (n > 0) {
7604     addq(tmp3, n * 256 * 8);
7605   }
7606   //    Q1 = TABLEExt[n][B & 0xFF];
7607   movl(tmp1, in);
7608   andl(tmp1, 0x000000FF);
7609   shll(tmp1, 3);
7610   addq(tmp1, tmp3);
7611   movq(tmp1, Address(tmp1, 0));
7612 
7613   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7614   movl(tmp2, in);
7615   shrl(tmp2, 8);
7616   andl(tmp2, 0x000000FF);
7617   shll(tmp2, 3);
7618   addq(tmp2, tmp3);
7619   movq(tmp2, Address(tmp2, 0));
7620 
7621   shlq(tmp2, 8);
7622   xorq(tmp1, tmp2);
7623 
7624   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7625   movl(tmp2, in);
7626   shrl(tmp2, 16);
7627   andl(tmp2, 0x000000FF);
7628   shll(tmp2, 3);
7629   addq(tmp2, tmp3);
7630   movq(tmp2, Address(tmp2, 0));
7631 
7632   shlq(tmp2, 16);
7633   xorq(tmp1, tmp2);
7634 
7635   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7636   shrl(in, 24);
7637   andl(in, 0x000000FF);
7638   shll(in, 3);
7639   addq(in, tmp3);
7640   movq(in, Address(in, 0));
7641 
7642   shlq(in, 24);
7643   xorq(in, tmp1);
7644   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7645 }
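
// A hedged C reference of the table lookup above (Gueron, Algorithm 4). The table is
// laid out as n consecutive blocks of 256 eight-byte entries, exactly as the
// "addq(tmp3, n * 256 * 8)" indexing assumes:
//
//   static uint64_t crc32c_clmul_by_table_ref(uint32_t B, uint32_t n,
//                                             const uint64_t* table) {
//     const uint64_t* t = table + (size_t)n * 256;   // TABLEExt[n]
//     return  t[ B        & 0xFF]
//           ^ (t[(B >>  8) & 0xFF] <<  8)
//           ^ (t[(B >> 16) & 0xFF] << 16)
//           ^ (t[(B >> 24) & 0xFF] << 24);           // Q1 ^ Q2<<8 ^ Q3<<16 ^ Q4<<24
//   }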
7646 
7647 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7648                                       Register in_out,
7649                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7650                                       XMMRegister w_xtmp2,
7651                                       Register tmp1,
7652                                       Register n_tmp2, Register n_tmp3) {
7653   if (is_pclmulqdq_supported) {
7654     movdl(w_xtmp1, in_out); // modified blindly
7655 
7656     movl(tmp1, const_or_pre_comp_const_index);
7657     movdl(w_xtmp2, tmp1);
7658     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7659 
7660     movdq(in_out, w_xtmp1);
7661   } else {
7662     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7663   }
7664 }
7665 
7666 // Recombination Alternative 2: No bit-reflections
7667 // T1 = (CRC_A * U1) << 1
7668 // T2 = (CRC_B * U2) << 1
7669 // C1 = T1 >> 32
7670 // C2 = T2 >> 32
7671 // T1 = T1 & 0xFFFFFFFF
7672 // T2 = T2 & 0xFFFFFFFF
7673 // T1 = CRC32(0, T1)
7674 // T2 = CRC32(0, T2)
7675 // C1 = C1 ^ T1
7676 // C2 = C2 ^ T2
7677 // CRC = C1 ^ C2 ^ CRC_C
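     // A hedged C-style sketch of the same recombination; clmul() stands in for the
     // carry-less multiply by the precomputed constant (illustrative only):
     //
     //   uint64_t t1 = clmul(crc_a, u1) << 1;
     //   uint64_t t2 = clmul(crc_b, u2) << 1;
     //   uint32_t c1 = (uint32_t)(t1 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t1);
     //   uint32_t c2 = (uint32_t)(t2 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t2);
     //   uint32_t crc = c1 ^ c2 ^ crc_c;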
7678 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7679                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7680                                      Register tmp1, Register tmp2,
7681                                      Register n_tmp3) {
7682   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7683   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7684   shlq(in_out, 1);
7685   movl(tmp1, in_out);
7686   shrq(in_out, 32);
7687   xorl(tmp2, tmp2);
7688   crc32(tmp2, tmp1, 4);
7689   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7690   shlq(in1, 1);
7691   movl(tmp1, in1);
7692   shrq(in1, 32);
7693   xorl(tmp2, tmp2);
7694   crc32(tmp2, tmp1, 4);
7695   xorl(in1, tmp2);
7696   xorl(in_out, in1);
7697   xorl(in_out, in2);
7698 }
7699 
7700 // Set N to a predefined value
7701 // Subtract 3 * N from the length of the buffer
7702 // Execute in a loop:
7703 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7704 // for i = 1 to N do
7705 //  CRC_A = CRC32(CRC_A, A[i])
7706 //  CRC_B = CRC32(CRC_B, B[i])
7707 //  CRC_C = CRC32(CRC_C, C[i])
7708 // end for
7709 // Recombine
7710 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7711                                        Register in_out1, Register in_out2, Register in_out3,
7712                                        Register tmp1, Register tmp2, Register tmp3,
7713                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7714                                        Register tmp4, Register tmp5,
7715                                        Register n_tmp6) {
7716   Label L_processPartitions;
7717   Label L_processPartition;
7718   Label L_exit;
7719 
7720   bind(L_processPartitions);
7721   cmpl(in_out1, 3 * size);
7722   jcc(Assembler::less, L_exit);
7723     xorl(tmp1, tmp1);
7724     xorl(tmp2, tmp2);
7725     movq(tmp3, in_out2);
7726     addq(tmp3, size);
7727 
7728     bind(L_processPartition);
7729       crc32(in_out3, Address(in_out2, 0), 8);
7730       crc32(tmp1, Address(in_out2, size), 8);
7731       crc32(tmp2, Address(in_out2, size * 2), 8);
7732       addq(in_out2, 8);
7733       cmpq(in_out2, tmp3);
7734       jcc(Assembler::less, L_processPartition);
7735     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7736             w_xtmp1, w_xtmp2, w_xtmp3,
7737             tmp4, tmp5,
7738             n_tmp6);
7739     addq(in_out2, 2 * size);
7740     subl(in_out1, 3 * size);
7741     jmp(L_processPartitions);
7742 
7743   bind(L_exit);
7744 }
7745 #else
7746 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7747                                      Register tmp1, Register tmp2, Register tmp3,
7748                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7749   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7750   if (n > 0) {
7751     addl(tmp3, n * 256 * 8);
7752   }
7753   //    Q1 = TABLEExt[n][B & 0xFF];
7754   movl(tmp1, in_out);
7755   andl(tmp1, 0x000000FF);
7756   shll(tmp1, 3);
7757   addl(tmp1, tmp3);
7758   movq(xtmp1, Address(tmp1, 0));
7759 
7760   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7761   movl(tmp2, in_out);
7762   shrl(tmp2, 8);
7763   andl(tmp2, 0x000000FF);
7764   shll(tmp2, 3);
7765   addl(tmp2, tmp3);
7766   movq(xtmp2, Address(tmp2, 0));
7767 
7768   psllq(xtmp2, 8);
7769   pxor(xtmp1, xtmp2);
7770 
7771   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7772   movl(tmp2, in_out);
7773   shrl(tmp2, 16);
7774   andl(tmp2, 0x000000FF);
7775   shll(tmp2, 3);
7776   addl(tmp2, tmp3);
7777   movq(xtmp2, Address(tmp2, 0));
7778 
7779   psllq(xtmp2, 16);
7780   pxor(xtmp1, xtmp2);
7781 
7782   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7783   shrl(in_out, 24);
7784   andl(in_out, 0x000000FF);
7785   shll(in_out, 3);
7786   addl(in_out, tmp3);
7787   movq(xtmp2, Address(in_out, 0));
7788 
7789   psllq(xtmp2, 24);
7790   pxor(xtmp1, xtmp2); // Result in CXMM
7791   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7792 }
7793 
7794 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7795                                       Register in_out,
7796                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7797                                       XMMRegister w_xtmp2,
7798                                       Register tmp1,
7799                                       Register n_tmp2, Register n_tmp3) {
7800   if (is_pclmulqdq_supported) {
7801     movdl(w_xtmp1, in_out);
7802 
7803     movl(tmp1, const_or_pre_comp_const_index);
7804     movdl(w_xtmp2, tmp1);
7805     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7806     // Keep the result in XMM since a GPR is only 32 bits wide here
7807   } else {
7808     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7809   }
7810 }
7811 
7812 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7813                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7814                                      Register tmp1, Register tmp2,
7815                                      Register n_tmp3) {
7816   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7817   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7818 
7819   psllq(w_xtmp1, 1);
7820   movdl(tmp1, w_xtmp1);
7821   psrlq(w_xtmp1, 32);
7822   movdl(in_out, w_xtmp1);
7823 
7824   xorl(tmp2, tmp2);
7825   crc32(tmp2, tmp1, 4);
7826   xorl(in_out, tmp2);
7827 
7828   psllq(w_xtmp2, 1);
7829   movdl(tmp1, w_xtmp2);
7830   psrlq(w_xtmp2, 32);
7831   movdl(in1, w_xtmp2);
7832 
7833   xorl(tmp2, tmp2);
7834   crc32(tmp2, tmp1, 4);
7835   xorl(in1, tmp2);
7836   xorl(in_out, in1);
7837   xorl(in_out, in2);
7838 }
7839 
7840 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7841                                        Register in_out1, Register in_out2, Register in_out3,
7842                                        Register tmp1, Register tmp2, Register tmp3,
7843                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7844                                        Register tmp4, Register tmp5,
7845                                        Register n_tmp6) {
7846   Label L_processPartitions;
7847   Label L_processPartition;
7848   Label L_exit;
7849 
7850   bind(L_processPartitions);
7851   cmpl(in_out1, 3 * size);
7852   jcc(Assembler::less, L_exit);
7853     xorl(tmp1, tmp1);
7854     xorl(tmp2, tmp2);
7855     movl(tmp3, in_out2);
7856     addl(tmp3, size);
7857 
7858     bind(L_processPartition);
7859       crc32(in_out3, Address(in_out2, 0), 4);
7860       crc32(tmp1, Address(in_out2, size), 4);
7861       crc32(tmp2, Address(in_out2, size*2), 4);
7862       crc32(in_out3, Address(in_out2, 0+4), 4);
7863       crc32(tmp1, Address(in_out2, size+4), 4);
7864       crc32(tmp2, Address(in_out2, size*2+4), 4);
7865       addl(in_out2, 8);
7866       cmpl(in_out2, tmp3);
7867       jcc(Assembler::less, L_processPartition);
7868 
7869         push(tmp3);
7870         push(in_out1);
7871         push(in_out2);
7872         tmp4 = tmp3;
7873         tmp5 = in_out1;
7874         n_tmp6 = in_out2;
7875 
7876       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7877             w_xtmp1, w_xtmp2, w_xtmp3,
7878             tmp4, tmp5,
7879             n_tmp6);
7880 
7881         pop(in_out2);
7882         pop(in_out1);
7883         pop(tmp3);
7884 
7885     addl(in_out2, 2 * size);
7886     subl(in_out1, 3 * size);
7887     jmp(L_processPartitions);
7888 
7889   bind(L_exit);
7890 }
7891 #endif // LP64
7892 
7893 #ifdef _LP64
7894 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7895 // Input: A buffer I of L bytes.
7896 // Output: the CRC32C value of the buffer.
7897 // Notations:
7898 // Write L = 24N + r, with N = floor (L/24).
7899 // r = L mod 24 (0 <= r < 24).
7900 // Consider I as the concatenation A|B|C|R, where A, B and C each consist of
7901 // N quadwords, and R consists of r bytes.
7902 // A[j] = I [8j+7:8j], j = 0, 1, ..., N-1
7903 // B[j] = I [8N + 8j+7:8N + 8j], j = 0, 1, ..., N-1
7904 // C[j] = I [16N + 8j+7:16N + 8j], j = 0, 1, ..., N-1
7905 // if r > 0, R[j] = I [24N + j], j = 0, 1, ..., r-1
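     // Worked example (for illustration): L = 100 gives N = floor(100/24) = 4 and
     // r = 4, so A = bytes 0..31, B = bytes 32..63, C = bytes 64..95, R = bytes 96..99.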
7906 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7907                                           Register tmp1, Register tmp2, Register tmp3,
7908                                           Register tmp4, Register tmp5, Register tmp6,
7909                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7910                                           bool is_pclmulqdq_supported) {
7911   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7912   Label L_wordByWord;
7913   Label L_byteByByteProlog;
7914   Label L_byteByByte;
7915   Label L_exit;
7916 
7917   if (is_pclmulqdq_supported) {
7918     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7919     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7920 
7921     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7922     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7923 
7924     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7925     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7926     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7927   } else {
7928     const_or_pre_comp_const_index[0] = 1;
7929     const_or_pre_comp_const_index[1] = 0;
7930 
7931     const_or_pre_comp_const_index[2] = 3;
7932     const_or_pre_comp_const_index[3] = 2;
7933 
7934     const_or_pre_comp_const_index[4] = 5;
7935     const_or_pre_comp_const_index[5] = 4;
7936   }
7937   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7938                     in2, in1, in_out,
7939                     tmp1, tmp2, tmp3,
7940                     w_xtmp1, w_xtmp2, w_xtmp3,
7941                     tmp4, tmp5,
7942                     tmp6);
7943   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7944                     in2, in1, in_out,
7945                     tmp1, tmp2, tmp3,
7946                     w_xtmp1, w_xtmp2, w_xtmp3,
7947                     tmp4, tmp5,
7948                     tmp6);
7949   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7950                     in2, in1, in_out,
7951                     tmp1, tmp2, tmp3,
7952                     w_xtmp1, w_xtmp2, w_xtmp3,
7953                     tmp4, tmp5,
7954                     tmp6);
7955   movl(tmp1, in2);
7956   andl(tmp1, 0x00000007);
7957   negl(tmp1);
7958   addl(tmp1, in2);
7959   addq(tmp1, in1);
7960 
7961   BIND(L_wordByWord);
7962   cmpq(in1, tmp1);
7963   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7964     crc32(in_out, Address(in1, 0), 4);
7965     addq(in1, 4);
7966     jmp(L_wordByWord);
7967 
7968   BIND(L_byteByByteProlog);
7969   andl(in2, 0x00000007);
7970   movl(tmp2, 1);
7971 
7972   BIND(L_byteByByte);
7973   cmpl(tmp2, in2);
7974   jccb(Assembler::greater, L_exit);
7975     crc32(in_out, Address(in1, 0), 1);
7976     incq(in1);
7977     incl(tmp2);
7978     jmp(L_byteByByte);
7979 
7980   BIND(L_exit);
7981 }
7982 #else
7983 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7984                                           Register tmp1, Register  tmp2, Register tmp3,
7985                                           Register tmp4, Register  tmp5, Register tmp6,
7986                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7987                                           bool is_pclmulqdq_supported) {
7988   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7989   Label L_wordByWord;
7990   Label L_byteByByteProlog;
7991   Label L_byteByByte;
7992   Label L_exit;
7993 
7994   if (is_pclmulqdq_supported) {
7995     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7996     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7997 
7998     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7999     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8000 
8001     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8002     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8003   } else {
8004     const_or_pre_comp_const_index[0] = 1;
8005     const_or_pre_comp_const_index[1] = 0;
8006 
8007     const_or_pre_comp_const_index[2] = 3;
8008     const_or_pre_comp_const_index[3] = 2;
8009 
8010     const_or_pre_comp_const_index[4] = 5;
8011     const_or_pre_comp_const_index[5] = 4;
8012   }
8013   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8014                     in2, in1, in_out,
8015                     tmp1, tmp2, tmp3,
8016                     w_xtmp1, w_xtmp2, w_xtmp3,
8017                     tmp4, tmp5,
8018                     tmp6);
8019   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8020                     in2, in1, in_out,
8021                     tmp1, tmp2, tmp3,
8022                     w_xtmp1, w_xtmp2, w_xtmp3,
8023                     tmp4, tmp5,
8024                     tmp6);
8025   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8026                     in2, in1, in_out,
8027                     tmp1, tmp2, tmp3,
8028                     w_xtmp1, w_xtmp2, w_xtmp3,
8029                     tmp4, tmp5,
8030                     tmp6);
8031   movl(tmp1, in2);
8032   andl(tmp1, 0x00000007);
8033   negl(tmp1);
8034   addl(tmp1, in2);
8035   addl(tmp1, in1);
8036 
8037   BIND(L_wordByWord);
8038   cmpl(in1, tmp1);
8039   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8040     crc32(in_out, Address(in1,0), 4);
8041     addl(in1, 4);
8042     jmp(L_wordByWord);
8043 
8044   BIND(L_byteByByteProlog);
8045   andl(in2, 0x00000007);
8046   movl(tmp2, 1);
8047 
8048   BIND(L_byteByByte);
8049   cmpl(tmp2, in2);
8050   jccb(Assembler::greater, L_exit);
8051     movb(tmp1, Address(in1, 0));
8052     crc32(in_out, tmp1, 1);
8053     incl(in1);
8054     incl(tmp2);
8055     jmp(L_byteByByte);
8056 
8057   BIND(L_exit);
8058 }
8059 #endif // LP64
8060 #undef BIND
8061 #undef BLOCK_COMMENT
8062 
8063 // Compress char[] array to byte[].
8064 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8065 //   @IntrinsicCandidate
8066 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8067 //     for (int i = 0; i < len; i++) {
8068 //       int c = src[srcOff++];
8069 //       if (c >>> 8 != 0) {
8070 //         return 0;
8071 //       }
8072 //       dst[dstOff++] = (byte)c;
8073 //     }
8074 //     return len;
8075 //   }
8076 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8077   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8078   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8079   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8080   Label copy_chars_loop, return_length, return_zero, done;
8081 
8082   // rsi: src
8083   // rdi: dst
8084   // rdx: len
8085   // rcx: tmp5
8086   // rax: result
8087 
8088   // rsi holds start addr of source char[] to be compressed
8089   // rdi holds start addr of destination byte[]
8090   // rdx holds length
8091 
8092   assert(len != result, "");
8093 
8094   // save length for return
8095   push(len);
8096 
8097   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8098     VM_Version::supports_avx512vlbw() &&
8099     VM_Version::supports_bmi2()) {
8100 
8101     Label copy_32_loop, copy_loop_tail, below_threshold;
8102 
8103     // alignment
8104     Label post_alignment;
8105 
8106     // if the length of the string is less than 32, handle it the old-fashioned way
8107     testl(len, -32);
8108     jcc(Assembler::zero, below_threshold);
8109 
8110     // First check whether a character is compressible ( <= 0xFF).
8111     // Create mask to test for Unicode chars inside zmm vector
8112     movl(result, 0x00FF);
8113     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8114 
8115     testl(len, -64);
8116     jcc(Assembler::zero, post_alignment);
8117 
8118     movl(tmp5, dst);
8119     andl(tmp5, (32 - 1));
8120     negl(tmp5);
8121     andl(tmp5, (32 - 1));
8122 
8123     // bail out when there is nothing to be done
8124     testl(tmp5, 0xFFFFFFFF);
8125     jcc(Assembler::zero, post_alignment);
8126 
8127     // ~(~0 << len), where len is the # of remaining elements to process
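         // e.g. if tmp5 == 5, 0xFFFFFFFF << 5 == 0xFFFFFFE0 and its complement
         // 0x0000001F selects exactly the five remaining char lanes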
8128     movl(result, 0xFFFFFFFF);
8129     shlxl(result, result, tmp5);
8130     notl(result);
8131     kmovdl(mask2, result);
8132 
8133     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8134     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8135     ktestd(mask1, mask2);
8136     jcc(Assembler::carryClear, return_zero);
8137 
8138     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8139 
8140     addptr(src, tmp5);
8141     addptr(src, tmp5);
8142     addptr(dst, tmp5);
8143     subl(len, tmp5);
8144 
8145     bind(post_alignment);
8146     // end of alignment
8147 
8148     movl(tmp5, len);
8149     andl(tmp5, (32 - 1));    // tail count (in chars)
8150     andl(len, ~(32 - 1));    // vector count (in chars)
8151     jcc(Assembler::zero, copy_loop_tail);
8152 
8153     lea(src, Address(src, len, Address::times_2));
8154     lea(dst, Address(dst, len, Address::times_1));
8155     negptr(len);
8156 
8157     bind(copy_32_loop);
8158     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
8159     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8160     kortestdl(mask1, mask1);
8161     jcc(Assembler::carryClear, return_zero);
8162 
8163     // All elements in the current chunk are valid candidates for compression.
8164     // Write the truncated byte elements to memory.
8165     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8166     addptr(len, 32);
8167     jcc(Assembler::notZero, copy_32_loop);
8168 
8169     bind(copy_loop_tail);
8170     // bail out when there is nothing to be done
8171     testl(tmp5, 0xFFFFFFFF);
8172     jcc(Assembler::zero, return_length);
8173 
8174     movl(len, tmp5);
8175 
8176     // ~(~0 << len), where len is the # of remaining elements to process
8177     movl(result, 0xFFFFFFFF);
8178     shlxl(result, result, len);
8179     notl(result);
8180 
8181     kmovdl(mask2, result);
8182 
8183     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8184     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8185     ktestd(mask1, mask2);
8186     jcc(Assembler::carryClear, return_zero);
8187 
8188     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8189     jmp(return_length);
8190 
8191     bind(below_threshold);
8192   }
8193 
8194   if (UseSSE42Intrinsics) {
8195     Label copy_32_loop, copy_16, copy_tail;
8196 
8197     movl(result, len);
8198 
8199     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
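         // any char above 0xFF has a non-zero high byte, so OR-ing chunks together
         // and PTEST-ing against this mask detects a non-latin1 char in a whole chunk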
8200 
8201     // vectored compression
8202     andl(len, 0xfffffff0);    // vector count (in chars)
8203     andl(result, 0x0000000f);    // tail count (in chars)
8204     testl(len, len);
8205     jcc(Assembler::zero, copy_16);
8206 
8207     // compress 16 chars per iter
8208     movdl(tmp1Reg, tmp5);
8209     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8210     pxor(tmp4Reg, tmp4Reg);
8211 
8212     lea(src, Address(src, len, Address::times_2));
8213     lea(dst, Address(dst, len, Address::times_1));
8214     negptr(len);
8215 
8216     bind(copy_32_loop);
8217     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8218     por(tmp4Reg, tmp2Reg);
8219     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8220     por(tmp4Reg, tmp3Reg);
8221     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8222     jcc(Assembler::notZero, return_zero);
8223     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8224     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8225     addptr(len, 16);
8226     jcc(Assembler::notZero, copy_32_loop);
8227 
8228     // compress next vector of 8 chars (if any)
8229     bind(copy_16);
8230     movl(len, result);
8231     andl(len, 0xfffffff8);    // vector count (in chars)
8232     andl(result, 0x00000007);    // tail count (in chars)
8233     testl(len, len);
8234     jccb(Assembler::zero, copy_tail);
8235 
8236     movdl(tmp1Reg, tmp5);
8237     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8238     pxor(tmp3Reg, tmp3Reg);
8239 
8240     movdqu(tmp2Reg, Address(src, 0));
8241     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8242     jccb(Assembler::notZero, return_zero);
8243     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8244     movq(Address(dst, 0), tmp2Reg);
8245     addptr(src, 16);
8246     addptr(dst, 8);
8247 
8248     bind(copy_tail);
8249     movl(len, result);
8250   }
8251   // compress 1 char per iter
8252   testl(len, len);
8253   jccb(Assembler::zero, return_length);
8254   lea(src, Address(src, len, Address::times_2));
8255   lea(dst, Address(dst, len, Address::times_1));
8256   negptr(len);
8257 
8258   bind(copy_chars_loop);
8259   load_unsigned_short(result, Address(src, len, Address::times_2));
8260   testl(result, 0xff00);      // check if Unicode char
8261   jccb(Assembler::notZero, return_zero);
8262   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
8263   increment(len);
8264   jcc(Assembler::notZero, copy_chars_loop);
8265 
8266   // if compression succeeded, return length
8267   bind(return_length);
8268   pop(result);
8269   jmpb(done);
8270 
8271   // if compression failed, return 0
8272   bind(return_zero);
8273   xorl(result, result);
8274   addptr(rsp, wordSize);
8275 
8276   bind(done);
8277 }
8278 
8279 // Inflate byte[] array to char[].
8280 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8281 //   @IntrinsicCandidate
8282 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8283 //     for (int i = 0; i < len; i++) {
8284 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8285 //     }
8286 //   }
8287 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8288   XMMRegister tmp1, Register tmp2, KRegister mask) {
8289   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8290   // rsi: src
8291   // rdi: dst
8292   // rdx: len
8293   // rcx: tmp2
8294 
8295   // rsi holds start addr of source byte[] to be inflated
8296   // rdi holds start addr of destination char[]
8297   // rdx holds length
8298   assert_different_registers(src, dst, len, tmp2);
8299   movl(tmp2, len);
8300   if ((UseAVX > 2) && // AVX512
8301     VM_Version::supports_avx512vlbw() &&
8302     VM_Version::supports_bmi2()) {
8303 
8304     Label copy_32_loop, copy_tail;
8305     Register tmp3_aliased = len;
8306 
8307     // if length of the string is less than 16, handle it in an old fashioned way
8308     testl(len, -16);
8309     jcc(Assembler::zero, below_threshold);
8310 
8311     testl(len, -1 * AVX3Threshold);
8312     jcc(Assembler::zero, avx3_threshold);
8313 
8314     // In order to use only one arithmetic operation (the index update) in the main
8315     // loop, pre-compute the tail and vector counts here
8316     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8317     andl(len, -32);     // vector count
8318     jccb(Assembler::zero, copy_tail);
8319 
8320     lea(src, Address(src, len, Address::times_1));
8321     lea(dst, Address(dst, len, Address::times_2));
8322     negptr(len);
8323 
8324 
8325     // inflate 32 chars per iter
8326     bind(copy_32_loop);
8327     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8328     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
8329     addptr(len, 32);
8330     jcc(Assembler::notZero, copy_32_loop);
8331 
8332     bind(copy_tail);
8333     // bail out when there is nothing to be done
8334     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8335     jcc(Assembler::zero, done);
8336 
8337     // ~(~0 << length), where length is the # of remaining elements to process
8338     movl(tmp3_aliased, -1);
8339     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8340     notl(tmp3_aliased);
8341     kmovdl(mask, tmp3_aliased);
8342     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8343     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8344 
8345     jmp(done);
8346     bind(avx3_threshold);
8347   }
8348   if (UseSSE42Intrinsics) {
8349     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8350 
8351     if (UseAVX > 1) {
8352       andl(tmp2, (16 - 1));
8353       andl(len, -16);
8354       jccb(Assembler::zero, copy_new_tail);
8355     } else {
8356       andl(tmp2, 0x00000007);   // tail count (in chars)
8357       andl(len, 0xfffffff8);    // vector count (in chars)
8358       jccb(Assembler::zero, copy_tail);
8359     }
8360 
8361     // vectored inflation
8362     lea(src, Address(src, len, Address::times_1));
8363     lea(dst, Address(dst, len, Address::times_2));
8364     negptr(len);
8365 
8366     if (UseAVX > 1) {
8367       bind(copy_16_loop);
8368       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8369       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8370       addptr(len, 16);
8371       jcc(Assembler::notZero, copy_16_loop);
8372 
8373       bind(below_threshold);
8374       bind(copy_new_tail);
8375       movl(len, tmp2);
8376       andl(tmp2, 0x00000007);
8377       andl(len, 0xFFFFFFF8);
8378       jccb(Assembler::zero, copy_tail);
8379 
8380       pmovzxbw(tmp1, Address(src, 0));
8381       movdqu(Address(dst, 0), tmp1);
8382       addptr(src, 8);
8383       addptr(dst, 2 * 8);
8384 
8385       jmp(copy_tail, true);
8386     }
8387 
8388     // inflate 8 chars per iter
8389     bind(copy_8_loop);
8390     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8391     movdqu(Address(dst, len, Address::times_2), tmp1);
8392     addptr(len, 8);
8393     jcc(Assembler::notZero, copy_8_loop);
8394 
8395     bind(copy_tail);
8396     movl(len, tmp2);
8397 
8398     cmpl(len, 4);
8399     jccb(Assembler::less, copy_bytes);
8400 
8401     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8402     pmovzxbw(tmp1, tmp1);
8403     movq(Address(dst, 0), tmp1);
8404     subptr(len, 4);
8405     addptr(src, 4);
8406     addptr(dst, 8);
8407 
8408     bind(copy_bytes);
8409   } else {
8410     bind(below_threshold);
8411   }
8412 
8413   testl(len, len);
8414   jccb(Assembler::zero, done);
8415   lea(src, Address(src, len, Address::times_1));
8416   lea(dst, Address(dst, len, Address::times_2));
8417   negptr(len);
8418 
8419   // inflate 1 char per iter
8420   bind(copy_chars_loop);
8421   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8422   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8423   increment(len);
8424   jcc(Assembler::notZero, copy_chars_loop);
8425 
8426   bind(done);
8427 }
8428 
8429 
8430 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8431   switch(type) {
8432     case T_BYTE:
8433     case T_BOOLEAN:
8434       evmovdqub(dst, kmask, src, merge, vector_len);
8435       break;
8436     case T_CHAR:
8437     case T_SHORT:
8438       evmovdquw(dst, kmask, src, merge, vector_len);
8439       break;
8440     case T_INT:
8441     case T_FLOAT:
8442       evmovdqul(dst, kmask, src, merge, vector_len);
8443       break;
8444     case T_LONG:
8445     case T_DOUBLE:
8446       evmovdquq(dst, kmask, src, merge, vector_len);
8447       break;
8448     default:
8449       fatal("Unexpected type argument %s", type2name(type));
8450       break;
8451   }
8452 }
8453 
8454 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8455   switch(type) {
8456     case T_BYTE:
8457     case T_BOOLEAN:
8458       evmovdqub(dst, kmask, src, merge, vector_len);
8459       break;
8460     case T_CHAR:
8461     case T_SHORT:
8462       evmovdquw(dst, kmask, src, merge, vector_len);
8463       break;
8464     case T_INT:
8465     case T_FLOAT:
8466       evmovdqul(dst, kmask, src, merge, vector_len);
8467       break;
8468     case T_LONG:
8469     case T_DOUBLE:
8470       evmovdquq(dst, kmask, src, merge, vector_len);
8471       break;
8472     default:
8473       fatal("Unexpected type argument %s", type2name(type));
8474       break;
8475   }
8476 }
8477 
8478 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
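       // For masklen 2 and 4 the whole 8-bit mask register is inverted first, so the
       // result is ANDed with 0x3 / 0xF below to clear the bits above the active lanes.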
8479   switch(masklen) {
8480     case 2:
8481        knotbl(dst, src);
8482        movl(rtmp, 3);
8483        kmovbl(ktmp, rtmp);
8484        kandbl(dst, ktmp, dst);
8485        break;
8486     case 4:
8487        knotbl(dst, src);
8488        movl(rtmp, 15);
8489        kmovbl(ktmp, rtmp);
8490        kandbl(dst, ktmp, dst);
8491        break;
8492     case 8:
8493        knotbl(dst, src);
8494        break;
8495     case 16:
8496        knotwl(dst, src);
8497        break;
8498     case 32:
8499        knotdl(dst, src);
8500        break;
8501     case 64:
8502        knotql(dst, src);
8503        break;
8504     default:
8505       fatal("Unexpected vector length %d", masklen);
8506       break;
8507   }
8508 }
8509 
8510 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8511   switch(type) {
8512     case T_BOOLEAN:
8513     case T_BYTE:
8514        kandbl(dst, src1, src2);
8515        break;
8516     case T_CHAR:
8517     case T_SHORT:
8518        kandwl(dst, src1, src2);
8519        break;
8520     case T_INT:
8521     case T_FLOAT:
8522        kanddl(dst, src1, src2);
8523        break;
8524     case T_LONG:
8525     case T_DOUBLE:
8526        kandql(dst, src1, src2);
8527        break;
8528     default:
8529       fatal("Unexpected type argument %s", type2name(type));
8530       break;
8531   }
8532 }
8533 
8534 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8535   switch(type) {
8536     case T_BOOLEAN:
8537     case T_BYTE:
8538        korbl(dst, src1, src2);
8539        break;
8540     case T_CHAR:
8541     case T_SHORT:
8542        korwl(dst, src1, src2);
8543        break;
8544     case T_INT:
8545     case T_FLOAT:
8546        kordl(dst, src1, src2);
8547        break;
8548     case T_LONG:
8549     case T_DOUBLE:
8550        korql(dst, src1, src2);
8551        break;
8552     default:
8553       fatal("Unexpected type argument %s", type2name(type));
8554       break;
8555   }
8556 }
8557 
8558 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8559   switch(type) {
8560     case T_BOOLEAN:
8561     case T_BYTE:
8562        kxorbl(dst, src1, src2);
8563        break;
8564     case T_CHAR:
8565     case T_SHORT:
8566        kxorwl(dst, src1, src2);
8567        break;
8568     case T_INT:
8569     case T_FLOAT:
8570        kxordl(dst, src1, src2);
8571        break;
8572     case T_LONG:
8573     case T_DOUBLE:
8574        kxorql(dst, src1, src2);
8575        break;
8576     default:
8577       fatal("Unexpected type argument %s", type2name(type));
8578       break;
8579   }
8580 }
8581 
8582 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8583   switch(type) {
8584     case T_BOOLEAN:
8585     case T_BYTE:
8586       evpermb(dst, mask, nds, src, merge, vector_len); break;
8587     case T_CHAR:
8588     case T_SHORT:
8589       evpermw(dst, mask, nds, src, merge, vector_len); break;
8590     case T_INT:
8591     case T_FLOAT:
8592       evpermd(dst, mask, nds, src, merge, vector_len); break;
8593     case T_LONG:
8594     case T_DOUBLE:
8595       evpermq(dst, mask, nds, src, merge, vector_len); break;
8596     default:
8597       fatal("Unexpected type argument %s", type2name(type)); break;
8598   }
8599 }
8600 
8601 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8602   switch(type) {
8603     case T_BOOLEAN:
8604     case T_BYTE:
8605       evpermb(dst, mask, nds, src, merge, vector_len); break;
8606     case T_CHAR:
8607     case T_SHORT:
8608       evpermw(dst, mask, nds, src, merge, vector_len); break;
8609     case T_INT:
8610     case T_FLOAT:
8611       evpermd(dst, mask, nds, src, merge, vector_len); break;
8612     case T_LONG:
8613     case T_DOUBLE:
8614       evpermq(dst, mask, nds, src, merge, vector_len); break;
8615     default:
8616       fatal("Unexpected type argument %s", type2name(type)); break;
8617   }
8618 }
8619 
8620 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8621   switch(type) {
8622     case T_BYTE:
8623       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8624     case T_SHORT:
8625       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8626     case T_INT:
8627       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8628     case T_LONG:
8629       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8630     default:
8631       fatal("Unexpected type argument %s", type2name(type)); break;
8632   }
8633 }
8634 
8635 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8636   switch(type) {
8637     case T_BYTE:
8638       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8639     case T_SHORT:
8640       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8641     case T_INT:
8642       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8643     case T_LONG:
8644       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8645     default:
8646       fatal("Unexpected type argument %s", type2name(type)); break;
8647   }
8648 }
8649 
8650 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8651   switch(type) {
8652     case T_BYTE:
8653       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8654     case T_SHORT:
8655       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8656     case T_INT:
8657       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8658     case T_LONG:
8659       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8660     default:
8661       fatal("Unexpected type argument %s", type2name(type)); break;
8662   }
8663 }
8664 
8665 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8666   switch(type) {
8667     case T_BYTE:
8668       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8669     case T_SHORT:
8670       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8671     case T_INT:
8672       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8673     case T_LONG:
8674       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8675     default:
8676       fatal("Unexpected type argument %s", type2name(type)); break;
8677   }
8678 }
8679 
8680 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8681   switch(type) {
8682     case T_INT:
8683       evpxord(dst, mask, nds, src, merge, vector_len); break;
8684     case T_LONG:
8685       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8686     default:
8687       fatal("Unexpected type argument %s", type2name(type)); break;
8688   }
8689 }
8690 
8691 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8692   switch(type) {
8693     case T_INT:
8694       evpxord(dst, mask, nds, src, merge, vector_len); break;
8695     case T_LONG:
8696       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8697     default:
8698       fatal("Unexpected type argument %s", type2name(type)); break;
8699   }
8700 }
8701 
8702 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8703   switch(type) {
8704     case T_INT:
8705       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8706     case T_LONG:
8707       evporq(dst, mask, nds, src, merge, vector_len); break;
8708     default:
8709       fatal("Unexpected type argument %s", type2name(type)); break;
8710   }
8711 }
8712 
8713 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8714   switch(type) {
8715     case T_INT:
8716       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8717     case T_LONG:
8718       evporq(dst, mask, nds, src, merge, vector_len); break;
8719     default:
8720       fatal("Unexpected type argument %s", type2name(type)); break;
8721   }
8722 }
8723 
8724 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8725   switch(type) {
8726     case T_INT:
8727       evpandd(dst, mask, nds, src, merge, vector_len); break;
8728     case T_LONG:
8729       evpandq(dst, mask, nds, src, merge, vector_len); break;
8730     default:
8731       fatal("Unexpected type argument %s", type2name(type)); break;
8732   }
8733 }
8734 
8735 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8736   switch(type) {
8737     case T_INT:
8738       evpandd(dst, mask, nds, src, merge, vector_len); break;
8739     case T_LONG:
8740       evpandq(dst, mask, nds, src, merge, vector_len); break;
8741     default:
8742       fatal("Unexpected type argument %s", type2name(type)); break;
8743   }
8744 }
8745 
8746 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) {
8747    masklen = masklen < 8 ? 8 : masklen;
8748    ktest(masklen, src1, src2);
8749    setb(Assembler::notZero, dst);
8750    movzbl(dst, dst);
8751 }
8752 
8753 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) {
8754   if (masklen < 8) {
8755     knotbl(kscratch, src2);
8756     kortestbl(src1, kscratch);
8757     setb(Assembler::carrySet, dst);
8758     movzbl(dst, dst);
8759   } else {
8760     ktest(masklen, src1, src2);
8761     setb(Assembler::carrySet, dst);
8762     movzbl(dst, dst);
8763   }
8764 }
8765 
8766 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
8767   switch(masklen) {
8768     case 8:
8769        kortestbl(src1, src2);
8770        break;
8771     case 16:
8772        kortestwl(src1, src2);
8773        break;
8774     case 32:
8775        kortestdl(src1, src2);
8776        break;
8777     case 64:
8778        kortestql(src1, src2);
8779        break;
8780     default:
8781       fatal("Unexpected mask length %d", masklen);
8782       break;
8783   }
8784 }
8785 
8786 
8787 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
8788   switch(masklen)  {
8789     case 8:
8790        ktestbl(src1, src2);
8791        break;
8792     case 16:
8793        ktestwl(src1, src2);
8794        break;
8795     case 32:
8796        ktestdl(src1, src2);
8797        break;
8798     case 64:
8799        ktestql(src1, src2);
8800        break;
8801     default:
8802       fatal("Unexpected mask length %d", masklen);
8803       break;
8804   }
8805 }
8806 
8807 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8808   switch(type) {
8809     case T_INT:
8810       evprold(dst, mask, src, shift, merge, vlen_enc); break;
8811     case T_LONG:
8812       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
8813     default:
8814       fatal("Unexpected type argument %s", type2name(type)); break;
8815       break;
8816   }
8817 }
8818 
8819 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
8820   switch(type) {
8821     case T_INT:
8822       evprord(dst, mask, src, shift, merge, vlen_enc); break;
8823     case T_LONG:
8824       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
8825     default:
8826       fatal("Unexpected type argument %s", type2name(type)); break;
8827   }
8828 }
8829 
8830 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8831   switch(type) {
8832     case T_INT:
8833       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
8834     case T_LONG:
8835       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
8836     default:
8837       fatal("Unexpected type argument %s", type2name(type)); break;
8838   }
8839 }
8840 
8841 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
8842   switch(type) {
8843     case T_INT:
8844       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
8845     case T_LONG:
8846       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
8847     default:
8848       fatal("Unexpected type argument %s", type2name(type)); break;
8849   }
8850 }
8851 #if COMPILER2_OR_JVMCI
8852 
8853 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
8854                                  Register length, Register temp, int vec_enc) {
8855   // Computing mask for predicated vector store.
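       // e.g. length == 5 leaves 0x1F in temp after bzhiq, so only the low five
       // elements of the destination are written by the masked store below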
8856   movptr(temp, -1);
8857   bzhiq(temp, temp, length);
8858   kmov(mask, temp);
8859   evmovdqu(bt, mask, dst, xmm, true, vec_enc);
8860 }
8861 
8862 // Fill memory for a length of less than 64 bytes.
8863 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
8864                                        XMMRegister xmm, KRegister mask, Register length,
8865                                        Register temp, bool use64byteVector) {
8866   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8867   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
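       // 'shift' selects the element type above (0: byte, 1: short, 2: int, 3: long);
       // 'length' is an element count while 'disp' is a byte offset.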
8868   if (!use64byteVector) {
8869     fill32(dst, disp, xmm);
8870     subptr(length, 32 >> shift);
8871     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
8872   } else {
8873     assert(MaxVectorSize == 64, "vector length != 64");
8874     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
8875   }
8876 }
8877 
8878 
8879 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
8880                                        XMMRegister xmm, KRegister mask, Register length,
8881                                        Register temp) {
8882   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8883   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8884   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
8885 }
8886 
8887 
8888 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
8889   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8890   vmovdqu(dst, xmm);
8891 }
8892 
8893 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
8894   fill32(Address(dst, disp), xmm);
8895 }
8896 
8897 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
8898   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8899   if (!use64byteVector) {
8900     fill32(dst, xmm);
8901     fill32(dst.plus_disp(32), xmm);
8902   } else {
8903     evmovdquq(dst, xmm, Assembler::AVX_512bit);
8904   }
8905 }
8906 
8907 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8908   fill64(Address(dst, disp), xmm, use64byteVector);
8909 }
8910 
8911 #ifdef _LP64
8912 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
8913                                         Register count, Register rtmp, XMMRegister xtmp) {
8914   Label L_exit;
8915   Label L_fill_start;
8916   Label L_fill_64_bytes;
8917   Label L_fill_96_bytes;
8918   Label L_fill_128_bytes;
8919   Label L_fill_128_bytes_loop;
8920   Label L_fill_128_loop_header;
8921   Label L_fill_128_bytes_loop_header;
8922   Label L_fill_128_bytes_loop_pre_header;
8923   Label L_fill_zmm_sequence;
8924 
8925   int shift = -1;
8926   int avx3threshold = VM_Version::avx3_threshold();
8927   switch(type) {
8928     case T_BYTE:  shift = 0;
8929       break;
8930     case T_SHORT: shift = 1;
8931       break;
8932     case T_INT:   shift = 2;
8933       break;
8934     /* Uncomment when LONG fill stubs are supported.
8935     case T_LONG:  shift = 3;
8936       break;
8937     */
8938     default:
8939       fatal("Unhandled type: %s\n", type2name(type));
8940   }
8941 
8942   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
8943 
8944     if (MaxVectorSize == 64) {
8945       cmpq(count, avx3threshold >> shift);
8946       jcc(Assembler::greater, L_fill_zmm_sequence);
8947     }
8948 
8949     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
8950 
8951     bind(L_fill_start);
8952 
8953     cmpq(count, 32 >> shift);
8954     jccb(Assembler::greater, L_fill_64_bytes);
8955     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
8956     jmp(L_exit);
8957 
8958     bind(L_fill_64_bytes);
8959     cmpq(count, 64 >> shift);
8960     jccb(Assembler::greater, L_fill_96_bytes);
8961     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
8962     jmp(L_exit);
8963 
8964     bind(L_fill_96_bytes);
8965     cmpq(count, 96 >> shift);
8966     jccb(Assembler::greater, L_fill_128_bytes);
8967     fill64(to, 0, xtmp);
8968     subq(count, 64 >> shift);
8969     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
8970     jmp(L_exit);
8971 
8972     bind(L_fill_128_bytes);
8973     cmpq(count, 128 >> shift);
8974     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
8975     fill64(to, 0, xtmp);
8976     fill32(to, 64, xtmp);
8977     subq(count, 96 >> shift);
8978     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
8979     jmp(L_exit);
8980 
8981     bind(L_fill_128_bytes_loop_pre_header);
8982     {
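           // Align 'to' to a 32-byte boundary: store the first (32 - misalignment)
           // bytes with a byte-granular mask, then advance 'to' and reduce 'count'
           // by the equivalent number of elements (rtmp >> shift).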
8983       mov(rtmp, to);
8984       andq(rtmp, 31);
8985       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
8986       negq(rtmp);
8987       addq(rtmp, 32);
8988       mov64(r8, -1L);
8989       bzhiq(r8, r8, rtmp);
8990       kmovql(k2, r8);
8991       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
8992       addq(to, rtmp);
8993       shrq(rtmp, shift);
8994       subq(count, rtmp);
8995     }
8996 
8997     cmpq(count, 128 >> shift);
8998     jcc(Assembler::less, L_fill_start);
8999 
9000     bind(L_fill_128_bytes_loop_header);
9001     subq(count, 128 >> shift);
9002 
9003     align32();
9004     bind(L_fill_128_bytes_loop);
9005       fill64(to, 0, xtmp);
9006       fill64(to, 64, xtmp);
9007       addq(to, 128);
9008       subq(count, 128 >> shift);
9009       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9010 
9011     addq(count, 128 >> shift);
9012     jcc(Assembler::zero, L_exit);
9013     jmp(L_fill_start);
9014   }
9015 
9016   if (MaxVectorSize == 64) {
9017     // Sequence using 64 byte ZMM register.
9018     Label L_fill_128_bytes_zmm;
9019     Label L_fill_192_bytes_zmm;
9020     Label L_fill_192_bytes_loop_zmm;
9021     Label L_fill_192_bytes_loop_header_zmm;
9022     Label L_fill_192_bytes_loop_pre_header_zmm;
9023     Label L_fill_start_zmm_sequence;
9024 
9025     bind(L_fill_zmm_sequence);
9026     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9027 
9028     bind(L_fill_start_zmm_sequence);
9029     cmpq(count, 64 >> shift);
9030     jccb(Assembler::greater, L_fill_128_bytes_zmm);
9031     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9032     jmp(L_exit);
9033 
9034     bind(L_fill_128_bytes_zmm);
9035     cmpq(count, 128 >> shift);
9036     jccb(Assembler::greater, L_fill_192_bytes_zmm);
9037     fill64(to, 0, xtmp, true);
9038     subq(count, 64 >> shift);
9039     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9040     jmp(L_exit);
9041 
9042     bind(L_fill_192_bytes_zmm);
9043     cmpq(count, 192 >> shift);
9044     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9045     fill64(to, 0, xtmp, true);
9046     fill64(to, 64, xtmp, true);
9047     subq(count, 128 >> shift);
9048     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9049     jmp(L_exit);
9050 
9051     bind(L_fill_192_bytes_loop_pre_header_zmm);
9052     {
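           // Same idea as the 32-byte alignment above, but align 'to' to a 64-byte
           // boundary for the ZMM loop before entering it.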
9053       movq(rtmp, to);
9054       andq(rtmp, 63);
9055       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9056       negq(rtmp);
9057       addq(rtmp, 64);
9058       mov64(r8, -1L);
9059       bzhiq(r8, r8, rtmp);
9060       kmovql(k2, r8);
9061       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9062       addq(to, rtmp);
9063       shrq(rtmp, shift);
9064       subq(count, rtmp);
9065     }
9066 
9067     cmpq(count, 192 >> shift);
9068     jcc(Assembler::less, L_fill_start_zmm_sequence);
9069 
9070     bind(L_fill_192_bytes_loop_header_zmm);
9071     subq(count, 192 >> shift);
9072 
9073     align32();
9074     bind(L_fill_192_bytes_loop_zmm);
9075       fill64(to, 0, xtmp, true);
9076       fill64(to, 64, xtmp, true);
9077       fill64(to, 128, xtmp, true);
9078       addq(to, 192);
9079       subq(count, 192 >> shift);
9080       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9081 
9082     addq(count, 192 >> shift);
9083     jcc(Assembler::zero, L_exit);
9084     jmp(L_fill_start_zmm_sequence);
9085   }
9086   bind(L_exit);
9087 }
9088 #endif
9089 #endif //COMPILER2_OR_JVMCI
9090 
9091 
9092 #ifdef _LP64
9093 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9094   Label done;
9095   cvttss2sil(dst, src);
9096   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
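       // (on such inputs cvttss2si returns the 'integer indefinite' value 0x80000000,
       //  which is what the compare below detects)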
9097   cmpl(dst, 0x80000000); // float_sign_flip
9098   jccb(Assembler::notEqual, done);
9099   subptr(rsp, 8);
9100   movflt(Address(rsp, 0), src);
9101   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9102   pop(dst);
9103   bind(done);
9104 }
9105 
9106 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9107   Label done;
9108   cvttsd2sil(dst, src);
9109   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9110   cmpl(dst, 0x80000000); // float_sign_flip
9111   jccb(Assembler::notEqual, done);
9112   subptr(rsp, 8);
9113   movdbl(Address(rsp, 0), src);
9114   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9115   pop(dst);
9116   bind(done);
9117 }
9118 
9119 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9120   Label done;
9121   cvttss2siq(dst, src);
9122   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9123   jccb(Assembler::notEqual, done);
9124   subptr(rsp, 8);
9125   movflt(Address(rsp, 0), src);
9126   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9127   pop(dst);
9128   bind(done);
9129 }
9130 
9131 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9132   // The following code is a line-by-line assembly translation of the rounding algorithm.
9133   // Please refer to the java.lang.Math.round(float) algorithm for details.
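       //
       // A hedged C-style sketch of what the code below computes (illustrative only;
       // the constants correspond to the FloatConsts_* locals declared just below):
       //
       //   int round_float(float a) {
       //     int bits = bit_cast<int>(a);
       //     int biased_exp = (bits & EXP_BIT_MASK) >> (SIGNIFICAND_WIDTH - 1);
       //     int shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biased_exp;
       //     if ((shift & -32) == 0) {           // 0 <= shift < 32: small finite value
       //       int r = (bits & SIGNIF_BIT_MASK) | (SIGNIF_BIT_MASK + 1);
       //       if (bits < 0) r = -r;
       //       return ((r >> shift) + 1) >> 1;   // add 0.5, then truncate
       //     }
       //     return (int)a;                      // huge, infinite or NaN inputs
       //   }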
9134   const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
9135   const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
9136   const int32_t FloatConsts_EXP_BIAS = 127;
9137   const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
9138   const int32_t MINUS_32 = 0xFFFFFFE0;
9139   Label L_special_case, L_block1, L_exit;
9140   movl(rtmp, FloatConsts_EXP_BIT_MASK);
9141   movdl(dst, src);
9142   andl(dst, rtmp);
9143   sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
9144   movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
9145   subl(rtmp, dst);
9146   movl(rcx, rtmp);
9147   movl(dst, MINUS_32);
9148   testl(rtmp, dst);
9149   jccb(Assembler::notEqual, L_special_case);
9150   movdl(dst, src);
9151   andl(dst, FloatConsts_SIGNIF_BIT_MASK);
9152   orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
9153   movdl(rtmp, src);
9154   testl(rtmp, rtmp);
9155   jccb(Assembler::greaterEqual, L_block1);
9156   negl(dst);
9157   bind(L_block1);
9158   sarl(dst);
9159   addl(dst, 0x1);
9160   sarl(dst, 0x1);
9161   jmp(L_exit);
9162   bind(L_special_case);
9163   convert_f2i(dst, src);
9164   bind(L_exit);
9165 }
9166 
9167 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9168   // The following code is a line-by-line assembly translation of the rounding algorithm.
9169   // Please refer to the java.lang.Math.round(double) algorithm for details.
9170   const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
9171   const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
9172   const int64_t DoubleConsts_EXP_BIAS = 1023;
9173   const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
9174   const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
9175   Label L_special_case, L_block1, L_exit;
9176   mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
9177   movq(dst, src);
9178   andq(dst, rtmp);
9179   sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
9180   mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
9181   subq(rtmp, dst);
9182   movq(rcx, rtmp);
9183   mov64(dst, MINUS_64);
9184   testq(rtmp, dst);
9185   jccb(Assembler::notEqual, L_special_case);
9186   movq(dst, src);
9187   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
9188   andq(dst, rtmp);
9189   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
9190   orq(dst, rtmp);
9191   movq(rtmp, src);
9192   testq(rtmp, rtmp);
9193   jccb(Assembler::greaterEqual, L_block1);
9194   negq(dst);
9195   bind(L_block1);
9196   sarq(dst); // shift right by cl (rcx holds the shift count)
9197   addq(dst, 0x1);
9198   sarq(dst, 0x1);
9199   jmp(L_exit);
9200   bind(L_special_case);
9201   convert_d2l(dst, src);
9202   bind(L_exit);
9203 }
9204 
9205 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9206   Label done;
9207   cvttsd2siq(dst, src);
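  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub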
9208   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9209   jccb(Assembler::notEqual, done);
9210   subptr(rsp, 8);
9211   movdbl(Address(rsp, 0), src);
9212   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9213   pop(dst);
9214   bind(done);
9215 }
9216 
9217 void MacroAssembler::cache_wb(Address line)
9218 {
9219   // 64-bit CPUs always support clflush
9220   assert(VM_Version::supports_clflush(), "clflush should be available");
9221   bool optimized = VM_Version::supports_clflushopt();
9222   bool no_evict = VM_Version::supports_clwb();
9223 
9224   // Prefer clwb (writeback without evict); otherwise
9225   // prefer clflushopt (potentially parallel writeback with evict);
9226   // otherwise fall back to clflush (serial writeback with evict).
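  //
  // clflushopt and clwb are only weakly ordered with respect to other stores,
  // so a post-flush cache_wbsync() issues an sfence when either of them is in
  // use; clflush is ordered strongly enough that no fence is required (see
  // cache_wbsync below).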
9227 
9228   if (optimized) {
9229     if (no_evict) {
9230       clwb(line);
9231     } else {
9232       clflushopt(line);
9233     }
9234   } else {
9235     // no need for fence when using CLFLUSH
9236     clflush(line);
9237   }
9238 }
9239 
9240 void MacroAssembler::cache_wbsync(bool is_pre)
9241 {
9242   assert(VM_Version::supports_clflush(), "clflush should be available");
9243   bool optimized = VM_Version::supports_clflushopt();
9244   bool no_evict = VM_Version::supports_clwb();
9245 
9246   // pick the correct implementation
9247 
9248   if (!is_pre && (optimized || no_evict)) {
9249     // need an sfence for post flush when using clflushopt or clwb
9250     // otherwise no need for any synchronization
9251 
9252     sfence();
9253   }
9254 }
9255 
9256 #endif // _LP64
9257 
9258 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9259   switch (cond) {
9260     // Note some conditions are synonyms for others
9261     case Assembler::zero:         return Assembler::notZero;
9262     case Assembler::notZero:      return Assembler::zero;
9263     case Assembler::less:         return Assembler::greaterEqual;
9264     case Assembler::lessEqual:    return Assembler::greater;
9265     case Assembler::greater:      return Assembler::lessEqual;
9266     case Assembler::greaterEqual: return Assembler::less;
9267     case Assembler::below:        return Assembler::aboveEqual;
9268     case Assembler::belowEqual:   return Assembler::above;
9269     case Assembler::above:        return Assembler::belowEqual;
9270     case Assembler::aboveEqual:   return Assembler::below;
9271     case Assembler::overflow:     return Assembler::noOverflow;
9272     case Assembler::noOverflow:   return Assembler::overflow;
9273     case Assembler::negative:     return Assembler::positive;
9274     case Assembler::positive:     return Assembler::negative;
9275     case Assembler::parity:       return Assembler::noParity;
9276     case Assembler::noParity:     return Assembler::parity;
9277   }
9278   ShouldNotReachHere(); return Assembler::overflow;
9279 }
9280 
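// SkipIfEqual is an RAII helper: the code emitted between its construction
// and destruction is branched over at runtime when the byte flag at flag_addr
// equals the given value.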
9281 SkipIfEqual::SkipIfEqual(
9282     MacroAssembler* masm, const bool* flag_addr, bool value) {
9283   _masm = masm;
9284   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9285   _masm->jcc(Assembler::equal, _label);
9286 }
9287 
9288 SkipIfEqual::~SkipIfEqual() {
9289   _masm->bind(_label);
9290 }
9291 
9292 // 32-bit Windows has its own fast-path implementation
9293 // of get_thread
9294 #if !defined(WIN32) || defined(_LP64)
9295 
9296 // This is simply a call to Thread::current(), with the caller-saved registers preserved around it.
9297 void MacroAssembler::get_thread(Register thread) {
9298   if (thread != rax) {
9299     push(rax);
9300   }
9301   LP64_ONLY(push(rdi);)
9302   LP64_ONLY(push(rsi);)
9303   push(rdx);
9304   push(rcx);
9305 #ifdef _LP64
9306   push(r8);
9307   push(r9);
9308   push(r10);
9309   push(r11);
9310 #endif
9311 
9312   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9313 
9314 #ifdef _LP64
9315   pop(r11);
9316   pop(r10);
9317   pop(r9);
9318   pop(r8);
9319 #endif
9320   pop(rcx);
9321   pop(rdx);
9322   LP64_ONLY(pop(rsi);)
9323   LP64_ONLY(pop(rdi);)
9324   if (thread != rax) {
9325     mov(thread, rax);
9326     pop(rax);
9327   }
9328 }
9329 
9330 
9331 #endif // !WIN32 || _LP64