1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef CPU_X86_MACROASSEMBLER_X86_HPP
  26 #define CPU_X86_MACROASSEMBLER_X86_HPP
  27 
  28 #include "asm/assembler.hpp"
  29 #include "asm/register.hpp"
  30 #include "code/vmreg.inline.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "utilities/macros.hpp"
  33 #include "runtime/rtmLocking.hpp"
  34 #include "runtime/vm_version.hpp"
  35 
  36 // MacroAssembler extends Assembler by frequently used macros.
  37 //
  38 // Instructions for which a 'better' code sequence exists depending
  39 // on arguments should also go in here.
  40 
  41 class MacroAssembler: public Assembler {
  42   friend class LIR_Assembler;
  43   friend class Runtime1;      // as_Address()
  44 
  45  public:
  46   // Support for VM calls
  47   //
  48   // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  49   // may customize this version by overriding it for its purposes (e.g., to save/restore
  50   // additional registers when doing a VM call).
  51 
  52   virtual void call_VM_leaf_base(
  53     address entry_point,               // the entry point
  54     int     number_of_arguments        // the number of arguments to pop after the call
  55   );
  56 
  57  protected:
  58   // This is the base routine called by the different versions of call_VM. The interpreter
  59   // may customize this version by overriding it for its purposes (e.g., to save/restore
  60   // additional registers when doing a VM call).
  61   //
  // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  66   virtual void call_VM_base(           // returns the register containing the thread upon return
  67     Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
  68     Register java_thread,              // the thread if computed before     ; use noreg otherwise
  69     Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
  70     address  entry_point,              // the entry point
  71     int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
  72     bool     check_exceptions          // whether to check for pending exceptions after return
  73   );
  74 
  75   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
  76 
  77   // helpers for FPU flag access
  78   // tmp is a temporary register, if none is available use noreg
  79   void save_rax   (Register tmp);
  80   void restore_rax(Register tmp);
  81 
  82  public:
  83 
  84   enum KlassDecodeMode {
  85     KlassDecodeNone,
  86     KlassDecodeZero,
  87     KlassDecodeXor,
  88     KlassDecodeAdd
  89   };
  90 
  91   // Return the current narrow Klass pointer decode mode. Initialized on first call.
  92   static KlassDecodeMode klass_decode_mode();
  93 
  94   // Given an arbitrary base address, return the KlassDecodeMode that would be used. Return KlassDecodeNone
  95   // if base address is not valid for encoding.
  96   static KlassDecodeMode klass_decode_mode_for_base(address base);
  97 
  98   // Returns a static string
  99   static const char* describe_klass_decode_mode(KlassDecodeMode mode);
 100 
 101  private:
 102 
 103   static KlassDecodeMode _klass_decode_mode;
 104 
 105  public:
 106   MacroAssembler(CodeBuffer* code) : Assembler(code) {}
 107 
  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe(Register java_thread);
  virtual void check_and_handle_earlyret(Register java_thread);
 113 
 114   Address as_Address(AddressLiteral adr);
 115   Address as_Address(ArrayAddress adr, Register rscratch);
 116 
 117   // Support for NULL-checks
 118   //
 119   // Generates code that causes a NULL OS exception if the content of reg is NULL.
 120   // If the accessed location is M[reg + offset] and the offset is known, provide the
 121   // offset. No explicit code generation is needed if the offset is within a certain
 122   // range (0 <= offset <= page_size).
 123 
 124   void null_check(Register reg, int offset = -1);
 125   static bool needs_explicit_null_check(intptr_t offset);
 126   static bool uses_implicit_null_check(void* address);
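
  // Example (illustrative only; the register and offset names are hypothetical): an access at a
  // small, known offset can rely on the implicit check, while a large or unknown offset needs
  // the explicit test emitted by null_check.
  //
  //   __ null_check(rbx, field_offset);            // emits nothing if 0 <= field_offset <= page_size
  //   __ movl(rax, Address(rbx, field_offset));    // the access itself traps on NULL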
 127 
 128   // Required platform-specific helpers for Label::patch_instructions.
 129   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
 130   void pd_patch_instruction(address branch, address target, const char* file, int line) {
 131     unsigned char op = branch[0];
 132     assert(op == 0xE8 /* call */ ||
 133         op == 0xE9 /* jmp */ ||
 134         op == 0xEB /* short jmp */ ||
 135         (op & 0xF0) == 0x70 /* short jcc */ ||
        (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
        (op == 0xC7 && branch[1] == 0xF8) /* xbegin */,
 138         "Invalid opcode at patch point");
 139 
 140     if (op == 0xEB || (op & 0xF0) == 0x70) {
 141       // short offset operators (jmp and jcc)
 142       char* disp = (char*) &branch[1];
 143       int imm8 = target - (address) &disp[1];
 144       guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
 145                 file == NULL ? "<NULL>" : file, line);
 146       *disp = imm8;
 147     } else {
 148       int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
 149       int imm32 = target - (address) &disp[1];
 150       *disp = imm32;
 151     }
 152   }
 153 
 154   // The following 4 methods return the offset of the appropriate move instruction
 155 
 156   // Support for fast byte/short loading with zero extension (depending on particular CPU)
 157   int load_unsigned_byte(Register dst, Address src);
 158   int load_unsigned_short(Register dst, Address src);
 159 
 160   // Support for fast byte/short loading with sign extension (depending on particular CPU)
 161   int load_signed_byte(Register dst, Address src);
 162   int load_signed_short(Register dst, Address src);
 163 
 164   // Support for sign-extension (hi:lo = extend_sign(lo))
 165   void extend_sign(Register hi, Register lo);
 166 
 167   // Load and store values by size and signed-ness
 168   void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
 169   void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);
 170 
 171   // Support for inc/dec with optimal instruction selection depending on value
 172 
 173   void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
 174   void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
 175 
 176   void decrementl(Address dst, int value = 1);
 177   void decrementl(Register reg, int value = 1);
 178 
 179   void decrementq(Register reg, int value = 1);
 180   void decrementq(Address dst, int value = 1);
 181 
 182   void incrementl(Address dst, int value = 1);
 183   void incrementl(Register reg, int value = 1);
 184 
 185   void incrementq(Register reg, int value = 1);
 186   void incrementq(Address dst, int value = 1);
 187 
 188   // Support optimal SSE move instructions.
 189   void movflt(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
 191     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
 192     else                       { movss (dst, src); return; }
 193   }
 194   void movflt(XMMRegister dst, Address src) { movss(dst, src); }
 195   void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 196   void movflt(Address dst, XMMRegister src) { movss(dst, src); }
 197 
 198   // Move with zero extension
 199   void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }
 200 
 201   void movdbl(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
 203     if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
 204     else                       { movsd (dst, src); return; }
 205   }
 206 
 207   void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 208 
 209   void movdbl(XMMRegister dst, Address src) {
 210     if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
 211     else                         { movlpd(dst, src); return; }
 212   }
 213   void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }
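
  // Illustrative use (register choices are hypothetical): prefer these wrappers over raw
  // movss/movsd so the best encoding is picked based on UseXmmRegToRegMoveAll and
  // UseXmmLoadAndClearUpper.
  //
  //   __ movflt(xmm0, xmm1);               // float reg-reg move, may become movaps
  //   __ movdbl(xmm2, Address(rsp, 8));    // double load, may become movsd or movlpd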
 214 
 215   void incrementl(AddressLiteral dst, Register rscratch = noreg);
 216   void incrementl(ArrayAddress   dst, Register rscratch);
 217 
 218   void incrementq(AddressLiteral dst, Register rscratch = noreg);
 219 
 220   // Alignment
 221   void align32();
 222   void align64();
 223   void align(int modulus);
 224   void align(int modulus, int target);
 225 
 226   void post_call_nop();
 227   // A 5 byte nop that is safe for patching (see patch_verified_entry)
 228   void fat_nop();
 229 
 230   // Stack frame creation/removal
 231   void enter();
 232   void leave();
 233 
  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
 235   // The pointer will be loaded into the thread register.
 236   void get_thread(Register thread);
 237 
 238 #ifdef _LP64
 239   // Support for argument shuffling
 240 
 241   // bias in bytes
 242   void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 243   void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 244   void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 245   void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 246   void move_ptr(VMRegPair src, VMRegPair dst);
 247   void object_move(OopMap* map,
 248                    int oop_handle_offset,
 249                    int framesize_in_slots,
 250                    VMRegPair src,
 251                    VMRegPair dst,
 252                    bool is_receiver,
 253                    int* receiver_offset);
 254 #endif // _LP64
 255 
 256   // Support for VM calls
 257   //
 258   // It is imperative that all calls into the VM are handled via the call_VM macros.
 259   // They make sure that the stack linkage is setup correctly. call_VM's correspond
 260   // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
 261 
 262 
 263   void call_VM(Register oop_result,
 264                address entry_point,
 265                bool check_exceptions = true);
 266   void call_VM(Register oop_result,
 267                address entry_point,
 268                Register arg_1,
 269                bool check_exceptions = true);
 270   void call_VM(Register oop_result,
 271                address entry_point,
 272                Register arg_1, Register arg_2,
 273                bool check_exceptions = true);
 274   void call_VM(Register oop_result,
 275                address entry_point,
 276                Register arg_1, Register arg_2, Register arg_3,
 277                bool check_exceptions = true);
 278 
 279   // Overloadings with last_Java_sp
 280   void call_VM(Register oop_result,
 281                Register last_java_sp,
 282                address entry_point,
 283                int number_of_arguments = 0,
 284                bool check_exceptions = true);
 285   void call_VM(Register oop_result,
 286                Register last_java_sp,
 287                address entry_point,
               Register arg_1,
               bool check_exceptions = true);
 290   void call_VM(Register oop_result,
 291                Register last_java_sp,
 292                address entry_point,
 293                Register arg_1, Register arg_2,
 294                bool check_exceptions = true);
 295   void call_VM(Register oop_result,
 296                Register last_java_sp,
 297                address entry_point,
 298                Register arg_1, Register arg_2, Register arg_3,
 299                bool check_exceptions = true);
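
  // Illustrative sketch of a VM call from generated code (the entry point and argument
  // register are hypothetical); the macro sets up the stack linkage and, by default, checks
  // for pending exceptions on return.
  //
  //   __ call_VM(rax,                                              // oop result, if any
  //              CAST_FROM_FN_PTR(address, SomeRuntime::entry),    // hypothetical entry point
  //              c_rarg1);                                         // single register argument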
 300 
 301   void get_vm_result  (Register oop_result, Register thread);
 302   void get_vm_result_2(Register metadata_result, Register thread);
 303 
 304   // These always tightly bind to MacroAssembler::call_VM_base
 305   // bypassing the virtual implementation
 306   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
 307   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
 308   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
 309   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
 310   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);
 311 
 312   void call_VM_leaf0(address entry_point);
 313   void call_VM_leaf(address entry_point,
 314                     int number_of_arguments = 0);
 315   void call_VM_leaf(address entry_point,
 316                     Register arg_1);
 317   void call_VM_leaf(address entry_point,
 318                     Register arg_1, Register arg_2);
 319   void call_VM_leaf(address entry_point,
 320                     Register arg_1, Register arg_2, Register arg_3);
 321 
 322   void call_VM_leaf(address entry_point,
 323                     Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 324 
 325   // These always tightly bind to MacroAssembler::call_VM_leaf_base
 326   // bypassing the virtual implementation
 327   void super_call_VM_leaf(address entry_point);
 328   void super_call_VM_leaf(address entry_point, Register arg_1);
 329   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
 330   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
 331   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 332 
 333   // last Java Frame (fills frame anchor)
 334   void set_last_Java_frame(Register thread,
 335                            Register last_java_sp,
 336                            Register last_java_fp,
 337                            address  last_java_pc,
 338                            Register rscratch);
 339 
 340   // thread in the default location (r15_thread on 64bit)
 341   void set_last_Java_frame(Register last_java_sp,
 342                            Register last_java_fp,
 343                            address  last_java_pc,
 344                            Register rscratch);
 345 
 346   void reset_last_Java_frame(Register thread, bool clear_fp);
 347 
 348   // thread in the default location (r15_thread on 64bit)
 349   void reset_last_Java_frame(bool clear_fp);
 350 
 351   // jobjects
 352   void clear_jweak_tag(Register possibly_jweak);
 353   void resolve_jobject(Register value, Register thread, Register tmp);
 354 
 355   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
 356   void c2bool(Register x);
 357 
 358   // C++ bool manipulation
 359 
 360   void movbool(Register dst, Address src);
 361   void movbool(Address dst, bool boolconst);
 362   void movbool(Address dst, Register src);
 363   void testbool(Register dst);
 364 
 365   void resolve_oop_handle(Register result, Register tmp);
 366   void resolve_weak_handle(Register result, Register tmp);
 367   void load_mirror(Register mirror, Register method, Register tmp);
 368   void load_method_holder_cld(Register rresult, Register rmethod);
 369 
 370   void load_method_holder(Register holder, Register method);
 371 
 372   // oop manipulations
 373   void load_klass(Register dst, Register src, Register tmp, bool null_check_src = false);
 374 #ifdef _LP64
 375   void load_nklass(Register dst, Register src);
 376 #else
 377   void store_klass(Register dst, Register src);
 378 #endif
 379 
 380   void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
 381                       Register tmp1, Register thread_tmp);
 382   void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
 383                        Register tmp1, Register tmp2, Register tmp3);
 384 
 385   void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
 386                      Register thread_tmp = noreg, DecoratorSet decorators = 0);
 387   void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
 388                               Register thread_tmp = noreg, DecoratorSet decorators = 0);
 389   void store_heap_oop(Address dst, Register src, Register tmp1 = noreg,
 390                       Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
 391 
 392   // Used for storing NULL. All other oop constants should be
 393   // stored using routines that take a jobject.
 394   void store_heap_oop_null(Address dst);
 395 
 396 #ifdef _LP64
 397   // This dummy is to prevent a call to store_heap_oop from
 398   // converting a zero (like NULL) into a Register by giving
 399   // the compiler two choices it can't resolve
 400 
 401   void store_heap_oop(Address dst, void* dummy);
 402 
 403   void encode_heap_oop(Register r);
 404   void decode_heap_oop(Register r);
 405   void encode_heap_oop_not_null(Register r);
 406   void decode_heap_oop_not_null(Register r);
 407   void encode_heap_oop_not_null(Register dst, Register src);
 408   void decode_heap_oop_not_null(Register dst, Register src);
 409 
 410   void set_narrow_oop(Register dst, jobject obj);
 411   void set_narrow_oop(Address dst, jobject obj);
 412   void cmp_narrow_oop(Register dst, jobject obj);
 413   void cmp_narrow_oop(Address dst, jobject obj);
 414 
 415   void encode_klass_not_null(Register r, Register tmp);
 416   void decode_klass_not_null(Register r, Register tmp);
 417   void encode_and_move_klass_not_null(Register dst, Register src);
 418   void decode_and_move_klass_not_null(Register dst, Register src);
 419   void set_narrow_klass(Register dst, Klass* k);
 420   void set_narrow_klass(Address dst, Klass* k);
 421   void cmp_narrow_klass(Register dst, Klass* k);
 422   void cmp_narrow_klass(Address dst, Klass* k);
 423 
 424   // if heap base register is used - reinit it with the correct value
 425   void reinit_heapbase();
 426 
 427   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 428 
 429 #endif // _LP64
 430 
 431   // Int division/remainder for Java
 432   // (as idivl, but checks for special case as described in JVM spec.)
 433   // returns idivl instruction offset for implicit exception handling
 434   int corrected_idivl(Register reg);
 435 
 436   // Long division/remainder for Java
 437   // (as idivq, but checks for special case as described in JVM spec.)
 438   // returns idivq instruction offset for implicit exception handling
 439   int corrected_idivq(Register reg);
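
  // Illustrative only (the variable name is hypothetical): the returned offset marks the idivl
  // instruction itself, so callers can record it for implicit division exception handling.
  //
  //   int div_offset = __ corrected_idivl(rcx);   // divides rax by rcx: quotient in rax, remainder in rdx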
 440 
 441   void int3();
 442 
 443   // Long operation macros for a 32bit cpu
 444   // Long negation for Java
 445   void lneg(Register hi, Register lo);
 446 
 447   // Long multiplication for Java
 448   // (destroys contents of eax, ebx, ecx and edx)
 449   void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y
 450 
 451   // Long shifts for Java
 452   // (semantics as described in JVM spec.)
 453   void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
 454   void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)
 455 
 456   // Long compare for Java
 457   // (semantics as described in JVM spec.)
 458   void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)
 459 
 460 
 461   // misc
 462 
 463   // Sign extension
 464   void sign_extend_short(Register reg);
 465   void sign_extend_byte(Register reg);
 466 
 467   // Division by power of 2, rounding towards 0
 468   void division_with_shift(Register reg, int shift_value);
 469 
 470 #ifndef _LP64
 471   // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
 472   //
 473   // CF (corresponds to C0) if x < y
 474   // PF (corresponds to C2) if unordered
 475   // ZF (corresponds to C3) if x = y
 476   //
 477   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 478   // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
 479   void fcmp(Register tmp);
 480   // Variant of the above which allows y to be further down the stack
 481   // and which only pops x and y if specified. If pop_right is
 482   // specified then pop_left must also be specified.
 483   void fcmp(Register tmp, int index, bool pop_left, bool pop_right);
 484 
 485   // Floating-point comparison for Java
 486   // Compares the top-most stack entries on the FPU stack and stores the result in dst.
 487   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 488   // (semantics as described in JVM spec.)
 489   void fcmp2int(Register dst, bool unordered_is_less);
 490   // Variant of the above which allows y to be further down the stack
 491   // and which only pops x and y if specified. If pop_right is
 492   // specified then pop_left must also be specified.
 493   void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);
 494 
 495   // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
 496   // tmp is a temporary register, if none is available use noreg
 497   void fremr(Register tmp);
 498 
 499   // only if +VerifyFPU
 500   void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
#endif // !_LP64
 502 
 503   // dst = c = a * b + c
 504   void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 505   void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 506 
 507   void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 508   void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 509   void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 510   void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 511 
 512 
 513   // same as fcmp2int, but using SSE2
 514   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 515   void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 516 
 517   // branch to L if FPU flag C2 is set/not set
 518   // tmp is a temporary register, if none is available use noreg
 519   void jC2 (Register tmp, Label& L);
 520   void jnC2(Register tmp, Label& L);
 521 
 522   // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
 523   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 524   void load_float(Address src);
 525 
 526   // Store float value to 'address'. If UseSSE >= 1, the value is stored
 527   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 528   void store_float(Address dst);
 529 
 530   // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
 531   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 532   void load_double(Address src);
 533 
 534   // Store double value to 'address'. If UseSSE >= 2, the value is stored
 535   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 536   void store_double(Address dst);
 537 
 538 #ifndef _LP64
 539   // Pop ST (ffree & fincstp combined)
 540   void fpop();
 541 
 542   void empty_FPU_stack();
 543 #endif // !_LP64
 544 
 545   void push_IU_state();
 546   void pop_IU_state();
 547 
 548   void push_FPU_state();
 549   void pop_FPU_state();
 550 
 551   void push_CPU_state();
 552   void pop_CPU_state();
 553 
 554   void push_cont_fastpath();
 555   void pop_cont_fastpath();
 556 
 557   void inc_held_monitor_count();
 558   void dec_held_monitor_count();
 559 
 560   DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)
 561 
  // Round up to a multiple of modulus (which is expected to be a power of two)
 563   void round_to(Register reg, int modulus);
 564 
 565 private:
  // General purpose and XMM registers potentially clobbered by native code; there
  // is no need for FPU or AVX opmask related methods because C1 and the interpreter
  // always save/restore the FPU state as a whole and do not care about the
  // AVX-512 opmask registers.
 570   static RegSet call_clobbered_gp_registers();
 571   static XMMRegSet call_clobbered_xmm_registers();
 572 
 573   void push_set(XMMRegSet set, int offset);
 574   void pop_set(XMMRegSet set, int offset);
 575 
 576 public:
 577   void push_set(RegSet set, int offset = -1);
 578   void pop_set(RegSet set, int offset = -1);
 579 
 580   // Push and pop everything that might be clobbered by a native
 581   // runtime call.
 582   // Only save the lower 64 bits of each vector register.
 583   // Additional registers can be excluded in a passed RegSet.
 584   void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
 585   void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);
 586 
 587   void push_call_clobbered_registers(bool save_fpu = true) {
 588     push_call_clobbered_registers_except(RegSet(), save_fpu);
 589   }
 590   void pop_call_clobbered_registers(bool restore_fpu = true) {
 591     pop_call_clobbered_registers_except(RegSet(), restore_fpu);
 592   }
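
  // Illustrative pairing (the excluded register and entry point are hypothetical): save
  // everything a native runtime call may clobber except a register whose value we still need.
  //
  //   __ push_call_clobbered_registers_except(RegSet::of(rax));
  //   __ call(RuntimeAddress(some_helper_entry));
  //   __ pop_call_clobbered_registers_except(RegSet::of(rax));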
 593 
 594   // allocation
 595   void tlab_allocate(
 596     Register thread,                   // Current thread
 597     Register obj,                      // result: pointer to object after successful allocation
 598     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
 599     int      con_size_in_bytes,        // object size in bytes if   known at compile time
 600     Register t1,                       // temp register
 601     Register t2,                       // temp register
 602     Label&   slow_case                 // continuation point if fast allocation fails
 603   );
 604   void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
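
  // Illustrative fast-path allocation (labels, registers and the size are hypothetical): on
  // failure control continues at slow_case, where the caller typically calls into the runtime.
  //
  //   Label slow_case, done;
  //   __ tlab_allocate(r15_thread, rax, noreg, instance_size_in_bytes, rbx, rcx, slow_case);
  //   __ jmp(done);
  //   __ bind(slow_case);
  //   // ... call the runtime allocation entry ...
  //   __ bind(done);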
 605 
 606   // interface method calling
 607   void lookup_interface_method(Register recv_klass,
 608                                Register intf_klass,
 609                                RegisterOrConstant itable_index,
 610                                Register method_result,
 611                                Register scan_temp,
 612                                Label& no_such_interface,
 613                                bool return_method = true);
 614 
 615   // virtual method calling
 616   void lookup_virtual_method(Register recv_klass,
 617                              RegisterOrConstant vtable_index,
 618                              Register method_result);
 619 
 620   // Test sub_klass against super_klass, with fast and slow paths.
 621 
 622   // The fast path produces a tri-state answer: yes / no / maybe-slow.
 623   // One of the three labels can be NULL, meaning take the fall-through.
 624   // If super_check_offset is -1, the value is loaded up from super_klass.
 625   // No registers are killed, except temp_reg.
 626   void check_klass_subtype_fast_path(Register sub_klass,
 627                                      Register super_klass,
 628                                      Register temp_reg,
 629                                      Label* L_success,
 630                                      Label* L_failure,
 631                                      Label* L_slow_path,
 632                 RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
 633 
 634   // The rest of the type check; must be wired to a corresponding fast path.
 635   // It does not repeat the fast path logic, so don't use it standalone.
 636   // The temp_reg and temp2_reg can be noreg, if no temps are available.
 637   // Updates the sub's secondary super cache as necessary.
 638   // If set_cond_codes, condition codes will be Z on success, NZ on failure.
 639   void check_klass_subtype_slow_path(Register sub_klass,
 640                                      Register super_klass,
 641                                      Register temp_reg,
 642                                      Register temp2_reg,
 643                                      Label* L_success,
 644                                      Label* L_failure,
 645                                      bool set_cond_codes = false);
 646 
 647   // Simplified, combined version, good for typical uses.
 648   // Falls through on failure.
 649   void check_klass_subtype(Register sub_klass,
 650                            Register super_klass,
 651                            Register temp_reg,
 652                            Label& L_success);
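
  // Illustrative use of the combined check (labels and registers are hypothetical); it jumps
  // to L_success when sub_klass is a subtype of super_klass and falls through otherwise.
  //
  //   Label is_subtype;
  //   __ check_klass_subtype(rsi, rax, rcx, is_subtype);
  //   // ... fall-through: handle "not a subtype" ...
  //   __ bind(is_subtype);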
 653 
 654   void clinit_barrier(Register klass,
 655                       Register thread,
 656                       Label* L_fast_path = NULL,
 657                       Label* L_slow_path = NULL);
 658 
 659   // method handles (JSR 292)
 660   Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
 661 
 662   // Debugging
 663 
 664   // only if +VerifyOops
 665   void _verify_oop(Register reg, const char* s, const char* file, int line);
 666   void _verify_oop_addr(Address addr, const char* s, const char* file, int line);
 667 
 668   void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
 669     if (VerifyOops) {
 670       _verify_oop(reg, s, file, line);
 671     }
 672   }
 673   void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
 674     if (VerifyOops) {
 675       _verify_oop_addr(reg, s, file, line);
 676     }
 677   }
 678 
 679   // TODO: verify method and klass metadata (compare against vptr?)
 680   void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
 681   void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}
 682 
 683 #define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
 684 #define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
 685 #define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
 686 #define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
 687 #define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
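
  // Illustrative use of the macros above (the register choice is hypothetical); they expand to
  // _verify_oop_checked and therefore emit nothing unless +VerifyOops.
  //
  //   __ verify_oop(rax);
  //   __ verify_oop_msg(rax, receiver);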
 688 
 689   // Verify or restore cpu control state after JNI call
 690   void restore_cpu_control_state_after_jni(Register rscratch);
 691 
 692   // prints msg, dumps registers and stops execution
 693   void stop(const char* msg);
 694 
 695   // prints msg and continues
 696   void warn(const char* msg);
 697 
 698   // dumps registers and other state
 699   void print_state();
 700 
 701   static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
 702   static void debug64(char* msg, int64_t pc, int64_t regs[]);
 703   static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
 704   static void print_state64(int64_t pc, int64_t regs[]);
 705 
 706   void os_breakpoint();
 707 
 708   void untested()                                { stop("untested"); }
 709 
 710   void unimplemented(const char* what = "");
 711 
 712   void should_not_reach_here()                   { stop("should not reach here"); }
 713 
 714   void print_CPU_state();
 715 
 716   // Stack overflow checking
 717   void bang_stack_with_offset(int offset) {
 718     // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with positive offset");
 720     movl(Address(rsp, (-offset)), rax);
 721   }
 722 
 723   // Writes to stack successive pages until offset reached to check for
 724   // stack overflow + shadow pages.  Also, clobbers tmp
 725   void bang_stack_size(Register size, Register tmp);
 726 
 727   // Check for reserved stack access in method being exited (for JIT)
 728   void reserved_stack_check();
 729 
 730   void safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod);
 731 
 732   void verify_tlab();
 733 
 734   static Condition negate_condition(Condition cond);
 735 
  // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
  // operands. In general the names are modified to avoid hiding the instruction in Assembler
  // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
  // here in MacroAssembler. The major exception to this rule is call.
 740 
 741   // Arithmetics
 742 
 743 
 744   void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
 745   void addptr(Address dst, Register src);
 746 
 747   void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
 748   void addptr(Register dst, int32_t src);
 749   void addptr(Register dst, Register src);
 750   void addptr(Register dst, RegisterOrConstant src) {
 751     if (src.is_constant()) addptr(dst, src.as_constant());
 752     else                   addptr(dst, src.as_register());
 753   }
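
  // Illustrative use (conceptual): the *ptr helpers keep callers word-size neutral by
  // selecting the 64-bit form on LP64 and the 32-bit form otherwise.
  //
  //   __ addptr(rsp, 2 * wordSize);   // addq on 64-bit, addl on 32-bit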
 754 
 755   void andptr(Register dst, int32_t src);
 756   void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }
 757 
 758   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 759 
 760   // renamed to drag out the casting of address to int32_t/intptr_t
 761   void cmp32(Register src1, int32_t imm);
 762 
 763   void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
 764   // compare reg - mem, or reg - &mem
 765   void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);
 766 
 767   void cmp32(Register src1, Address src2);
 768 
 769 #ifndef _LP64
 770   void cmpklass(Address dst, Metadata* obj);
 771   void cmpklass(Register dst, Metadata* obj);
 772   void cmpoop(Address dst, jobject obj);
#endif // !_LP64
 774 
 775   void cmpoop(Register src1, Register src2);
 776   void cmpoop(Register src1, Address src2);
 777   void cmpoop(Register dst, jobject obj, Register rscratch);
 778 
  // NOTE: src2 must be the lval. This is NOT a mem-mem compare.
 780   void cmpptr(Address src1, AddressLiteral src2, Register rscratch);
 781 
 782   void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);
 783 
 784   void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 785   void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 786   // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 787 
 788   void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 789   void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 790 
  // cmp64 to avoid hiding cmpq
 792   void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);
 793 
 794   void cmpxchgptr(Register reg, Address adr);
 795 
 796   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 797 
 798   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
 799   void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }
 800 
 801 
 802   void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }
 803 
 804   void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }
 805 
 806   void shlptr(Register dst, int32_t shift);
 807   void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }
 808 
 809   void shrptr(Register dst, int32_t shift);
 810   void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }
 811 
 812   void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
 813   void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }
 814 
 815   void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 816 
 817   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 818   void subptr(Register dst, int32_t src);
 819   // Force generation of a 4 byte immediate value even if it fits into 8bit
 820   void subptr_imm32(Register dst, int32_t src);
 821   void subptr(Register dst, Register src);
 822   void subptr(Register dst, RegisterOrConstant src) {
 823     if (src.is_constant()) subptr(dst, (int) src.as_constant());
 824     else                   subptr(dst,       src.as_register());
 825   }
 826 
 827   void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 828   void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 829 
 830   void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 831   void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 832 
 833   void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }
 834 
 835 
 836 
 837   // Helper functions for statistics gathering.
  // Conditionally (atomically, on MPs) increments the counter at the passed address, preserving condition codes.
 839   void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
 840   // Unconditional atomic increment.
 841   void atomic_incl(Address counter_addr);
 842   void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
 843 #ifdef _LP64
 844   void atomic_incq(Address counter_addr);
 845   void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
 846 #endif
 847   void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { LP64_ONLY(atomic_incq(counter_addr, rscratch)) NOT_LP64(atomic_incl(counter_addr, rscratch)) ; }
 848   void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
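
  // Illustrative counter update (the counter symbols are hypothetical): ExternalAddress wraps
  // the counter's address as an AddressLiteral; rscratch is only needed when the address is
  // not reachable with a 32-bit displacement.
  //
  //   __ atomic_incl(ExternalAddress((address)&_some_counter), rscratch1);
  //   __ cond_inc32(Assembler::equal, ExternalAddress((address)&_hit_counter));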
 849 
 850   void lea(Register dst, Address        adr) { Assembler::lea(dst, adr); }
 851   void lea(Register dst, AddressLiteral adr);
 852   void lea(Address  dst, AddressLiteral adr, Register rscratch);
 853 
 854   void leal32(Register dst, Address src) { leal(dst, src); }
 855 
 856   // Import other testl() methods from the parent class or else
 857   // they will be hidden by the following overriding declaration.
 858   using Assembler::testl;
 859   void testl(Address dst, int32_t imm32);
 860   void testl(Register dst, int32_t imm32);
 861   void testl(Register dst, AddressLiteral src); // requires reachable address
 862   using Assembler::testq;
 863   void testq(Address dst, int32_t imm32);
 864   void testq(Register dst, int32_t imm32);
 865 
 866   void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 867   void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 868   void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 869   void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }
 870 
 871   void testptr(Register src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 872   void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
 873   void testptr(Register src1, Register src2);
 874 
 875   void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 876   void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 877 
 878   // Calls
 879 
 880   void call(Label& L, relocInfo::relocType rtype);
 881   void call(Register entry);
 882   void call(Address addr) { Assembler::call(addr); }
 883 
  // NOTE: this call transfers to the effective address of entry, NOT
  // to the address contained at entry, because that is more natural
  // for jumps/calls.
 887   void call(AddressLiteral entry, Register rscratch = rax);
 888 
 889   // Emit the CompiledIC call idiom
 890   void ic_call(address entry, jint method_index = 0);
 891 
 892   void emit_static_call_stub();
 893 
 894   // Jumps
 895 
  // NOTE: these jumps transfer to the effective address of dst, NOT
  // to the address contained at dst, because that is more natural
  // for jumps/calls.
 899   void jump(AddressLiteral dst, Register rscratch = noreg);
 900 
 901   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
 902 
  // 32bit can do a case table jump in one instruction but we no longer allow the base
  // to be installed in the Address class. This jump will transfer to the address
  // contained in the location described by entry (not to the address of entry).
 906   void jump(ArrayAddress entry, Register rscratch);
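
  // Illustrative only (the second stub name is hypothetical): RuntimeAddress yields an
  // AddressLiteral, so these transfer to the literal target itself.
  //
  //   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  //   __ call(RuntimeAddress(some_stub_entry), rscratch1);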
 907 
 908   // Floating
 909 
 910   void push_f(XMMRegister r);
 911   void pop_f(XMMRegister r);
 912   void push_d(XMMRegister r);
 913   void pop_d(XMMRegister r);
 914 
 915   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
 916   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
 917   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 918 
 919   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
 920   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
 921   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 922 
 923   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
 924   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
 925   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 926 
 927   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
 928   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
 929   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 930 
 931 #ifndef _LP64
 932   void fadd_s(Address        src) { Assembler::fadd_s(src); }
 933   void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }
 934 
 935   void fldcw(Address        src) { Assembler::fldcw(src); }
 936   void fldcw(AddressLiteral src);
 937 
 938   void fld_s(int index)          { Assembler::fld_s(index); }
 939   void fld_s(Address        src) { Assembler::fld_s(src); }
 940   void fld_s(AddressLiteral src);
 941 
 942   void fld_d(Address        src) { Assembler::fld_d(src); }
 943   void fld_d(AddressLiteral src);
 944 
 945   void fld_x(Address        src) { Assembler::fld_x(src); }
 946   void fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); }
 947 
 948   void fmul_s(Address        src) { Assembler::fmul_s(src); }
 949   void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
 950 #endif // !_LP64
 951 
 952   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
 953   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
 954 
 955 #ifdef _LP64
 956  private:
 957   void sha256_AVX2_one_round_compute(
 958     Register  reg_old_h,
 959     Register  reg_a,
 960     Register  reg_b,
 961     Register  reg_c,
 962     Register  reg_d,
 963     Register  reg_e,
 964     Register  reg_f,
 965     Register  reg_g,
 966     Register  reg_h,
 967     int iter);
 968   void sha256_AVX2_four_rounds_compute_first(int start);
 969   void sha256_AVX2_four_rounds_compute_last(int start);
 970   void sha256_AVX2_one_round_and_sched(
 971         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
 972         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
 973         XMMRegister xmm_2,     /* ymm6 */
 974         XMMRegister xmm_3,     /* ymm7 */
 975         Register    reg_a,      /* == eax on 0 iteration, then rotate 8 register right on each next iteration */
 976         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
 977         Register    reg_c,      /* edi */
 978         Register    reg_d,      /* esi */
 979         Register    reg_e,      /* r8d */
 980         Register    reg_f,      /* r9d */
 981         Register    reg_g,      /* r10d */
 982         Register    reg_h,      /* r11d */
 983         int iter);
 984 
 985   void addm(int disp, Register r1, Register r2);
 986 
 987   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
 988                                      Register e, Register f, Register g, Register h, int iteration);
 989 
 990   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
 991                                           Register a, Register b, Register c, Register d, Register e, Register f,
 992                                           Register g, Register h, int iteration);
 993 
 994   void addmq(int disp, Register r1, Register r2);
 995  public:
 996   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
 997                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
 998                    Register buf, Register state, Register ofs, Register limit, Register rsp,
 999                    bool multi_block, XMMRegister shuf_mask);
1000   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1001                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1002                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1003                    XMMRegister shuf_mask);
1004 #endif // _LP64
1005 
1006   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1007                 bool multi_block);
1008 
1009   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1010                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1011                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1012                  bool multi_block);
1013 
1014 #ifdef _LP64
1015   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1016                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1017                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1018                    bool multi_block, XMMRegister shuf_mask);
1019 #else
1020   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1021                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1022                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1023                    bool multi_block);
1024 #endif
1025 
1026   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1027                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1028                 Register rax, Register rcx, Register rdx, Register tmp);
1029 
1030 #ifndef _LP64
1031  private:
1032   // Initialized in macroAssembler_x86_constants.cpp
1033   static address ONES;
1034   static address L_2IL0FLOATPACKET_0;
1035   static address PI4_INV;
1036   static address PI4X3;
1037   static address PI4X4;
1038 
1039  public:
1040   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1041                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1042                 Register rax, Register rcx, Register rdx, Register tmp1);
1043 
1044   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1045                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1046                 Register rax, Register rcx, Register rdx, Register tmp);
1047 
1048   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1049                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1050                 Register rdx, Register tmp);
1051 
1052   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1053                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1054                 Register rax, Register rbx, Register rdx);
1055 
1056   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1057                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1058                 Register rax, Register rcx, Register rdx, Register tmp);
1059 
1060   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1061                         Register edx, Register ebx, Register esi, Register edi,
1062                         Register ebp, Register esp);
1063 
1064   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1065                          Register esi, Register edi, Register ebp, Register esp);
1066 
1067   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1068                         Register edx, Register ebx, Register esi, Register edi,
1069                         Register ebp, Register esp);
1070 
1071   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1072                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1073                 Register rax, Register rcx, Register rdx, Register tmp);
1074 #endif // !_LP64
1075 
1076 private:
1077 
1078   // these are private because users should be doing movflt/movdbl
1079 
1080   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1081   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1082   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1083   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1084 
1085   void movlpd(XMMRegister dst, Address        src) {Assembler::movlpd(dst, src); }
1086   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1087 
1088 public:
1089 
1090   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1091   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1092   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1093 
1094   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1095   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1096   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1097 
1098   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1099   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1100   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1101 
1102   using Assembler::vbroadcastsd;
1103   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1104 
1105   using Assembler::vbroadcastss;
1106   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1107 
1108   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1109   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1110   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1111 
1112   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1113   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1114   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1115 
1116   // Move Unaligned Double Quadword
1117   void movdqu(Address     dst, XMMRegister    src);
1118   void movdqu(XMMRegister dst, XMMRegister    src);
1119   void movdqu(XMMRegister dst, Address        src);
1120   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1121 
1122   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1123   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1124   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1125   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1126   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1127   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1128 
1129   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1130   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1131   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1132   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1133   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1134   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1135 
  // Safe move operation: lowers to 16-bit moves on targets that support only the
  // AVX512F feature and to 64-bit moves on targets that also support AVX512BW.
1138   void kmov(Address  dst, KRegister src);
1139   void kmov(KRegister dst, Address src);
1140   void kmov(KRegister dst, KRegister src);
1141   void kmov(Register dst, KRegister src);
1142   void kmov(KRegister dst, Register src);
1143 
1144   using Assembler::movddup;
1145   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1146 
1147   using Assembler::vmovddup;
1148   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1149 
1150   // AVX Unaligned forms
1151   void vmovdqu(Address     dst, XMMRegister    src);
1152   void vmovdqu(XMMRegister dst, Address        src);
1153   void vmovdqu(XMMRegister dst, XMMRegister    src);
1154   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1155   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1156 
1157   // AVX512 Unaligned
1158   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1159   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1160 
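       // The register-to-register forms of evmovdqu{b,w,l,q} below elide the move
       // when it would be a no-op: destination and source are the same register
       // and, for the masked forms, the mask is k0 (no masking).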
1161   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1162   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1163 
1164   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1165     if (dst->encoding() != src->encoding() || mask != k0)  {
1166       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1167     }
1168   }
1169   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1170   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1171   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1172 
1173   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1174   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1175 
1176   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1177     if (dst->encoding() != src->encoding() || mask != k0) {
1178       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1179     }
1180   }
1181   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1182   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1183   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1184 
1185   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1186      if (dst->encoding() != src->encoding()) {
1187        Assembler::evmovdqul(dst, src, vector_len);
1188      }
1189   }
1190   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1191   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1192 
1193   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1194     if (dst->encoding() != src->encoding() || mask != k0)  {
1195       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1196     }
1197   }
1198   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1199   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1200   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1201 
1202   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1203     if (dst->encoding() != src->encoding()) {
1204       Assembler::evmovdquq(dst, src, vector_len);
1205     }
1206   }
1207   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1208   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1209   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1210 
1211   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1212     if (dst->encoding() != src->encoding() || mask != k0) {
1213       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1214     }
1215   }
1216   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1217   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1218   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1219 
1220   // Move Aligned Double Quadword
1221   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1222   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1223   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1224 
1225   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1226   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1227   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1228   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1229 
1230   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1231   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1232   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1233 
1234   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1235   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1236   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1237 
1238   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1239   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1240   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1241 
1242   // Carry-Less Multiplication Quadword
1243   void pclmulldq(XMMRegister dst, XMMRegister src) {
1244     // 0x00 - multiply lower 64 bits [0:63]
1245     Assembler::pclmulqdq(dst, src, 0x00);
1246   }
1247   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1248     // 0x11 - multiply upper 64 bits [64:127]
1249     Assembler::pclmulqdq(dst, src, 0x11);
1250   }
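       // In the wrappers above, the pclmulqdq imm8 selects which 64-bit half of each
       // operand enters the multiplication: bit 0 picks the half of the destination
       // operand, bit 4 the half of 'src'.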
1251 
1252   void pcmpeqb(XMMRegister dst, XMMRegister src);
1253   void pcmpeqw(XMMRegister dst, XMMRegister src);
1254 
1255   void pcmpestri(XMMRegister dst, Address src, int imm8);
1256   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1257 
1258   void pmovzxbw(XMMRegister dst, XMMRegister src);
1259   void pmovzxbw(XMMRegister dst, Address src);
1260 
1261   void pmovmskb(Register dst, XMMRegister src);
1262 
1263   void ptest(XMMRegister dst, XMMRegister src);
1264 
1265   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1266   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1267   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1268 
1269   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1270   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1271   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1272 
1273   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1274   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1275   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1276 
1277   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1278   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1279   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1280 
1281   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1282   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1283   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1284 
1285   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1286   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1287   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1288 
1289   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1290   void xorpd(XMMRegister dst, XMMRegister    src);
1291   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1292   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1293 
1294   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1295   void xorps(XMMRegister dst, XMMRegister    src);
1296   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1297   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1298 
1299   // Shuffle Bytes
1300   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1301   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
1302   void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1303   // AVX 3-operand instructions
1304 
1305   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1306   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1307   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1308 
1309   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1310   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1311   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1312 
1313   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1314   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1315 
1316   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1317   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1318   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1319 
1320   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1321   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1322 
1323   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1324   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1325   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1326 
1327   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1328   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1329   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1330 
1331   using Assembler::vpbroadcastd;
1332   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1333 
1334   using Assembler::vpbroadcastq;
1335   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1336 
1337   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1338 
1339   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1340   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1341 
1342   // Vector compares
1343   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1344     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1345   }
1346   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1347 
1348   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1349     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1350   }
1351   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1352 
1353   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1354     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1355   }
1356   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1357 
1358   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1359     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1360   }
1361   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1362 
1363   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1364 
1365   // Emit comparison instruction for the specified comparison predicate.
1366   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1367   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1368 
1369   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1370   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1371 
1372   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1373 
1374   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1375   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1376 
1377   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1378   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1379   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1380 
1381   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1382   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1383 
1384   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1385   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1386 
1387   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1388   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1389 
1390   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1391   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1392 
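       // The wrappers below choose between a uniform shift (every lane shifted by the
       // same count in 'src') and the AVX-512 variable shift (each lane shifted by the
       // corresponding element of 'src'), depending on 'is_varshift'.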
1393   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1394     if (!is_varshift) {
1395       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1396     } else {
1397       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1398     }
1399   }
1400   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1401     if (!is_varshift) {
1402       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1403     } else {
1404       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1405     }
1406   }
1407   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1408     if (!is_varshift) {
1409       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1410     } else {
1411       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1412     }
1413   }
1414   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1415     if (!is_varshift) {
1416       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1417     } else {
1418       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1419     }
1420   }
1421   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1422     if (!is_varshift) {
1423       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1424     } else {
1425       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1426     }
1427   }
1428   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1429     if (!is_varshift) {
1430       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1431     } else {
1432       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1433     }
1434   }
1435   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1436     if (!is_varshift) {
1437       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1438     } else {
1439       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1440     }
1441   }
1442   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1443     if (!is_varshift) {
1444       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1445     } else {
1446       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1447     }
1448   }
1449   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1450     if (!is_varshift) {
1451       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1452     } else {
1453       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1454     }
1455   }
1456 
1457   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1458   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1459   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1460   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1461 
1462   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1463   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1464 
1465   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1466   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1467 
1468   void vptest(XMMRegister dst, XMMRegister src);
1469   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1470 
1471   void punpcklbw(XMMRegister dst, XMMRegister src);
1472   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1473 
1474   void pshufd(XMMRegister dst, Address src, int mode);
1475   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1476 
1477   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1478   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1479 
1480   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1481   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1482   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1483 
1484   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1485   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1486   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1487 
1488   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1489 
1490   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1491   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1492   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1493 
1494   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1495   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1496   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1497 
1498   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1499   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1500   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1501 
1502   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1503   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1504   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1505 
1506   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1507   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1508   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1509 
1510   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1511   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1512   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1513 
1514   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1515   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1516 
1517   // AVX Vector instructions
1518 
1519   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1520   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1521   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1522 
1523   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1524   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1525   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1526 
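       // Without AVX2 the 256-bit integer form of vpxor is unavailable, so the
       // wrappers below fall back to vxorpd, which computes the same bitwise XOR
       // (only the instruction's data domain differs).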
1527   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1528     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1529       Assembler::vpxor(dst, nds, src, vector_len);
1530     else
1531       Assembler::vxorpd(dst, nds, src, vector_len);
1532   }
1533   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1534     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1535       Assembler::vpxor(dst, nds, src, vector_len);
1536     else
1537       Assembler::vxorpd(dst, nds, src, vector_len);
1538   }
1539   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1540 
1541   // Simple version for AVX2 256bit vectors
1542   void vpxor(XMMRegister dst, XMMRegister src) {
1543     assert(UseAVX >= 2, "Should be at least AVX2");
1544     Assembler::vpxor(dst, dst, src, AVX_256bit);
1545   }
1546   void vpxor(XMMRegister dst, Address src) {
1547     assert(UseAVX >= 2, "Should be at least AVX2");
1548     Assembler::vpxor(dst, dst, src, AVX_256bit);
1549   }
1550 
1551   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1552   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1553 
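       // The 128-bit lane insert/extract helpers below pick an encoding by CPU
       // feature: AVX-512 targets without VL use the EVEX 32x4 forms, AVX2 targets
       // use the native 128-bit integer forms, and plain AVX falls back to the
       // floating-point-domain forms, which move the same bits.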
1554   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1555     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1556       Assembler::vinserti32x4(dst, nds, src, imm8);
1557     } else if (UseAVX > 1) {
1558       // vinserti128 is available only in AVX2
1559       Assembler::vinserti128(dst, nds, src, imm8);
1560     } else {
1561       Assembler::vinsertf128(dst, nds, src, imm8);
1562     }
1563   }
1564 
1565   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1566     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1567       Assembler::vinserti32x4(dst, nds, src, imm8);
1568     } else if (UseAVX > 1) {
1569       // vinserti128 is available only in AVX2
1570       Assembler::vinserti128(dst, nds, src, imm8);
1571     } else {
1572       Assembler::vinsertf128(dst, nds, src, imm8);
1573     }
1574   }
1575 
1576   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1577     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1578       Assembler::vextracti32x4(dst, src, imm8);
1579     } else if (UseAVX > 1) {
1580       // vextracti128 is available only in AVX2
1581       Assembler::vextracti128(dst, src, imm8);
1582     } else {
1583       Assembler::vextractf128(dst, src, imm8);
1584     }
1585   }
1586 
1587   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1588     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1589       Assembler::vextracti32x4(dst, src, imm8);
1590     } else if (UseAVX > 1) {
1591       // vextracti128 is available only in AVX2
1592       Assembler::vextracti128(dst, src, imm8);
1593     } else {
1594       Assembler::vextractf128(dst, src, imm8);
1595     }
1596   }
1597 
1598   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1599   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1600     vinserti128(dst, dst, src, 1);
1601   }
1602   void vinserti128_high(XMMRegister dst, Address src) {
1603     vinserti128(dst, dst, src, 1);
1604   }
1605   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1606     vextracti128(dst, src, 1);
1607   }
1608   void vextracti128_high(Address dst, XMMRegister src) {
1609     vextracti128(dst, src, 1);
1610   }
1611 
1612   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1613     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1614       Assembler::vinsertf32x4(dst, dst, src, 1);
1615     } else {
1616       Assembler::vinsertf128(dst, dst, src, 1);
1617     }
1618   }
1619 
1620   void vinsertf128_high(XMMRegister dst, Address src) {
1621     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1622       Assembler::vinsertf32x4(dst, dst, src, 1);
1623     } else {
1624       Assembler::vinsertf128(dst, dst, src, 1);
1625     }
1626   }
1627 
1628   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1629     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1630       Assembler::vextractf32x4(dst, src, 1);
1631     } else {
1632       Assembler::vextractf128(dst, src, 1);
1633     }
1634   }
1635 
1636   void vextractf128_high(Address dst, XMMRegister src) {
1637     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1638       Assembler::vextractf32x4(dst, src, 1);
1639     } else {
1640       Assembler::vextractf128(dst, src, 1);
1641     }
1642   }
1643 
1644   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1645   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1646     Assembler::vinserti64x4(dst, dst, src, 1);
1647   }
1648   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1649     Assembler::vinsertf64x4(dst, dst, src, 1);
1650   }
1651   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1652     Assembler::vextracti64x4(dst, src, 1);
1653   }
1654   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1655     Assembler::vextractf64x4(dst, src, 1);
1656   }
1657   void vextractf64x4_high(Address dst, XMMRegister src) {
1658     Assembler::vextractf64x4(dst, src, 1);
1659   }
1660   void vinsertf64x4_high(XMMRegister dst, Address src) {
1661     Assembler::vinsertf64x4(dst, dst, src, 1);
1662   }
1663 
1664   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1665   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1666     vinserti128(dst, dst, src, 0);
1667   }
1668   void vinserti128_low(XMMRegister dst, Address src) {
1669     vinserti128(dst, dst, src, 0);
1670   }
1671   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1672     vextracti128(dst, src, 0);
1673   }
1674   void vextracti128_low(Address dst, XMMRegister src) {
1675     vextracti128(dst, src, 0);
1676   }
1677 
1678   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1679     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1680       Assembler::vinsertf32x4(dst, dst, src, 0);
1681     } else {
1682       Assembler::vinsertf128(dst, dst, src, 0);
1683     }
1684   }
1685 
1686   void vinsertf128_low(XMMRegister dst, Address src) {
1687     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1688       Assembler::vinsertf32x4(dst, dst, src, 0);
1689     } else {
1690       Assembler::vinsertf128(dst, dst, src, 0);
1691     }
1692   }
1693 
1694   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1695     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1696       Assembler::vextractf32x4(dst, src, 0);
1697     } else {
1698       Assembler::vextractf128(dst, src, 0);
1699     }
1700   }
1701 
1702   void vextractf128_low(Address dst, XMMRegister src) {
1703     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1704       Assembler::vextractf32x4(dst, src, 0);
1705     } else {
1706       Assembler::vextractf128(dst, src, 0);
1707     }
1708   }
1709 
1710   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1711   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1712     Assembler::vinserti64x4(dst, dst, src, 0);
1713   }
1714   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1715     Assembler::vinsertf64x4(dst, dst, src, 0);
1716   }
1717   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1718     Assembler::vextracti64x4(dst, src, 0);
1719   }
1720   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1721     Assembler::vextractf64x4(dst, src, 0);
1722   }
1723   void vextractf64x4_low(Address dst, XMMRegister src) {
1724     Assembler::vextractf64x4(dst, src, 0);
1725   }
1726   void vinsertf64x4_low(XMMRegister dst, Address src) {
1727     Assembler::vinsertf64x4(dst, dst, src, 0);
1728   }
1729 
1730   // Carry-Less Multiplication Quadword
1731   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1732     // 0x00 - multiply lower 64 bits [0:63]
1733     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1734   }
1735   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1736     // 0x11 - multiply upper 64 bits [64:127]
1737     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1738   }
1739   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1740     // 0x10 - multiply nds[0:63] and src[64:127]
1741     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1742   }
1743   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1744     // 0x01 - multiply nds[64:127] and src[0:63]
1745     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1746   }
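       // Illustrative sketch (not part of this interface): a full 128x128-bit
       // carry-less product, as used in CRC folding, can be built from the four
       // partial products above (register names are placeholders):
       //   __ vpclmulldq(t0, a, b);     // a[0:63]   * b[0:63]
       //   __ vpclmulhqlqdq(t1, a, b);  // a[64:127] * b[0:63]
       //   __ vpclmullqhqdq(t2, a, b);  // a[0:63]   * b[64:127]
       //   __ vpclmulhdq(t3, a, b);     // a[64:127] * b[64:127]
       // where t1 ^ t2 supplies the middle term, shifted up by 64 bits.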
1747 
1748   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1749     // 0x00 - multiply lower 64 bits [0:63]
1750     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1751   }
1752   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1753     // 0x11 - multiply upper 64 bits [64:127]
1754     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1755   }
1756 
1757   // AVX-512 mask operations.
1758   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1759   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1760   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1761   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1762   void kortest(uint masklen, KRegister src1, KRegister src2);
1763   void ktest(uint masklen, KRegister src1, KRegister src2);
1764 
1765   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1766   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1767 
1768   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1769   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1770 
1771   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1772   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1773 
1774   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1775   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1776 
1777   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1778   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1779   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1780   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1781 
1782   void alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch);
1783   void anytrue(Register dst, uint masklen, KRegister src, KRegister kscratch);
1784 
1785   void cmov32( Condition cc, Register dst, Address  src);
1786   void cmov32( Condition cc, Register dst, Register src);
1787 
1788   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1789 
1790   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1791   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1792 
1793   void movoop(Register dst, jobject obj);
1794   void movoop(Address  dst, jobject obj, Register rscratch);
1795 
1796   void mov_metadata(Register dst, Metadata* obj);
1797   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
1798 
1799   void movptr(Register     dst, Register       src);
1800   void movptr(Register     dst, Address        src);
1801   void movptr(Register     dst, AddressLiteral src);
1802   void movptr(Register     dst, ArrayAddress   src);
1803   void movptr(Register     dst, intptr_t       src);
1804   void movptr(Address      dst, Register       src);
1805   void movptr(Address      dst, int32_t        imm);
1806   void movptr(Address      dst, intptr_t       src, Register rscratch);
1807   void movptr(ArrayAddress dst, Register       src, Register rscratch);
1808 
1809   void movptr(Register dst, RegisterOrConstant src) {
1810     if (src.is_constant()) movptr(dst, src.as_constant());
1811     else                   movptr(dst, src.as_register());
1812   }
1813 
1814 
1815   // to avoid hiding movl
1816   void mov32(Register       dst, AddressLiteral src);
1817   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
1818 
1819   // Import other mov() methods from the parent class or else
1820   // they will be hidden by the following overriding declaration.
1821   using Assembler::movdl;
1822   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1823 
1824   using Assembler::movq;
1825   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1826 
1827   // Can push value or effective address
1828   void pushptr(AddressLiteral src, Register rscratch);
1829 
1830   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
1831   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
1832 
1833   void pushoop(jobject obj, Register rscratch);
1834   void pushklass(Metadata* obj, Register rscratch);
1835 
1836   // sign-extend a 32-bit ('l') value to a pointer-sized element as needed
1837   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
1838   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
1839 
1840 
1841  public:
1842   // clear memory of size 'cnt' qwords, starting at 'base';
1843   // if 'is_large' is set, do not try to produce a short loop
1844   void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large, KRegister mask=knoreg);
1845 
1846   // clear memory of a constant size 'cnt', starting at 'base'
1847   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1848 
1849   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1850   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1851 
1852   // Fill primitive arrays
1853   void generate_fill(BasicType t, bool aligned,
1854                      Register to, Register value, Register count,
1855                      Register rtmp, XMMRegister xtmp);
1856 
1857   void encode_iso_array(Register src, Register dst, Register len,
1858                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1859                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1860 
1861 #ifdef _LP64
1862   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1863   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1864                              Register y, Register y_idx, Register z,
1865                              Register carry, Register product,
1866                              Register idx, Register kdx);
1867   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1868                               Register yz_idx, Register idx,
1869                               Register carry, Register product, int offset);
1870   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1871                                     Register carry, Register carry2,
1872                                     Register idx, Register jdx,
1873                                     Register yz_idx1, Register yz_idx2,
1874                                     Register tmp, Register tmp3, Register tmp4);
1875   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
1876                                Register yz_idx, Register idx, Register jdx,
1877                                Register carry, Register product,
1878                                Register carry2);
1879   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
1880                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
1881   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
1882                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1883   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
1884                             Register tmp2);
1885   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
1886                        Register rdxReg, Register raxReg);
1887   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
1888   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1889                        Register tmp3, Register tmp4);
1890   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1891                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1892 
1893   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
1894                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1895                Register raxReg);
1896   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
1897                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1898                Register raxReg);
1899   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
1900                            Register result, Register tmp1, Register tmp2,
1901                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
1902 #endif
1903 
1904   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
1905   void update_byte_crc32(Register crc, Register val, Register table);
1906   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
1907 
1908 
1909 #ifdef _LP64
1910   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
1911   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
1912                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
1913                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
1914 #endif // _LP64
1915 
1916   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
1917   // Note on a naming convention:
1918   // Prefix w = register only used on a Westmere+ architecture
1919   // Prefix n = register only used on a Nehalem architecture
1920 #ifdef _LP64
1921   void crc32c_ipl_alg4(Register in_out, uint32_t n,
1922                        Register tmp1, Register tmp2, Register tmp3);
1923 #else
1924   void crc32c_ipl_alg4(Register in_out, uint32_t n,
1925                        Register tmp1, Register tmp2, Register tmp3,
1926                        XMMRegister xtmp1, XMMRegister xtmp2);
1927 #endif
1928   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
1929                         Register in_out,
1930                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
1931                         XMMRegister w_xtmp2,
1932                         Register tmp1,
1933                         Register n_tmp2, Register n_tmp3);
1934   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
1935                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1936                        Register tmp1, Register tmp2,
1937                        Register n_tmp3);
1938   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
1939                          Register in_out1, Register in_out2, Register in_out3,
1940                          Register tmp1, Register tmp2, Register tmp3,
1941                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1942                          Register tmp4, Register tmp5,
1943                          Register n_tmp6);
1944   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
1945                             Register tmp1, Register tmp2, Register tmp3,
1946                             Register tmp4, Register tmp5, Register tmp6,
1947                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1948                             bool is_pclmulqdq_supported);
1949   // Fold 128-bit data chunk
1950   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
1951   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
1952 #ifdef _LP64
1953   // Fold 512-bit data chunk
1954   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
1955 #endif // _LP64
1956   // Fold 8-bit data
1957   void fold_8bit_crc32(Register crc, Register table, Register tmp);
1958   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
1959 
1960   // Compress char[] array to byte[].
1961   void char_array_compress(Register src, Register dst, Register len,
1962                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1963                            XMMRegister tmp4, Register tmp5, Register result,
1964                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
1965 
1966   // Inflate byte[] array to char[].
1967   void byte_array_inflate(Register src, Register dst, Register len,
1968                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
1969 
1970   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
1971                    Register length, Register temp, int vec_enc);
1972 
1973   void fill64_masked(uint shift, Register dst, int disp,
1974                          XMMRegister xmm, KRegister mask, Register length,
1975                          Register temp, bool use64byteVector = false);
1976 
1977   void fill32_masked(uint shift, Register dst, int disp,
1978                          XMMRegister xmm, KRegister mask, Register length,
1979                          Register temp);
1980 
1981   void fill32(Address dst, XMMRegister xmm);
1982 
1983   void fill32(Register dst, int disp, XMMRegister xmm);
1984 
1985   void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
1986 
1987   void fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector = false);
1988 
1989 #ifdef _LP64
1990   void convert_f2i(Register dst, XMMRegister src);
1991   void convert_d2i(Register dst, XMMRegister src);
1992   void convert_f2l(Register dst, XMMRegister src);
1993   void convert_d2l(Register dst, XMMRegister src);
1994   void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
1995   void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);
1996 
1997   void cache_wb(Address line);
1998   void cache_wbsync(bool is_pre);
1999 
2000 #ifdef COMPILER2_OR_JVMCI
2001   void generate_fill_avx3(BasicType type, Register to, Register value,
2002                           Register count, Register rtmp, XMMRegister xtmp);
2003 #endif // COMPILER2_OR_JVMCI
2004 
2005   OopMap* continuation_enter_setup(int& stack_slots);
2006   void fill_continuation_entry(Register reg_cont_obj, Register reg_flags);
2007   void continuation_enter_cleanup();
2008 #endif // _LP64
2009 
2010   void vallones(XMMRegister dst, int vector_len);
2011 
2012   void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);
2013 
2014 };
2015 
2016 /**
2017  * class SkipIfEqual:
2018  *
2019  * Instantiating this class emits assembly code that jumps around any code
2020  * emitted between the creation of the instance and its automatic destruction
2021  * at the end of the enclosing scope, depending on the run-time value of the
2022  * flag passed to the constructor.
2023  */
2024 class SkipIfEqual {
2025  private:
2026   MacroAssembler* _masm;
2027   Label _label;
2028 
2029  public:
2030    SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value, Register rscratch);
2031    ~SkipIfEqual();
2032 };
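
     // Illustrative sketch (names are placeholders): guard emitted probe code with a
     // run-time check of a bool VM flag, using a caller-chosen scratch register:
     //   {
     //     SkipIfEqual skip(masm, &SomeBoolFlag, false, rtmp);
     //     // ... code emitted here is executed only when SomeBoolFlag is true ...
     //   }   // destructor binds the skip-target label here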
2033 
2034 #endif // CPU_X86_MACROASSEMBLER_X86_HPP