1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef CPU_X86_MACROASSEMBLER_X86_HPP
  26 #define CPU_X86_MACROASSEMBLER_X86_HPP
  27 
  28 #include "asm/assembler.hpp"
  29 #include "asm/register.hpp"
  30 #include "code/vmreg.inline.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "utilities/macros.hpp"
  33 #include "runtime/rtmLocking.hpp"
  34 #include "runtime/vm_version.hpp"
  35 
  36 // MacroAssembler extends Assembler by frequently used macros.
  37 //
  38 // Instructions for which a 'better' code sequence exists depending
  39 // on arguments should also go in here.
  40 
  41 class MacroAssembler: public Assembler {
  42   friend class LIR_Assembler;
  43   friend class Runtime1;      // as_Address()
  44 
  45  public:
  46   // Support for VM calls
  47   //
  48   // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  49   // may customize this version by overriding it for its purposes (e.g., to save/restore
  50   // additional registers when doing a VM call).
  51 
  52   virtual void call_VM_leaf_base(
  53     address entry_point,               // the entry point
  54     int     number_of_arguments        // the number of arguments to pop after the call
  55   );
  56 
  57  protected:
  58   // This is the base routine called by the different versions of call_VM. The interpreter
  59   // may customize this version by overriding it for its purposes (e.g., to save/restore
  60   // additional registers when doing a VM call).
  61   //
  62   // If no java_thread register is specified (noreg) than rdi will be used instead. call_VM_base
  63   // returns the register which contains the thread upon return. If a thread register has been
  64   // specified, the return value will correspond to that register. If no last_java_sp is specified
  65   // (noreg) than rsp will be used instead.
  66   virtual void call_VM_base(           // returns the register containing the thread upon return
  67     Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
  68     Register java_thread,              // the thread if computed before     ; use noreg otherwise
  69     Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
  70     address  entry_point,              // the entry point
  71     int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
  72     bool     check_exceptions          // whether to check for pending exceptions after return
  73   );
  74 
  75   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
  76 
  77   // helpers for FPU flag access
  78   // tmp is a temporary register, if none is available use noreg
  79   void save_rax   (Register tmp);
  80   void restore_rax(Register tmp);
  81 
  82  public:
  83 
  84   enum KlassDecodeMode {
  85     KlassDecodeNone,
  86     KlassDecodeZero,
  87     KlassDecodeXor,
  88     KlassDecodeAdd
  89   };
  90 
  91   // Return the current narrow Klass pointer decode mode. Initialized on first call.
  92   static KlassDecodeMode klass_decode_mode();
  93 
  94   // Given an arbitrary base address, return the KlassDecodeMode that would be used. Return KlassDecodeNone
  95   // if base address is not valid for encoding.
  96   static KlassDecodeMode klass_decode_mode_for_base(address base);
  97 
  98   // Returns a static string
  99   static const char* describe_klass_decode_mode(KlassDecodeMode mode);
 100 
 101  private:
 102   static KlassDecodeMode _klass_decode_mode;
 103 
 104  public:
 105   MacroAssembler(CodeBuffer* code) : Assembler(code) {}
 106 
 107  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
 108  // The implementation is only non-empty for the InterpreterMacroAssembler,
 109  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
 110  virtual void check_and_handle_popframe(Register java_thread);
 111  virtual void check_and_handle_earlyret(Register java_thread);
 112 
 113   Address as_Address(AddressLiteral adr);
 114   Address as_Address(ArrayAddress adr, Register rscratch);
 115 
 116   // Support for NULL-checks
 117   //
 118   // Generates code that causes a NULL OS exception if the content of reg is NULL.
 119   // If the accessed location is M[reg + offset] and the offset is known, provide the
 120   // offset. No explicit code generation is needed if the offset is within a certain
 121   // range (0 <= offset <= page_size).
 122 
 123   void null_check(Register reg, int offset = -1);
 124   static bool needs_explicit_null_check(intptr_t offset);
 125   static bool uses_implicit_null_check(void* address);
 126 
 127   // Required platform-specific helpers for Label::patch_instructions.
 128   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
 129   void pd_patch_instruction(address branch, address target, const char* file, int line) {
 130     unsigned char op = branch[0];
 131     assert(op == 0xE8 /* call */ ||
 132         op == 0xE9 /* jmp */ ||
 133         op == 0xEB /* short jmp */ ||
 134         (op & 0xF0) == 0x70 /* short jcc */ ||
 135         op == 0x0F && (branch[1] & 0xF0) == 0x80 /* jcc */ ||
 136         op == 0xC7 && branch[1] == 0xF8 /* xbegin */,
 137         "Invalid opcode at patch point");
 138 
 139     if (op == 0xEB || (op & 0xF0) == 0x70) {
 140       // short offset operators (jmp and jcc)
 141       char* disp = (char*) &branch[1];
 142       int imm8 = target - (address) &disp[1];
 143       guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
 144                 file == NULL ? "<NULL>" : file, line);
 145       *disp = imm8;
 146     } else {
 147       int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
 148       int imm32 = target - (address) &disp[1];
 149       *disp = imm32;
 150     }
 151   }
 152 
 153   // The following 4 methods return the offset of the appropriate move instruction
 154 
 155   // Support for fast byte/short loading with zero extension (depending on particular CPU)
 156   int load_unsigned_byte(Register dst, Address src);
 157   int load_unsigned_short(Register dst, Address src);
 158 
 159   // Support for fast byte/short loading with sign extension (depending on particular CPU)
 160   int load_signed_byte(Register dst, Address src);
 161   int load_signed_short(Register dst, Address src);
 162 
 163   // Support for sign-extension (hi:lo = extend_sign(lo))
 164   void extend_sign(Register hi, Register lo);
 165 
 166   // Load and store values by size and signed-ness
 167   void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
 168   void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);
 169 
 170   // Support for inc/dec with optimal instruction selection depending on value
 171 
 172   void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
 173   void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
 174 
 175   void decrementl(Address dst, int value = 1);
 176   void decrementl(Register reg, int value = 1);
 177 
 178   void decrementq(Register reg, int value = 1);
 179   void decrementq(Address dst, int value = 1);
 180 
 181   void incrementl(Address dst, int value = 1);
 182   void incrementl(Register reg, int value = 1);
 183 
 184   void incrementq(Register reg, int value = 1);
 185   void incrementq(Address dst, int value = 1);
 186 
 187   // Support optimal SSE move instructions.
 188   void movflt(XMMRegister dst, XMMRegister src) {
 189     if (dst-> encoding() == src->encoding()) return;
 190     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
 191     else                       { movss (dst, src); return; }
 192   }
 193   void movflt(XMMRegister dst, Address src) { movss(dst, src); }
 194   void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 195   void movflt(Address dst, XMMRegister src) { movss(dst, src); }
 196 
 197   // Move with zero extension
 198   void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }
 199 
 200   void movdbl(XMMRegister dst, XMMRegister src) {
 201     if (dst-> encoding() == src->encoding()) return;
 202     if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
 203     else                       { movsd (dst, src); return; }
 204   }
 205 
 206   void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 207 
 208   void movdbl(XMMRegister dst, Address src) {
 209     if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
 210     else                         { movlpd(dst, src); return; }
 211   }
 212   void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }
 213 
 214   void incrementl(AddressLiteral dst, Register rscratch = noreg);
 215   void incrementl(ArrayAddress   dst, Register rscratch);
 216 
 217   void incrementq(AddressLiteral dst, Register rscratch = noreg);
 218 
 219   // Alignment
 220   void align32();
 221   void align64();
 222   void align(int modulus);
 223   void align(int modulus, int target);
 224 
 225   void post_call_nop();
 226   // A 5 byte nop that is safe for patching (see patch_verified_entry)
 227   void fat_nop();
 228 
 229   // Stack frame creation/removal
 230   void enter();
 231   void leave();
 232 
 233   // Support for getting the JavaThread pointer (i.e.; a reference to thread-local information)
 234   // The pointer will be loaded into the thread register.
 235   void get_thread(Register thread);
 236 
 237 #ifdef _LP64
 238   // Support for argument shuffling
 239 
 240   // bias in bytes
 241   void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 242   void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 243   void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 244   void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 245   void move_ptr(VMRegPair src, VMRegPair dst);
 246   void object_move(OopMap* map,
 247                    int oop_handle_offset,
 248                    int framesize_in_slots,
 249                    VMRegPair src,
 250                    VMRegPair dst,
 251                    bool is_receiver,
 252                    int* receiver_offset);
 253 #endif // _LP64
 254 
 255   // Support for VM calls
 256   //
 257   // It is imperative that all calls into the VM are handled via the call_VM macros.
 258   // They make sure that the stack linkage is setup correctly. call_VM's correspond
 259   // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
 260 
 261 
 262   void call_VM(Register oop_result,
 263                address entry_point,
 264                bool check_exceptions = true);
 265   void call_VM(Register oop_result,
 266                address entry_point,
 267                Register arg_1,
 268                bool check_exceptions = true);
 269   void call_VM(Register oop_result,
 270                address entry_point,
 271                Register arg_1, Register arg_2,
 272                bool check_exceptions = true);
 273   void call_VM(Register oop_result,
 274                address entry_point,
 275                Register arg_1, Register arg_2, Register arg_3,
 276                bool check_exceptions = true);
 277 
 278   // Overloadings with last_Java_sp
 279   void call_VM(Register oop_result,
 280                Register last_java_sp,
 281                address entry_point,
 282                int number_of_arguments = 0,
 283                bool check_exceptions = true);
 284   void call_VM(Register oop_result,
 285                Register last_java_sp,
 286                address entry_point,
 287                Register arg_1, bool
 288                check_exceptions = true);
 289   void call_VM(Register oop_result,
 290                Register last_java_sp,
 291                address entry_point,
 292                Register arg_1, Register arg_2,
 293                bool check_exceptions = true);
 294   void call_VM(Register oop_result,
 295                Register last_java_sp,
 296                address entry_point,
 297                Register arg_1, Register arg_2, Register arg_3,
 298                bool check_exceptions = true);
 299 
 300   void get_vm_result  (Register oop_result, Register thread);
 301   void get_vm_result_2(Register metadata_result, Register thread);
 302 
 303   // These always tightly bind to MacroAssembler::call_VM_base
 304   // bypassing the virtual implementation
 305   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
 306   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
 307   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
 308   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
 309   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);
 310 
 311   void call_VM_leaf0(address entry_point);
 312   void call_VM_leaf(address entry_point,
 313                     int number_of_arguments = 0);
 314   void call_VM_leaf(address entry_point,
 315                     Register arg_1);
 316   void call_VM_leaf(address entry_point,
 317                     Register arg_1, Register arg_2);
 318   void call_VM_leaf(address entry_point,
 319                     Register arg_1, Register arg_2, Register arg_3);
 320 
 321   void call_VM_leaf(address entry_point,
 322                     Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 323 
 324   // These always tightly bind to MacroAssembler::call_VM_leaf_base
 325   // bypassing the virtual implementation
 326   void super_call_VM_leaf(address entry_point);
 327   void super_call_VM_leaf(address entry_point, Register arg_1);
 328   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
 329   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
 330   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 331 
 332   // last Java Frame (fills frame anchor)
 333   void set_last_Java_frame(Register thread,
 334                            Register last_java_sp,
 335                            Register last_java_fp,
 336                            address  last_java_pc,
 337                            Register rscratch);
 338 
 339   // thread in the default location (r15_thread on 64bit)
 340   void set_last_Java_frame(Register last_java_sp,
 341                            Register last_java_fp,
 342                            address  last_java_pc,
 343                            Register rscratch);
 344 
 345   void reset_last_Java_frame(Register thread, bool clear_fp);
 346 
 347   // thread in the default location (r15_thread on 64bit)
 348   void reset_last_Java_frame(bool clear_fp);
 349 
 350   // jobjects
 351   void clear_jobject_tag(Register possibly_non_local);
 352   void resolve_jobject(Register value, Register thread, Register tmp);
 353   void resolve_global_jobject(Register value, Register thread, Register tmp);
 354 
 355   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
 356   void c2bool(Register x);
 357 
 358   // C++ bool manipulation
 359 
 360   void movbool(Register dst, Address src);
 361   void movbool(Address dst, bool boolconst);
 362   void movbool(Address dst, Register src);
 363   void testbool(Register dst);
 364 
 365   void resolve_oop_handle(Register result, Register tmp);
 366   void resolve_weak_handle(Register result, Register tmp);
 367   void load_mirror(Register mirror, Register method, Register tmp);
 368   void load_method_holder_cld(Register rresult, Register rmethod);
 369 
 370   void load_method_holder(Register holder, Register method);
 371 
 372   // oop manipulations
 373 #ifdef _LP64
 374   void load_nklass(Register dst, Register src);
 375 #endif
 376   void load_klass(Register dst, Register src, Register tmp);
 377   void load_klass_check_null(Register dst, Register src, Register tmp);
 378   void store_klass(Register dst, Register src, Register tmp);
 379 
 380   // Compares the Klass pointer of an object to a given Klass (which might be narrow,
 381   // depending on UseCompressedClassPointers).
 382   void cmp_klass(Register klass, Register dst, Register tmp);
 383 
 384   // Compares the Klass pointer of two objects o1 and o2. Result is in the condition flags.
 385   // Uses t1 and t2 as temporary registers.
 386   void cmp_klass(Register src, Register dst, Register tmp1, Register tmp2);
 387 
 388   void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
 389                       Register tmp1, Register thread_tmp);
 390   void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
 391                        Register tmp1, Register tmp2, Register tmp3);
 392 
 393   void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
 394                      Register thread_tmp = noreg, DecoratorSet decorators = 0);
 395   void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
 396                               Register thread_tmp = noreg, DecoratorSet decorators = 0);
 397   void store_heap_oop(Address dst, Register val, Register tmp1 = noreg,
 398                       Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
 399 
 400   // Used for storing NULL. All other oop constants should be
 401   // stored using routines that take a jobject.
 402   void store_heap_oop_null(Address dst);
 403 
 404 #ifdef _LP64
 405   void store_klass_gap(Register dst, Register src);
 406 
 407   // This dummy is to prevent a call to store_heap_oop from
 408   // converting a zero (like NULL) into a Register by giving
 409   // the compiler two choices it can't resolve
 410 
 411   void store_heap_oop(Address dst, void* dummy);
 412 
 413   void encode_heap_oop(Register r);
 414   void decode_heap_oop(Register r);
 415   void encode_heap_oop_not_null(Register r);
 416   void decode_heap_oop_not_null(Register r);
 417   void encode_heap_oop_not_null(Register dst, Register src);
 418   void decode_heap_oop_not_null(Register dst, Register src);
 419 
 420   void set_narrow_oop(Register dst, jobject obj);
 421   void set_narrow_oop(Address dst, jobject obj);
 422   void cmp_narrow_oop(Register dst, jobject obj);
 423   void cmp_narrow_oop(Address dst, jobject obj);
 424 
 425   void encode_klass_not_null(Register r, Register tmp);
 426   void decode_klass_not_null(Register r, Register tmp);
 427   void encode_and_move_klass_not_null(Register dst, Register src);
 428   void decode_and_move_klass_not_null(Register dst, Register src);
 429   void set_narrow_klass(Register dst, Klass* k);
 430   void set_narrow_klass(Address dst, Klass* k);
 431   void cmp_narrow_klass(Register dst, Klass* k);
 432   void cmp_narrow_klass(Address dst, Klass* k);
 433 
 434   // if heap base register is used - reinit it with the correct value
 435   void reinit_heapbase();
 436 
 437   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 438 
 439 #endif // _LP64
 440 
 441   // Int division/remainder for Java
 442   // (as idivl, but checks for special case as described in JVM spec.)
 443   // returns idivl instruction offset for implicit exception handling
 444   int corrected_idivl(Register reg);
 445 
 446   // Long division/remainder for Java
 447   // (as idivq, but checks for special case as described in JVM spec.)
 448   // returns idivq instruction offset for implicit exception handling
 449   int corrected_idivq(Register reg);
 450 
 451   void int3();
 452 
 453   // Long operation macros for a 32bit cpu
 454   // Long negation for Java
 455   void lneg(Register hi, Register lo);
 456 
 457   // Long multiplication for Java
 458   // (destroys contents of eax, ebx, ecx and edx)
 459   void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y
 460 
 461   // Long shifts for Java
 462   // (semantics as described in JVM spec.)
 463   void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
 464   void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)
 465 
 466   // Long compare for Java
 467   // (semantics as described in JVM spec.)
 468   void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)
 469 
 470 
 471   // misc
 472 
 473   // Sign extension
 474   void sign_extend_short(Register reg);
 475   void sign_extend_byte(Register reg);
 476 
 477   // Division by power of 2, rounding towards 0
 478   void division_with_shift(Register reg, int shift_value);
 479 
 480 #ifndef _LP64
 481   // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
 482   //
 483   // CF (corresponds to C0) if x < y
 484   // PF (corresponds to C2) if unordered
 485   // ZF (corresponds to C3) if x = y
 486   //
 487   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 488   // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
 489   void fcmp(Register tmp);
 490   // Variant of the above which allows y to be further down the stack
 491   // and which only pops x and y if specified. If pop_right is
 492   // specified then pop_left must also be specified.
 493   void fcmp(Register tmp, int index, bool pop_left, bool pop_right);
 494 
 495   // Floating-point comparison for Java
 496   // Compares the top-most stack entries on the FPU stack and stores the result in dst.
 497   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 498   // (semantics as described in JVM spec.)
 499   void fcmp2int(Register dst, bool unordered_is_less);
 500   // Variant of the above which allows y to be further down the stack
 501   // and which only pops x and y if specified. If pop_right is
 502   // specified then pop_left must also be specified.
 503   void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);
 504 
 505   // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
 506   // tmp is a temporary register, if none is available use noreg
 507   void fremr(Register tmp);
 508 
 509   // only if +VerifyFPU
 510   void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
 511 #endif // !LP64
 512 
 513   // dst = c = a * b + c
 514   void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 515   void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 516 
 517   void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 518   void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 519   void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 520   void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 521 
 522 
 523   // same as fcmp2int, but using SSE2
 524   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 525   void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 526 
 527   // branch to L if FPU flag C2 is set/not set
 528   // tmp is a temporary register, if none is available use noreg
 529   void jC2 (Register tmp, Label& L);
 530   void jnC2(Register tmp, Label& L);
 531 
 532   // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
 533   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 534   void load_float(Address src);
 535 
 536   // Store float value to 'address'. If UseSSE >= 1, the value is stored
 537   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 538   void store_float(Address dst);
 539 
 540   // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
 541   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 542   void load_double(Address src);
 543 
 544   // Store double value to 'address'. If UseSSE >= 2, the value is stored
 545   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 546   void store_double(Address dst);
 547 
 548 #ifndef _LP64
 549   // Pop ST (ffree & fincstp combined)
 550   void fpop();
 551 
 552   void empty_FPU_stack();
 553 #endif // !_LP64
 554 
 555   void push_IU_state();
 556   void pop_IU_state();
 557 
 558   void push_FPU_state();
 559   void pop_FPU_state();
 560 
 561   void push_CPU_state();
 562   void pop_CPU_state();
 563 
 564   void push_cont_fastpath();
 565   void pop_cont_fastpath();
 566 
 567   void inc_held_monitor_count();
 568   void dec_held_monitor_count();
 569 
 570   DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)
 571 
 572   // Round up to a power of two
 573   void round_to(Register reg, int modulus);
 574 
 575 private:
 576   // General purpose and XMM registers potentially clobbered by native code; there
 577   // is no need for FPU or AVX opmask related methods because C1/interpreter
 578   // - we save/restore FPU state as a whole always
 579   // - do not care about AVX-512 opmask
 580   static RegSet call_clobbered_gp_registers();
 581   static XMMRegSet call_clobbered_xmm_registers();
 582 
 583   void push_set(XMMRegSet set, int offset);
 584   void pop_set(XMMRegSet set, int offset);
 585 
 586 public:
 587   void push_set(RegSet set, int offset = -1);
 588   void pop_set(RegSet set, int offset = -1);
 589 
 590   // Push and pop everything that might be clobbered by a native
 591   // runtime call.
 592   // Only save the lower 64 bits of each vector register.
 593   // Additional registers can be excluded in a passed RegSet.
 594   void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
 595   void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);
 596 
 597   void push_call_clobbered_registers(bool save_fpu = true) {
 598     push_call_clobbered_registers_except(RegSet(), save_fpu);
 599   }
 600   void pop_call_clobbered_registers(bool restore_fpu = true) {
 601     pop_call_clobbered_registers_except(RegSet(), restore_fpu);
 602   }
 603 
 604   // allocation
 605   void tlab_allocate(
 606     Register thread,                   // Current thread
 607     Register obj,                      // result: pointer to object after successful allocation
 608     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
 609     int      con_size_in_bytes,        // object size in bytes if   known at compile time
 610     Register t1,                       // temp register
 611     Register t2,                       // temp register
 612     Label&   slow_case                 // continuation point if fast allocation fails
 613   );
 614   void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
 615 
 616   // interface method calling
 617   void lookup_interface_method(Register recv_klass,
 618                                Register intf_klass,
 619                                RegisterOrConstant itable_index,
 620                                Register method_result,
 621                                Register scan_temp,
 622                                Label& no_such_interface,
 623                                bool return_method = true);
 624 
 625   // virtual method calling
 626   void lookup_virtual_method(Register recv_klass,
 627                              RegisterOrConstant vtable_index,
 628                              Register method_result);
 629 
 630   // Test sub_klass against super_klass, with fast and slow paths.
 631 
 632   // The fast path produces a tri-state answer: yes / no / maybe-slow.
 633   // One of the three labels can be NULL, meaning take the fall-through.
 634   // If super_check_offset is -1, the value is loaded up from super_klass.
 635   // No registers are killed, except temp_reg.
 636   void check_klass_subtype_fast_path(Register sub_klass,
 637                                      Register super_klass,
 638                                      Register temp_reg,
 639                                      Label* L_success,
 640                                      Label* L_failure,
 641                                      Label* L_slow_path,
 642                 RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
 643 
 644   // The rest of the type check; must be wired to a corresponding fast path.
 645   // It does not repeat the fast path logic, so don't use it standalone.
 646   // The temp_reg and temp2_reg can be noreg, if no temps are available.
 647   // Updates the sub's secondary super cache as necessary.
 648   // If set_cond_codes, condition codes will be Z on success, NZ on failure.
 649   void check_klass_subtype_slow_path(Register sub_klass,
 650                                      Register super_klass,
 651                                      Register temp_reg,
 652                                      Register temp2_reg,
 653                                      Label* L_success,
 654                                      Label* L_failure,
 655                                      bool set_cond_codes = false);
 656 
 657   // Simplified, combined version, good for typical uses.
 658   // Falls through on failure.
 659   void check_klass_subtype(Register sub_klass,
 660                            Register super_klass,
 661                            Register temp_reg,
 662                            Label& L_success);
 663 
 664   void clinit_barrier(Register klass,
 665                       Register thread,
 666                       Label* L_fast_path = NULL,
 667                       Label* L_slow_path = NULL);
 668 
 669   // method handles (JSR 292)
 670   Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
 671 
 672   // Debugging
 673 
 674   // only if +VerifyOops
 675   void _verify_oop(Register reg, const char* s, const char* file, int line);
 676   void _verify_oop_addr(Address addr, const char* s, const char* file, int line);
 677 
 678   void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
 679     if (VerifyOops) {
 680       _verify_oop(reg, s, file, line);
 681     }
 682   }
 683   void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
 684     if (VerifyOops) {
 685       _verify_oop_addr(reg, s, file, line);
 686     }
 687   }
 688 
 689   // TODO: verify method and klass metadata (compare against vptr?)
 690   void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
 691   void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}
 692 
 693 #define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
 694 #define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
 695 #define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
 696 #define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
 697 #define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
 698 
 699   // Verify or restore cpu control state after JNI call
 700   void restore_cpu_control_state_after_jni(Register rscratch);
 701 
 702   // prints msg, dumps registers and stops execution
 703   void stop(const char* msg);
 704 
 705   // prints msg and continues
 706   void warn(const char* msg);
 707 
 708   // dumps registers and other state
 709   void print_state();
 710 
 711   static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
 712   static void debug64(char* msg, int64_t pc, int64_t regs[]);
 713   static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
 714   static void print_state64(int64_t pc, int64_t regs[]);
 715 
 716   void os_breakpoint();
 717 
 718   void untested()                                { stop("untested"); }
 719 
 720   void unimplemented(const char* what = "");
 721 
 722   void should_not_reach_here()                   { stop("should not reach here"); }
 723 
 724   void print_CPU_state();
 725 
 726   // Stack overflow checking
 727   void bang_stack_with_offset(int offset) {
 728     // stack grows down, caller passes positive offset
 729     assert(offset > 0, "must bang with negative offset");
 730     movl(Address(rsp, (-offset)), rax);
 731   }
 732 
 733   // Writes to stack successive pages until offset reached to check for
 734   // stack overflow + shadow pages.  Also, clobbers tmp
 735   void bang_stack_size(Register size, Register tmp);
 736 
 737   // Check for reserved stack access in method being exited (for JIT)
 738   void reserved_stack_check();
 739 
 740   void safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod);
 741 
 742   void verify_tlab();
 743 
 744   static Condition negate_condition(Condition cond);
 745 
 746   // Instructions that use AddressLiteral operands. These instruction can handle 32bit/64bit
 747   // operands. In general the names are modified to avoid hiding the instruction in Assembler
 748   // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
 749   // here in MacroAssembler. The major exception to this rule is call
 750 
 751   // Arithmetics
 752 
 753 
 754   void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
 755   void addptr(Address dst, Register src);
 756 
 757   void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
 758   void addptr(Register dst, int32_t src);
 759   void addptr(Register dst, Register src);
 760   void addptr(Register dst, RegisterOrConstant src) {
 761     if (src.is_constant()) addptr(dst, src.as_constant());
 762     else                   addptr(dst, src.as_register());
 763   }
 764 
 765   void andptr(Register dst, int32_t src);
 766   void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }
 767 
 768 #ifdef _LP64
 769   using Assembler::andq;
 770   void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
 771 #endif
 772 
 773   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 774 
 775   // renamed to drag out the casting of address to int32_t/intptr_t
 776   void cmp32(Register src1, int32_t imm);
 777 
 778   void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
 779   // compare reg - mem, or reg - &mem
 780   void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);
 781 
 782   void cmp32(Register src1, Address src2);
 783 
 784 #ifndef _LP64
 785   void cmpklass(Address dst, Metadata* obj);
 786   void cmpklass(Register dst, Metadata* obj);
 787   void cmpoop(Address dst, jobject obj);
 788 #endif // _LP64
 789 
 790   void cmpoop(Register src1, Register src2);
 791   void cmpoop(Register src1, Address src2);
 792   void cmpoop(Register dst, jobject obj, Register rscratch);
 793 
 794   // NOTE src2 must be the lval. This is NOT an mem-mem compare
 795   void cmpptr(Address src1, AddressLiteral src2, Register rscratch);
 796 
 797   void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);
 798 
 799   void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 800   void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 801   // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 802 
 803   void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 804   void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 805 
 806   // cmp64 to avoild hiding cmpq
 807   void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);
 808 
 809   void cmpxchgptr(Register reg, Address adr);
 810 
 811   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 812 
 813   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
 814   void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }
 815 
 816 
 817   void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }
 818 
 819   void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }
 820 
 821   void shlptr(Register dst, int32_t shift);
 822   void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }
 823 
 824   void shrptr(Register dst, int32_t shift);
 825   void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }
 826 
 827   void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
 828   void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }
 829 
 830   void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 831 
 832   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 833   void subptr(Register dst, int32_t src);
 834   // Force generation of a 4 byte immediate value even if it fits into 8bit
 835   void subptr_imm32(Register dst, int32_t src);
 836   void subptr(Register dst, Register src);
 837   void subptr(Register dst, RegisterOrConstant src) {
 838     if (src.is_constant()) subptr(dst, (int) src.as_constant());
 839     else                   subptr(dst,       src.as_register());
 840   }
 841 
 842   void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 843   void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 844 
 845   void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 846   void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 847 
 848   void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }
 849 
 850 
 851 
 852   // Helper functions for statistics gathering.
 853   // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
 854   void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
 855   // Unconditional atomic increment.
 856   void atomic_incl(Address counter_addr);
 857   void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
 858 #ifdef _LP64
 859   void atomic_incq(Address counter_addr);
 860   void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
 861 #endif
 862   void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { LP64_ONLY(atomic_incq(counter_addr, rscratch)) NOT_LP64(atomic_incl(counter_addr, rscratch)) ; }
 863   void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
 864 
 865   void lea(Register dst, Address        adr) { Assembler::lea(dst, adr); }
 866   void lea(Register dst, AddressLiteral adr);
 867   void lea(Address  dst, AddressLiteral adr, Register rscratch);
 868 
 869   void leal32(Register dst, Address src) { leal(dst, src); }
 870 
 871   // Import other testl() methods from the parent class or else
 872   // they will be hidden by the following overriding declaration.
 873   using Assembler::testl;
 874   void testl(Address dst, int32_t imm32);
 875   void testl(Register dst, int32_t imm32);
 876   void testl(Register dst, AddressLiteral src); // requires reachable address
 877   using Assembler::testq;
 878   void testq(Address dst, int32_t imm32);
 879   void testq(Register dst, int32_t imm32);
 880 
 881   void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 882   void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 883   void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 884   void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }
 885 
 886   void testptr(Register src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 887   void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
 888   void testptr(Register src1, Register src2);
 889 
 890   void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 891   void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 892 
 893   // Calls
 894 
 895   void call(Label& L, relocInfo::relocType rtype);
 896   void call(Register entry);
 897   void call(Address addr) { Assembler::call(addr); }
 898 
 899   // NOTE: this call transfers to the effective address of entry NOT
 900   // the address contained by entry. This is because this is more natural
 901   // for jumps/calls.
 902   void call(AddressLiteral entry, Register rscratch = rax);
 903 
 904   // Emit the CompiledIC call idiom
 905   void ic_call(address entry, jint method_index = 0);
 906 
 907   void emit_static_call_stub();
 908 
 909   // Jumps
 910 
 911   // NOTE: these jumps transfer to the effective address of dst NOT
 912   // the address contained by dst. This is because this is more natural
 913   // for jumps/calls.
 914   void jump(AddressLiteral dst, Register rscratch = noreg);
 915 
 916   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
 917 
 918   // 32bit can do a case table jump in one instruction but we no longer allow the base
 919   // to be installed in the Address class. This jump will transfer to the address
 920   // contained in the location described by entry (not the address of entry)
 921   void jump(ArrayAddress entry, Register rscratch);
 922 
 923   // Floating
 924 
 925   void push_f(XMMRegister r);
 926   void pop_f(XMMRegister r);
 927   void push_d(XMMRegister r);
 928   void pop_d(XMMRegister r);
 929 
 930   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
 931   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
 932   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 933 
 934   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
 935   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
 936   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 937 
 938   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
 939   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
 940   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 941 
 942   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
 943   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
 944   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 945 
 946 #ifndef _LP64
 947   void fadd_s(Address        src) { Assembler::fadd_s(src); }
 948   void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }
 949 
 950   void fldcw(Address        src) { Assembler::fldcw(src); }
 951   void fldcw(AddressLiteral src);
 952 
 953   void fld_s(int index)          { Assembler::fld_s(index); }
 954   void fld_s(Address        src) { Assembler::fld_s(src); }
 955   void fld_s(AddressLiteral src);
 956 
 957   void fld_d(Address        src) { Assembler::fld_d(src); }
 958   void fld_d(AddressLiteral src);
 959 
 960   void fld_x(Address        src) { Assembler::fld_x(src); }
 961   void fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); }
 962 
 963   void fmul_s(Address        src) { Assembler::fmul_s(src); }
 964   void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
 965 #endif // !_LP64
 966 
 967   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
 968   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
 969 
 970 #ifdef _LP64
 971  private:
 972   void sha256_AVX2_one_round_compute(
 973     Register  reg_old_h,
 974     Register  reg_a,
 975     Register  reg_b,
 976     Register  reg_c,
 977     Register  reg_d,
 978     Register  reg_e,
 979     Register  reg_f,
 980     Register  reg_g,
 981     Register  reg_h,
 982     int iter);
 983   void sha256_AVX2_four_rounds_compute_first(int start);
 984   void sha256_AVX2_four_rounds_compute_last(int start);
 985   void sha256_AVX2_one_round_and_sched(
 986         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
 987         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
 988         XMMRegister xmm_2,     /* ymm6 */
 989         XMMRegister xmm_3,     /* ymm7 */
 990         Register    reg_a,      /* == eax on 0 iteration, then rotate 8 register right on each next iteration */
 991         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
 992         Register    reg_c,      /* edi */
 993         Register    reg_d,      /* esi */
 994         Register    reg_e,      /* r8d */
 995         Register    reg_f,      /* r9d */
 996         Register    reg_g,      /* r10d */
 997         Register    reg_h,      /* r11d */
 998         int iter);
 999 
1000   void addm(int disp, Register r1, Register r2);
1001 
1002   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1003                                      Register e, Register f, Register g, Register h, int iteration);
1004 
1005   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1006                                           Register a, Register b, Register c, Register d, Register e, Register f,
1007                                           Register g, Register h, int iteration);
1008 
1009   void addmq(int disp, Register r1, Register r2);
1010  public:
1011   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1012                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1013                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1014                    bool multi_block, XMMRegister shuf_mask);
1015   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1016                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1017                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1018                    XMMRegister shuf_mask);
1019 #endif // _LP64
1020 
1021   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1022                 bool multi_block);
1023 
1024   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1025                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1026                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1027                  bool multi_block);
1028 
1029 #ifdef _LP64
1030   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1031                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1032                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1033                    bool multi_block, XMMRegister shuf_mask);
1034 #else
1035   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1036                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1037                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1038                    bool multi_block);
1039 #endif
1040 
1041   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1042                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1043                 Register rax, Register rcx, Register rdx, Register tmp);
1044 
1045 #ifndef _LP64
1046  private:
1047   // Initialized in macroAssembler_x86_constants.cpp
1048   static address ONES;
1049   static address L_2IL0FLOATPACKET_0;
1050   static address PI4_INV;
1051   static address PI4X3;
1052   static address PI4X4;
1053 
1054  public:
1055   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1056                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1057                 Register rax, Register rcx, Register rdx, Register tmp1);
1058 
1059   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1060                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1061                 Register rax, Register rcx, Register rdx, Register tmp);
1062 
1063   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1064                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1065                 Register rdx, Register tmp);
1066 
1067   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1068                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1069                 Register rax, Register rbx, Register rdx);
1070 
1071   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1072                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1073                 Register rax, Register rcx, Register rdx, Register tmp);
1074 
1075   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1076                         Register edx, Register ebx, Register esi, Register edi,
1077                         Register ebp, Register esp);
1078 
1079   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1080                          Register esi, Register edi, Register ebp, Register esp);
1081 
1082   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1083                         Register edx, Register ebx, Register esi, Register edi,
1084                         Register ebp, Register esp);
1085 
1086   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1087                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1088                 Register rax, Register rcx, Register rdx, Register tmp);
1089 #endif // !_LP64
1090 
1091 private:
1092 
1093   // these are private because users should be doing movflt/movdbl
1094 
1095   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1096   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1097   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1098   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1099 
1100   void movlpd(XMMRegister dst, Address        src) {Assembler::movlpd(dst, src); }
1101   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1102 
1103 public:
1104 
1105   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1106   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1107   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1108 
1109   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1110   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1111   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1112 
1113   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1114   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1115   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1116 
1117   using Assembler::vbroadcastsd;
1118   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1119 
1120   using Assembler::vbroadcastss;
1121   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1122 
1123   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1124   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1125   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1126 
1127   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1128   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1129   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1130 
1131   // Move Unaligned Double Quadword
1132   void movdqu(Address     dst, XMMRegister    src);
1133   void movdqu(XMMRegister dst, XMMRegister    src);
1134   void movdqu(XMMRegister dst, Address        src);
1135   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1136 
1137   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1138   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1139   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1140   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1141   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1142   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1143 
1144   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1145   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1146   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1147   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1148   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1149   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1150 
1151   // Safe mask-register move: lowers to 16-bit moves (kmovwl) on targets that only
1152   // support AVX512F, and to 64-bit moves (kmovql) on targets that also support AVX512BW.
1153   void kmov(Address  dst, KRegister src);
1154   void kmov(KRegister dst, Address src);
1155   void kmov(KRegister dst, KRegister src);
1156   void kmov(Register dst, KRegister src);
1157   void kmov(KRegister dst, Register src);
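       // A minimal sketch of the dispatch kmov() is documented to perform (hedged; the
       // authoritative implementation lives in macroAssembler_x86.cpp, and the
       // VM_Version::supports_avx512bw() query is assumed here):
       //   if (VM_Version::supports_avx512bw()) { kmovql(dst, src); }  // 64-bit mask move
       //   else                                 { kmovwl(dst, src); }  // 16-bit move, AVX512F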
1158 
1159   using Assembler::movddup;
1160   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1161 
1162   using Assembler::vmovddup;
1163   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1164 
1165   // AVX Unaligned forms
1166   void vmovdqu(Address     dst, XMMRegister    src);
1167   void vmovdqu(XMMRegister dst, Address        src);
1168   void vmovdqu(XMMRegister dst, XMMRegister    src);
1169   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1170   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1171 
1172   // AVX512 Unaligned
1173   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1174   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1175 
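       // The register-to-register forms below are only emitted when they have an effect:
       // a self-move (dst == src) with no mask merge requested (mask == k0) is a no-op
       // and is elided.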
1176   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1177   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1178 
1179   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1180     if (dst->encoding() != src->encoding() || mask != k0)  {
1181       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1182     }
1183   }
1184   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1185   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1186   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1187 
1188   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1189   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1190 
1191   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1192     if (dst->encoding() != src->encoding() || mask != k0) {
1193       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1194     }
1195   }
1196   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1197   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1198   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1199 
1200   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1201      if (dst->encoding() != src->encoding()) {
1202        Assembler::evmovdqul(dst, src, vector_len);
1203      }
1204   }
1205   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1206   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1207 
1208   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1209     if (dst->encoding() != src->encoding() || mask != k0)  {
1210       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1211     }
1212   }
1213   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1214   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1215   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1216 
1217   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1218     if (dst->encoding() != src->encoding()) {
1219       Assembler::evmovdquq(dst, src, vector_len);
1220     }
1221   }
1222   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1223   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1224   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1225 
1226   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1227     if (dst->encoding() != src->encoding() || mask != k0) {
1228       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1229     }
1230   }
1231   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1232   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1233   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1234 
1235   // Move Aligned Double Quadword
1236   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1237   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1238   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1239 
1240   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1241   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1242   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1243   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1244 
1245   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1246   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1247   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1248 
1249   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1250   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1251   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1252 
1253   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1254   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1255   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1256 
1257   // Carry-Less Multiplication Quadword
1258   void pclmulldq(XMMRegister dst, XMMRegister src) {
1259     // 0x00 - multiply lower 64 bits [0:63]
1260     Assembler::pclmulqdq(dst, src, 0x00);
1261   }
1262   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1263     // 0x11 - multiply upper 64 bits [64:127]
1264     Assembler::pclmulqdq(dst, src, 0x11);
1265   }
1266 
1267   void pcmpeqb(XMMRegister dst, XMMRegister src);
1268   void pcmpeqw(XMMRegister dst, XMMRegister src);
1269 
1270   void pcmpestri(XMMRegister dst, Address src, int imm8);
1271   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1272 
1273   void pmovzxbw(XMMRegister dst, XMMRegister src);
1274   void pmovzxbw(XMMRegister dst, Address src);
1275 
1276   void pmovmskb(Register dst, XMMRegister src);
1277 
1278   void ptest(XMMRegister dst, XMMRegister src);
1279 
1280   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1281   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1282   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1283 
1284   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1285   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1286   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1287 
1288   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1289   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1290   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1291 
1292   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1293   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1294   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1295 
1296   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1297   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1298   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1299 
1300   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1301   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1302   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1303 
1304   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1305   void xorpd(XMMRegister dst, XMMRegister    src);
1306   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1307   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1308 
1309   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1310   void xorps(XMMRegister dst, XMMRegister    src);
1311   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1312   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1313 
1314   // Shuffle Bytes
1315   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1316   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
1317   void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1318   // AVX 3-operands instructions
1319 
1320   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1321   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1322   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1323 
1324   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1325   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1326   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1327 
1328   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1329   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1330 
1331   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1332   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1333   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1334 
1335   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1336   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1337 
1338   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1339   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1340   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1341 
1342   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1343   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1344   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1345 
1346   using Assembler::vpbroadcastd;
1347   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1348 
1349   using Assembler::vpbroadcastq;
1350   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1351 
1352   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1353 
1354   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1355   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1356 
1357   // Vector compares
1358   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1359     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1360   }
1361   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1362 
1363   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1364     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1365   }
1366   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1367 
1368   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1369     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1370   }
1371   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1372 
1373   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1374     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1375   }
1376   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1377 
1378   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1379 
1380   // Emit comparison instruction for the specified comparison predicate.
1381   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1382   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1383 
1384   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1385   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1386 
1387   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1388 
1389   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1390   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1391 
1392   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1393   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1394   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1395 
1396   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1397   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1398 
1399   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1400   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1401 
1402   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1403   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1404 
1405   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1406   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1407 
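       // In the shift helpers below, 'is_varshift' selects between the uniform-count form
       // (a single shift count taken from 'src') and the AVX-512 variable-shift form
       // (evpsllv*/evpsrlv*/evpsrav*), where each element is shifted by the corresponding
       // element of 'src'.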
1408   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1409     if (!is_varshift) {
1410       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1411     } else {
1412       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1413     }
1414   }
1415   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1416     if (!is_varshift) {
1417       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1418     } else {
1419       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1420     }
1421   }
1422   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1423     if (!is_varshift) {
1424       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1425     } else {
1426       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1427     }
1428   }
1429   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1430     if (!is_varshift) {
1431       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1432     } else {
1433       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1434     }
1435   }
1436   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1437     if (!is_varshift) {
1438       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1439     } else {
1440       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1441     }
1442   }
1443   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1444     if (!is_varshift) {
1445       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1446     } else {
1447       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1448     }
1449   }
1450   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1451     if (!is_varshift) {
1452       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1453     } else {
1454       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1455     }
1456   }
1457   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1458     if (!is_varshift) {
1459       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1460     } else {
1461       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1462     }
1463   }
1464   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1465     if (!is_varshift) {
1466       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1467     } else {
1468       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1469     }
1470   }
1471 
1472   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1473   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1474   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1475   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1476 
1477   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1478   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1479 
1480   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1481   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1482 
1483   void vptest(XMMRegister dst, XMMRegister src);
1484   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1485 
1486   void punpcklbw(XMMRegister dst, XMMRegister src);
1487   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1488 
1489   void pshufd(XMMRegister dst, Address src, int mode);
1490   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1491 
1492   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1493   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1494 
1495   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1496   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1497   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1498 
1499   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1500   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1501   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1502 
1503   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1504 
1505   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1506   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1507   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1508 
1509   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1510   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1511   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1512 
1513   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1514   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1515   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1516 
1517   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1518   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1519   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1520 
1521   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1522   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1523   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1524 
1525   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1526   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1527   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1528 
1529   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1530   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1531 
1532   // AVX Vector instructions
1533 
1534   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1535   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1536   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1537 
1538   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1539   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1540   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1541 
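       // For 256-bit requests on AVX-only (pre-AVX2) targets, the integer vpxor below
       // falls back to the float-domain vxorpd; the bitwise result is identical, only
       // the execution domain differs.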
1542   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1543     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1544       Assembler::vpxor(dst, nds, src, vector_len);
1545     else
1546       Assembler::vxorpd(dst, nds, src, vector_len);
1547   }
1548   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1549     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1550       Assembler::vpxor(dst, nds, src, vector_len);
1551     else
1552       Assembler::vxorpd(dst, nds, src, vector_len);
1553   }
1554   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1555 
1556   // Simple versions for AVX2 256-bit vectors
1557   void vpxor(XMMRegister dst, XMMRegister src) {
1558     assert(UseAVX >= 2, "Should be at least AVX2");
1559     Assembler::vpxor(dst, dst, src, AVX_256bit);
1560   }
1561   void vpxor(XMMRegister dst, Address src) {
1562     assert(UseAVX >= 2, "Should be at least AVX2");
1563     Assembler::vpxor(dst, dst, src, AVX_256bit);
1564   }
1565 
1566   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1567   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1568 
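       // The 128-bit insert/extract helpers below pick an encoding by feature: the EVEX
       // 32x4 forms on AVX-512 targets without the VL extension (supports_avx512novl()),
       // the AVX2 integer forms where available, and the AVX1 float forms otherwise.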
1569   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1570     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1571       Assembler::vinserti32x4(dst, nds, src, imm8);
1572     } else if (UseAVX > 1) {
1573       // vinserti128 is available only in AVX2
1574       Assembler::vinserti128(dst, nds, src, imm8);
1575     } else {
1576       Assembler::vinsertf128(dst, nds, src, imm8);
1577     }
1578   }
1579 
1580   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1581     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1582       Assembler::vinserti32x4(dst, nds, src, imm8);
1583     } else if (UseAVX > 1) {
1584       // vinserti128 is available only in AVX2
1585       Assembler::vinserti128(dst, nds, src, imm8);
1586     } else {
1587       Assembler::vinsertf128(dst, nds, src, imm8);
1588     }
1589   }
1590 
1591   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1592     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1593       Assembler::vextracti32x4(dst, src, imm8);
1594     } else if (UseAVX > 1) {
1595       // vextracti128 is available only in AVX2
1596       Assembler::vextracti128(dst, src, imm8);
1597     } else {
1598       Assembler::vextractf128(dst, src, imm8);
1599     }
1600   }
1601 
1602   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1603     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1604       Assembler::vextracti32x4(dst, src, imm8);
1605     } else if (UseAVX > 1) {
1606       // vextracti128 is available only in AVX2
1607       Assembler::vextracti128(dst, src, imm8);
1608     } else {
1609       Assembler::vextractf128(dst, src, imm8);
1610     }
1611   }
1612 
1613   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1614   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1615     vinserti128(dst, dst, src, 1);
1616   }
1617   void vinserti128_high(XMMRegister dst, Address src) {
1618     vinserti128(dst, dst, src, 1);
1619   }
1620   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1621     vextracti128(dst, src, 1);
1622   }
1623   void vextracti128_high(Address dst, XMMRegister src) {
1624     vextracti128(dst, src, 1);
1625   }
1626 
1627   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1628     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1629       Assembler::vinsertf32x4(dst, dst, src, 1);
1630     } else {
1631       Assembler::vinsertf128(dst, dst, src, 1);
1632     }
1633   }
1634 
1635   void vinsertf128_high(XMMRegister dst, Address src) {
1636     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1637       Assembler::vinsertf32x4(dst, dst, src, 1);
1638     } else {
1639       Assembler::vinsertf128(dst, dst, src, 1);
1640     }
1641   }
1642 
1643   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1644     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1645       Assembler::vextractf32x4(dst, src, 1);
1646     } else {
1647       Assembler::vextractf128(dst, src, 1);
1648     }
1649   }
1650 
1651   void vextractf128_high(Address dst, XMMRegister src) {
1652     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1653       Assembler::vextractf32x4(dst, src, 1);
1654     } else {
1655       Assembler::vextractf128(dst, src, 1);
1656     }
1657   }
1658 
1659   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1660   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1661     Assembler::vinserti64x4(dst, dst, src, 1);
1662   }
1663   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1664     Assembler::vinsertf64x4(dst, dst, src, 1);
1665   }
1666   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1667     Assembler::vextracti64x4(dst, src, 1);
1668   }
1669   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1670     Assembler::vextractf64x4(dst, src, 1);
1671   }
1672   void vextractf64x4_high(Address dst, XMMRegister src) {
1673     Assembler::vextractf64x4(dst, src, 1);
1674   }
1675   void vinsertf64x4_high(XMMRegister dst, Address src) {
1676     Assembler::vinsertf64x4(dst, dst, src, 1);
1677   }
1678 
1679   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1680   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1681     vinserti128(dst, dst, src, 0);
1682   }
1683   void vinserti128_low(XMMRegister dst, Address src) {
1684     vinserti128(dst, dst, src, 0);
1685   }
1686   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1687     vextracti128(dst, src, 0);
1688   }
1689   void vextracti128_low(Address dst, XMMRegister src) {
1690     vextracti128(dst, src, 0);
1691   }
1692 
1693   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1694     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1695       Assembler::vinsertf32x4(dst, dst, src, 0);
1696     } else {
1697       Assembler::vinsertf128(dst, dst, src, 0);
1698     }
1699   }
1700 
1701   void vinsertf128_low(XMMRegister dst, Address src) {
1702     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1703       Assembler::vinsertf32x4(dst, dst, src, 0);
1704     } else {
1705       Assembler::vinsertf128(dst, dst, src, 0);
1706     }
1707   }
1708 
1709   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1710     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1711       Assembler::vextractf32x4(dst, src, 0);
1712     } else {
1713       Assembler::vextractf128(dst, src, 0);
1714     }
1715   }
1716 
1717   void vextractf128_low(Address dst, XMMRegister src) {
1718     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1719       Assembler::vextractf32x4(dst, src, 0);
1720     } else {
1721       Assembler::vextractf128(dst, src, 0);
1722     }
1723   }
1724 
1725   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1726   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1727     Assembler::vinserti64x4(dst, dst, src, 0);
1728   }
1729   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1730     Assembler::vinsertf64x4(dst, dst, src, 0);
1731   }
1732   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1733     Assembler::vextracti64x4(dst, src, 0);
1734   }
1735   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1736     Assembler::vextractf64x4(dst, src, 0);
1737   }
1738   void vextractf64x4_low(Address dst, XMMRegister src) {
1739     Assembler::vextractf64x4(dst, src, 0);
1740   }
1741   void vinsertf64x4_low(XMMRegister dst, Address src) {
1742     Assembler::vinsertf64x4(dst, dst, src, 0);
1743   }
1744 
1745   // Carry-Less Multiplication Quadword
1746   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1747     // 0x00 - multiply lower 64 bits [0:63]
1748     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1749   }
1750   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1751     // 0x11 - multiply upper 64 bits [64:127]
1752     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1753   }
1754   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1755     // 0x10 - multiply nds[0:63] and src[64:127]
1756     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1757   }
1758   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1759     // 0x01 - multiply nds[64:127] and src[0:63]
1760     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1761   }
1762 
1763   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1764     // 0x00 - multiply lower 64 bits [0:63]
1765     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1766   }
1767   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1768     // 0x11 - multiply upper 64 bits [64:127]
1769     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1770   }
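       // In all of the pclmulqdq-based helpers above, imm8 bit 0 selects which 64-bit
       // half of the first source (dst/nds) participates and bit 4 selects the half of
       // the second source; the 0x00/0x11/0x10/0x01 selectors are formed accordingly.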
1771 
1772   // AVX-512 mask operations.
1773   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1774   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1775   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1776   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1777   void kortest(uint masklen, KRegister src1, KRegister src2);
1778   void ktest(uint masklen, KRegister src1, KRegister src2);
1779 
1780   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1781   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1782 
1783   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1784   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1785 
1786   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1787   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1788 
1789   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1790   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1791 
1792   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1793   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1794   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1795   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1796 
1797   using Assembler::evpandq;
1798   void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1799 
1800   using Assembler::evporq;
1801   void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1802 
1803   using Assembler::vpternlogq;
1804   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
1805 
1806   void cmov32( Condition cc, Register dst, Address  src);
1807   void cmov32( Condition cc, Register dst, Register src);
1808 
1809   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1810 
1811   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1812   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1813 
1814   void movoop(Register dst, jobject obj);
1815   void movoop(Address  dst, jobject obj, Register rscratch);
1816 
1817   void mov_metadata(Register dst, Metadata* obj);
1818   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
1819 
1820   void movptr(Register     dst, Register       src);
1821   void movptr(Register     dst, Address        src);
1822   void movptr(Register     dst, AddressLiteral src);
1823   void movptr(Register     dst, ArrayAddress   src);
1824   void movptr(Register     dst, intptr_t       src);
1825   void movptr(Address      dst, Register       src);
1826   void movptr(Address      dst, int32_t        imm);
1827   void movptr(Address      dst, intptr_t       src, Register rscratch);
1828   void movptr(ArrayAddress dst, Register       src, Register rscratch);
1829 
1830   void movptr(Register dst, RegisterOrConstant src) {
1831     if (src.is_constant()) movptr(dst, src.as_constant());
1832     else                   movptr(dst, src.as_register());
1833   }
1834 
1835 
1836   // to avoid hiding movl
1837   void mov32(Register       dst, AddressLiteral src);
1838   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
1839 
1840   // Import other mov() methods from the parent class or else
1841   // they will be hidden by the following overriding declaration.
1842   using Assembler::movdl;
1843   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1844 
1845   using Assembler::movq;
1846   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1847 
1848   // Can push value or effective address
1849   void pushptr(AddressLiteral src, Register rscratch);
1850 
1851   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
1852   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
1853 
1854   void pushoop(jobject obj, Register rscratch);
1855   void pushklass(Metadata* obj, Register rscratch);
1856 
1857   // Sign-extend as needed: a 32-bit ('l') value to a pointer-sized element.
1858   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
1859   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
1860 
1861 
1862  public:
1863   // clear memory of size 'cnt' qwords, starting at 'base';
1864   // if 'is_large' is set, do not try to produce a short loop
1865   void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large, KRegister mask=knoreg);
1866 
1867   // clear memory initialization sequence for a constant size
1868   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1869 
1870   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1871   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1872 
1873   // Fill primitive arrays
1874   void generate_fill(BasicType t, bool aligned,
1875                      Register to, Register value, Register count,
1876                      Register rtmp, XMMRegister xtmp);
1877 
1878   void encode_iso_array(Register src, Register dst, Register len,
1879                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1880                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1881 
1882 #ifdef _LP64
1883   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1884   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1885                              Register y, Register y_idx, Register z,
1886                              Register carry, Register product,
1887                              Register idx, Register kdx);
1888   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1889                               Register yz_idx, Register idx,
1890                               Register carry, Register product, int offset);
1891   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1892                                     Register carry, Register carry2,
1893                                     Register idx, Register jdx,
1894                                     Register yz_idx1, Register yz_idx2,
1895                                     Register tmp, Register tmp3, Register tmp4);
1896   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
1897                                Register yz_idx, Register idx, Register jdx,
1898                                Register carry, Register product,
1899                                Register carry2);
1900   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
1901                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
1902   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
1903                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1904   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
1905                             Register tmp2);
1906   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
1907                        Register rdxReg, Register raxReg);
1908   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
1909   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1910                        Register tmp3, Register tmp4);
1911   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1912                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1913 
1914   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
1915                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1916                Register raxReg);
1917   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
1918                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1919                Register raxReg);
1920   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
1921                            Register result, Register tmp1, Register tmp2,
1922                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
1923 #endif
1924 
1925   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
1926   void update_byte_crc32(Register crc, Register val, Register table);
1927   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
1928 
1929 
1930 #ifdef _LP64
1931   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
1932   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
1933                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
1934                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
1935 #endif // _LP64
1936 
1937   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
1938   // Note on a naming convention:
1939   // Prefix w = register only used on a Westmere+ architecture
1940   // Prefix n = register only used on a Nehalem architecture
1941 #ifdef _LP64
1942   void crc32c_ipl_alg4(Register in_out, uint32_t n,
1943                        Register tmp1, Register tmp2, Register tmp3);
1944 #else
1945   void crc32c_ipl_alg4(Register in_out, uint32_t n,
1946                        Register tmp1, Register tmp2, Register tmp3,
1947                        XMMRegister xtmp1, XMMRegister xtmp2);
1948 #endif
1949   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
1950                         Register in_out,
1951                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
1952                         XMMRegister w_xtmp2,
1953                         Register tmp1,
1954                         Register n_tmp2, Register n_tmp3);
1955   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
1956                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1957                        Register tmp1, Register tmp2,
1958                        Register n_tmp3);
1959   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
1960                          Register in_out1, Register in_out2, Register in_out3,
1961                          Register tmp1, Register tmp2, Register tmp3,
1962                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1963                          Register tmp4, Register tmp5,
1964                          Register n_tmp6);
1965   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
1966                             Register tmp1, Register tmp2, Register tmp3,
1967                             Register tmp4, Register tmp5, Register tmp6,
1968                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1969                             bool is_pclmulqdq_supported);
1970   // Fold 128-bit data chunk
1971   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
1972   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
1973 #ifdef _LP64
1974   // Fold 512-bit data chunk
1975   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
1976 #endif // _LP64
1977   // Fold 8-bit data
1978   void fold_8bit_crc32(Register crc, Register table, Register tmp);
1979   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
1980 
1981   // Compress char[] array to byte[].
1982   void char_array_compress(Register src, Register dst, Register len,
1983                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1984                            XMMRegister tmp4, Register tmp5, Register result,
1985                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
1986 
1987   // Inflate byte[] array to char[].
1988   void byte_array_inflate(Register src, Register dst, Register len,
1989                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
1990 
1991   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
1992                    Register length, Register temp, int vec_enc);
1993 
1994   void fill64_masked(uint shift, Register dst, int disp,
1995                          XMMRegister xmm, KRegister mask, Register length,
1996                          Register temp, bool use64byteVector = false);
1997 
1998   void fill32_masked(uint shift, Register dst, int disp,
1999                          XMMRegister xmm, KRegister mask, Register length,
2000                          Register temp);
2001 
2002   void fill32(Address dst, XMMRegister xmm);
2003 
2004   void fill32(Register dst, int disp, XMMRegister xmm);
2005 
2006   void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
2007 
2008   void fill64(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
2009 
2010 #ifdef _LP64
2011   void convert_f2i(Register dst, XMMRegister src);
2012   void convert_d2i(Register dst, XMMRegister src);
2013   void convert_f2l(Register dst, XMMRegister src);
2014   void convert_d2l(Register dst, XMMRegister src);
2015   void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
2016   void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);
2017 
2018   void cache_wb(Address line);
2019   void cache_wbsync(bool is_pre);
2020 
2021 #ifdef COMPILER2_OR_JVMCI
2022   void generate_fill_avx3(BasicType type, Register to, Register value,
2023                           Register count, Register rtmp, XMMRegister xtmp);
2024 #endif // COMPILER2_OR_JVMCI
2025 #endif // _LP64
2026 
2027   void vallones(XMMRegister dst, int vector_len);
2028 
2029   void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);
2030 
2031   void fast_lock_impl(Register obj, Register hdr, Register thread, Register tmp, Label& slow, bool rt_check_stack = true);
2032   void fast_unlock_impl(Register obj, Register hdr, Register tmp, Label& slow);
2033 };
2034 
2035 /**
2036  * class SkipIfEqual:
2037  *
2038  * Instantiating this class emits assembly code that jumps around any code
2039  * generated between the creation of the instance and its automatic destruction
2040  * at the end of the scope block. Whether the jump is taken depends on the value
2041  * of the flag passed to the constructor, which is checked at run time.
2042  */
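     // A hedged usage sketch (the flag and scratch register are illustrative): the code
     // in the scope is skipped at run time when *flag_addr equals the given value.
     //
     //   {
     //     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
     //     // ... probe code here runs only when DTraceMethodProbes is true ...
     //   } // label bound by ~SkipIfEqual()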
2043 class SkipIfEqual {
2044  private:
2045   MacroAssembler* _masm;
2046   Label _label;
2047 
2048  public:
2049    SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value, Register rscratch);
2050    ~SkipIfEqual();
2051 };
2052 
2053 #endif // CPU_X86_MACROASSEMBLER_X86_HPP