1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef CPU_X86_MACROASSEMBLER_X86_HPP
  26 #define CPU_X86_MACROASSEMBLER_X86_HPP
  27 
  28 #include "asm/assembler.hpp"
  29 #include "asm/register.hpp"
  30 #include "code/vmreg.inline.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "utilities/macros.hpp"
  33 #include "runtime/rtmLocking.hpp"
  34 #include "runtime/vm_version.hpp"
  35 #include "utilities/checkedCast.hpp"
  36 
  37 // MacroAssembler extends Assembler by frequently used macros.
  38 //
  39 // Instructions for which a 'better' code sequence exists depending
  40 // on arguments should also go in here.
  41 
  42 class MacroAssembler: public Assembler {
  43   friend class LIR_Assembler;
  44   friend class Runtime1;      // as_Address()
  45 
  46  public:
  47   // Support for VM calls
  48   //
  49   // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  50   // may customize this version by overriding it for its purposes (e.g., to save/restore
  51   // additional registers when doing a VM call).
  52 
  53   virtual void call_VM_leaf_base(
  54     address entry_point,               // the entry point
  55     int     number_of_arguments        // the number of arguments to pop after the call
  56   );
  57 
  58  protected:
  59   // This is the base routine called by the different versions of call_VM. The interpreter
  60   // may customize this version by overriding it for its purposes (e.g., to save/restore
  61   // additional registers when doing a VM call).
  62   //
  63   // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  64   // returns the register which contains the thread upon return. If a thread register has been
  65   // specified, the return value will correspond to that register. If no last_java_sp is specified
  66   // (noreg) then rsp will be used instead.
  67   virtual void call_VM_base(           // returns the register containing the thread upon return
  68     Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
  69     Register java_thread,              // the thread if computed before     ; use noreg otherwise
  70     Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
  71     address  entry_point,              // the entry point
  72     int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
  73     bool     check_exceptions          // whether to check for pending exceptions after return
  74   );
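  // A minimal usage sketch (illustrative only; the entry point and the argument
  // count below are assumptions, not part of this interface):
  //   call_VM_base(rax,     // receive the oop result in rax
  //                noreg,   // let call_VM_base pick/load the thread register
  //                noreg,   // use rsp for last_Java_sp
  //                CAST_FROM_FN_PTR(address, SomeRuntime::some_entry),  // hypothetical entry point
  //                1,       // one argument (excluding the thread) to pop after the call
  //                true);   // check for pending exceptions on return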
  75 
  76   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
  77 
  78   // helpers for FPU flag access
  79   // tmp is a temporary register, if none is available use noreg
  80   void save_rax   (Register tmp);
  81   void restore_rax(Register tmp);
  82 
  83  public:
  84   MacroAssembler(CodeBuffer* code) : Assembler(code) {}
  85 
  86   // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  87   // The implementation is only non-empty for the InterpreterMacroAssembler,
  88   // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  89   virtual void check_and_handle_popframe(Register java_thread);
  90   virtual void check_and_handle_earlyret(Register java_thread);
  91 
  92   Address as_Address(AddressLiteral adr);
  93   Address as_Address(ArrayAddress adr, Register rscratch);
  94 
  95   // Support for null-checks
  96   //
  97   // Generates code that causes a null OS exception if the content of reg is null.
  98   // If the accessed location is M[reg + offset] and the offset is known, provide the
  99   // offset. No explicit code generation is needed if the offset is within a certain
 100   // range (0 <= offset <= page_size).
 101 
 102   void null_check(Register reg, int offset = -1);
 103   static bool needs_explicit_null_check(intptr_t offset);
 104   static bool uses_implicit_null_check(void* address);
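  // Example (sketch; the register and offset names are illustrative):
  //   null_check(rbx, byte_offset);  // emits nothing when 0 <= byte_offset <= page_size,
  //                                  // the later access at [rbx + byte_offset] traps instead
  //   null_check(rbx);               // offset unknown -> an explicit faulting access is generated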
 105 
 106   // Required platform-specific helpers for Label::patch_instructions.
 107   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
 108   void pd_patch_instruction(address branch, address target, const char* file, int line) {
 109     unsigned char op = branch[0];
 110     assert(op == 0xE8 /* call */ ||
 111         op == 0xE9 /* jmp */ ||
 112         op == 0xEB /* short jmp */ ||
 113         (op & 0xF0) == 0x70 /* short jcc */ ||
 114         (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
 115         (op == 0xC7 && branch[1] == 0xF8) /* xbegin */,
 116         "Invalid opcode at patch point");
 117 
 118     if (op == 0xEB || (op & 0xF0) == 0x70) {
 119       // short offset operators (jmp and jcc)
 120       char* disp = (char*) &branch[1];
 121       int imm8 = checked_cast<int>(target - (address) &disp[1]);
 122       guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
 123                 file == nullptr ? "<null>" : file, line);
 124       *disp = (char)imm8;
 125     } else {
 126       int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
 127       int imm32 = checked_cast<int>(target - (address) &disp[1]);
 128       *disp = imm32;
 129     }
 130   }
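  // Worked example (sketch): patching a short jmp located at address B so that it
  // reaches target T. The 0xEB opcode is followed by an 8-bit displacement relative
  // to the end of the instruction, so the code above stores (char)(T - (B + 2)) at
  // B + 1 and guarantees that this distance fits in 8 bits.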
 131 
 132   // The following 4 methods return the offset of the appropriate move instruction
 133 
 134   // Support for fast byte/short loading with zero extension (depending on particular CPU)
 135   int load_unsigned_byte(Register dst, Address src);
 136   int load_unsigned_short(Register dst, Address src);
 137 
 138   // Support for fast byte/short loading with sign extension (depending on particular CPU)
 139   int load_signed_byte(Register dst, Address src);
 140   int load_signed_short(Register dst, Address src);
 141 
 142   // Support for sign-extension (hi:lo = extend_sign(lo))
 143   void extend_sign(Register hi, Register lo);
 144 
 145   // Load and store values by size and signed-ness
 146   void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
 147   void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);
 148 
 149   // Support for inc/dec with optimal instruction selection depending on value
 150 
 151   void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
 152   void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
 153   void increment(Address dst, int value = 1)  { LP64_ONLY(incrementq(dst, value)) NOT_LP64(incrementl(dst, value)) ; }
 154   void decrement(Address dst, int value = 1)  { LP64_ONLY(decrementq(dst, value)) NOT_LP64(decrementl(dst, value)) ; }
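  // Sketch of the "optimal selection" idea (exact encodings are an implementation
  // detail): increment(reg) can be emitted as a single inc/incq, while
  // increment(reg, 1000) falls back to an add/addq with an immediate, so callers
  // simply use the generic form and let the macro pick the cheapest encoding.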
 155 
 156   void decrementl(Address dst, int value = 1);
 157   void decrementl(Register reg, int value = 1);
 158 
 159   void decrementq(Register reg, int value = 1);
 160   void decrementq(Address dst, int value = 1);
 161 
 162   void incrementl(Address dst, int value = 1);
 163   void incrementl(Register reg, int value = 1);
 164 
 165   void incrementq(Register reg, int value = 1);
 166   void incrementq(Address dst, int value = 1);
 167 
 168   void incrementl(AddressLiteral dst, Register rscratch = noreg);
 169   void incrementl(ArrayAddress   dst, Register rscratch);
 170 
 171   void incrementq(AddressLiteral dst, Register rscratch = noreg);
 172 
 173   // Support optimal SSE move instructions.
 174   void movflt(XMMRegister dst, XMMRegister src) {
 175     if (dst->encoding() == src->encoding()) return;
 176     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
 177     else                       { movss (dst, src); return; }
 178   }
 179   void movflt(XMMRegister dst, Address src) { movss(dst, src); }
 180   void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 181   void movflt(Address dst, XMMRegister src) { movss(dst, src); }
 182 
 183   // Move with zero extension
 184   void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }
 185 
 186   void movdbl(XMMRegister dst, XMMRegister src) {
 187     if (dst->encoding() == src->encoding()) return;
 188     if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
 189     else                       { movsd (dst, src); return; }
 190   }
 191 
 192   void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 193 
 194   void movdbl(XMMRegister dst, Address src) {
 195     if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
 196     else                         { movlpd(dst, src); return; }
 197   }
 198   void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }
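  // Note (sketch of the rationale): for register-to-register moves the packed forms
  // movaps/movapd are preferred when UseXmmRegToRegMoveAll is set because movss/movsd
  // only write the low lanes and can create a partial-register dependency on the
  // previous contents of dst on some CPUs.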
 199 
 200   void flt_to_flt16(Register dst, XMMRegister src, XMMRegister tmp) {
 201     // Use a separate tmp XMM register because the caller may
 202     // require the src XMM register to be unchanged (as in x86.ad).
 203     vcvtps2ph(tmp, src, 0x04, Assembler::AVX_128bit);
 204     movdl(dst, tmp);
 205     movswl(dst, dst);
 206   }
 207 
 208   void flt16_to_flt(XMMRegister dst, Register src) {
 209     movdl(dst, src);
 210     vcvtph2ps(dst, dst, Assembler::AVX_128bit);
 211   }
 212 
 213   // Alignment
 214   void align32();
 215   void align64();
 216   void align(int modulus);
 217   void align(int modulus, int target);
 218 
 219   void post_call_nop();
 220   // A 5 byte nop that is safe for patching (see patch_verified_entry)
 221   void fat_nop();
 222 
 223   // Stack frame creation/removal
 224   void enter();
 225   void leave();
 226 
 227   // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
 228   // The pointer will be loaded into the thread register.
 229   void get_thread(Register thread);
 230 
 231 #ifdef _LP64
 232   // Support for argument shuffling
 233 
 234   // bias in bytes
 235   void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 236   void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 237   void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 238   void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 239   void move_ptr(VMRegPair src, VMRegPair dst);
 240   void object_move(OopMap* map,
 241                    int oop_handle_offset,
 242                    int framesize_in_slots,
 243                    VMRegPair src,
 244                    VMRegPair dst,
 245                    bool is_receiver,
 246                    int* receiver_offset);
 247 #endif // _LP64
 248 
 249   // Support for VM calls
 250   //
 251   // It is imperative that all calls into the VM are handled via the call_VM macros.
 252   // They make sure that the stack linkage is setup correctly. call_VM's correspond
 253   // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
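  // A minimal sketch of the common pattern (the entry point and argument register
  // below are illustrative assumptions, not prescriptions):
  //   call_VM(rax,                                                  // oop result, if any
  //           CAST_FROM_FN_PTR(address, SomeRuntime::some_entry),   // hypothetical target
  //           c_rarg1);                                             // single Register argument
  // The macro sets up the stack linkage, passes the current thread implicitly and,
  // with the default check_exceptions = true, tests for a pending exception on return.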
 254 
 255 
 256   void call_VM(Register oop_result,
 257                address entry_point,
 258                bool check_exceptions = true);
 259   void call_VM(Register oop_result,
 260                address entry_point,
 261                Register arg_1,
 262                bool check_exceptions = true);
 263   void call_VM(Register oop_result,
 264                address entry_point,
 265                Register arg_1, Register arg_2,
 266                bool check_exceptions = true);
 267   void call_VM(Register oop_result,
 268                address entry_point,
 269                Register arg_1, Register arg_2, Register arg_3,
 270                bool check_exceptions = true);
 271 
 272   // Overloadings with last_Java_sp
 273   void call_VM(Register oop_result,
 274                Register last_java_sp,
 275                address entry_point,
 276                int number_of_arguments = 0,
 277                bool check_exceptions = true);
 278   void call_VM(Register oop_result,
 279                Register last_java_sp,
 280                address entry_point,
 281                Register arg_1,
 282                bool check_exceptions = true);
 283   void call_VM(Register oop_result,
 284                Register last_java_sp,
 285                address entry_point,
 286                Register arg_1, Register arg_2,
 287                bool check_exceptions = true);
 288   void call_VM(Register oop_result,
 289                Register last_java_sp,
 290                address entry_point,
 291                Register arg_1, Register arg_2, Register arg_3,
 292                bool check_exceptions = true);
 293 
 294   void get_vm_result  (Register oop_result, Register thread);
 295   void get_vm_result_2(Register metadata_result, Register thread);
 296 
 297   // These always tightly bind to MacroAssembler::call_VM_base
 298   // bypassing the virtual implementation
 299   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
 300   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
 301   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
 302   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
 303   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);
 304 
 305   void call_VM_leaf0(address entry_point);
 306   void call_VM_leaf(address entry_point,
 307                     int number_of_arguments = 0);
 308   void call_VM_leaf(address entry_point,
 309                     Register arg_1);
 310   void call_VM_leaf(address entry_point,
 311                     Register arg_1, Register arg_2);
 312   void call_VM_leaf(address entry_point,
 313                     Register arg_1, Register arg_2, Register arg_3);
 314 
 315   void call_VM_leaf(address entry_point,
 316                     Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 317 
 318   // These always tightly bind to MacroAssembler::call_VM_leaf_base
 319   // bypassing the virtual implementation
 320   void super_call_VM_leaf(address entry_point);
 321   void super_call_VM_leaf(address entry_point, Register arg_1);
 322   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
 323   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
 324   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 325 
 326   // last Java Frame (fills frame anchor)
 327   void set_last_Java_frame(Register thread,
 328                            Register last_java_sp,
 329                            Register last_java_fp,
 330                            address  last_java_pc,
 331                            Register rscratch);
 332 
 333   // thread in the default location (r15_thread on 64bit)
 334   void set_last_Java_frame(Register last_java_sp,
 335                            Register last_java_fp,
 336                            address  last_java_pc,
 337                            Register rscratch);
 338 
 339   void reset_last_Java_frame(Register thread, bool clear_fp);
 340 
 341   // thread in the default location (r15_thread on 64bit)
 342   void reset_last_Java_frame(bool clear_fp);
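  // Bracketing sketch for a hand-written stub (the argument choices are illustrative):
  //   set_last_Java_frame(noreg, rbp, nullptr /* pc may be filled in separately */, rscratch1);
  //   ... transition into the VM or native code ...
  //   reset_last_Java_frame(true /* clear_fp */);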
 343 
 344   // jobjects
 345   void clear_jobject_tag(Register possibly_non_local);
 346   void resolve_jobject(Register value, Register thread, Register tmp);
 347   void resolve_global_jobject(Register value, Register thread, Register tmp);
 348 
 349   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
 350   void c2bool(Register x);
 351 
 352   // C++ bool manipulation
 353 
 354   void movbool(Register dst, Address src);
 355   void movbool(Address dst, bool boolconst);
 356   void movbool(Address dst, Register src);
 357   void testbool(Register dst);
 358 
 359   void resolve_oop_handle(Register result, Register tmp);
 360   void resolve_weak_handle(Register result, Register tmp);
 361   void load_mirror(Register mirror, Register method, Register tmp);
 362   void load_method_holder_cld(Register rresult, Register rmethod);
 363 
 364   void load_method_holder(Register holder, Register method);
 365 
 366   // oop manipulations
 367 #ifdef _LP64
 368   void load_nklass_compact(Register dst, Register src);
 369 #endif
 370   void load_klass(Register dst, Register src, Register tmp);
 371   void store_klass(Register dst, Register src, Register tmp);
 372 
 373   // Compares the Klass pointer of an object to a given Klass (which might be narrow,
 374   // depending on UseCompressedClassPointers).
 375   void cmp_klass(Register klass, Register dst, Register tmp);
 376 
 377   // Compares the Klass pointers of the two objects src and dst. The result is in the condition flags.
 378   // Uses tmp1 and tmp2 as temporary registers.
 379   void cmp_klass(Register src, Register dst, Register tmp1, Register tmp2);
 380 
 381   void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
 382                       Register tmp1, Register thread_tmp);
 383   void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
 384                        Register tmp1, Register tmp2, Register tmp3);
 385 
 386   void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
 387                      Register thread_tmp = noreg, DecoratorSet decorators = 0);
 388   void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
 389                               Register thread_tmp = noreg, DecoratorSet decorators = 0);
 390   void store_heap_oop(Address dst, Register val, Register tmp1 = noreg,
 391                       Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
 392 
 393   // Used for storing null. All other oop constants should be
 394   // stored using routines that take a jobject.
 395   void store_heap_oop_null(Address dst);
 396 
 397 #ifdef _LP64
 398   void store_klass_gap(Register dst, Register src);
 399 
 400   // This dummy is to prevent a call to store_heap_oop from
 401   // converting a zero (like null) into a Register by giving
 402   // the compiler two choices it can't resolve
 403 
 404   void store_heap_oop(Address dst, void* dummy);
 405 
 406   void encode_heap_oop(Register r);
 407   void decode_heap_oop(Register r);
 408   void encode_heap_oop_not_null(Register r);
 409   void decode_heap_oop_not_null(Register r);
 410   void encode_heap_oop_not_null(Register dst, Register src);
 411   void decode_heap_oop_not_null(Register dst, Register src);
 412 
 413   void set_narrow_oop(Register dst, jobject obj);
 414   void set_narrow_oop(Address dst, jobject obj);
 415   void cmp_narrow_oop(Register dst, jobject obj);
 416   void cmp_narrow_oop(Address dst, jobject obj);
 417 
 418   void encode_klass_not_null(Register r, Register tmp);
 419   void decode_klass_not_null(Register r, Register tmp);
 420   void encode_and_move_klass_not_null(Register dst, Register src);
 421   void decode_and_move_klass_not_null(Register dst, Register src);
 422   void set_narrow_klass(Register dst, Klass* k);
 423   void set_narrow_klass(Address dst, Klass* k);
 424   void cmp_narrow_klass(Register dst, Klass* k);
 425   void cmp_narrow_klass(Address dst, Klass* k);
 426 
 427   // If the heap base register is used, reinitialize it with the correct value
 428   void reinit_heapbase();
 429 
 430   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 431 
 432 #endif // _LP64
 433 
 434   // Int division/remainder for Java
 435   // (as idivl, but checks for special case as described in JVM spec.)
 436   // returns idivl instruction offset for implicit exception handling
 437   int corrected_idivl(Register reg);
 438 
 439   // Long division/remainder for Java
 440   // (as idivq, but checks for special case as described in JVM spec.)
 441   // returns idivq instruction offset for implicit exception handling
 442   int corrected_idivq(Register reg);
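  // The "special case" above is the JVM-spec corner min_int / -1 (min_long / -1 for
  // the q form): a raw idiv would raise a divide error on the overflow, so the
  // corrected form yields min_int with remainder 0 instead, e.g.
  //   0x80000000 / -1 == 0x80000000   and   0x80000000 % -1 == 0.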
 443 
 444   void int3();
 445 
 446   // Long operation macros for a 32bit cpu
 447   // Long negation for Java
 448   void lneg(Register hi, Register lo);
 449 
 450   // Long multiplication for Java
 451   // (destroys contents of eax, ebx, ecx and edx)
 452   void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y
 453 
 454   // Long shifts for Java
 455   // (semantics as described in JVM spec.)
 456   void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
 457   void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)
 458 
 459   // Long compare for Java
 460   // (semantics as described in JVM spec.)
 461   void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)
 462 
 463 
 464   // misc
 465 
 466   // Sign extension
 467   void sign_extend_short(Register reg);
 468   void sign_extend_byte(Register reg);
 469 
 470   // Division by power of 2, rounding towards 0
 471   void division_with_shift(Register reg, int shift_value);
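  // Why a plain arithmetic shift is not enough (sketch): for a negative dividend the
  // shift rounds toward negative infinity, e.g. -7 >> 2 == -2 while Java requires
  // -7 / 4 == -1, so the emitted code is expected to bias negative values by
  // (2^shift_value - 1) before shifting.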
 472 
 473 #ifndef _LP64
 474   // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
 475   //
 476   // CF (corresponds to C0) if x < y
 477   // PF (corresponds to C2) if unordered
 478   // ZF (corresponds to C3) if x = y
 479   //
 480   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 481   // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
 482   void fcmp(Register tmp);
 483   // Variant of the above which allows y to be further down the stack
 484   // and which only pops x and y if specified. If pop_right is
 485   // specified then pop_left must also be specified.
 486   void fcmp(Register tmp, int index, bool pop_left, bool pop_right);
 487 
 488   // Floating-point comparison for Java
 489   // Compares the top-most stack entries on the FPU stack and stores the result in dst.
 490   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 491   // (semantics as described in JVM spec.)
 492   void fcmp2int(Register dst, bool unordered_is_less);
 493   // Variant of the above which allows y to be further down the stack
 494   // and which only pops x and y if specified. If pop_right is
 495   // specified then pop_left must also be specified.
 496   void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);
 497 
 498   // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
 499   // tmp is a temporary register, if none is available use noreg
 500   void fremr(Register tmp);
 501 
 502   // only if +VerifyFPU
 503   void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
 504 #endif // !_LP64
 505 
 506   // dst = c = a * b + c
 507   void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 508   void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 509 
 510   void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 511   void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 512   void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 513   void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 514 
 515 
 516   // same as fcmp2int, but using SSE2
 517   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 518   void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 519 
 520   // branch to L if FPU flag C2 is set/not set
 521   // tmp is a temporary register, if none is available use noreg
 522   void jC2 (Register tmp, Label& L);
 523   void jnC2(Register tmp, Label& L);
 524 
 525   // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
 526   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 527   void load_float(Address src);
 528 
 529   // Store float value to 'address'. If UseSSE >= 1, the value is stored
 530   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 531   void store_float(Address dst);
 532 
 533   // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
 534   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 535   void load_double(Address src);
 536 
 537   // Store double value to 'address'. If UseSSE >= 2, the value is stored
 538   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 539   void store_double(Address dst);
 540 
 541 #ifndef _LP64
 542   // Pop ST (ffree & fincstp combined)
 543   void fpop();
 544 
 545   void empty_FPU_stack();
 546 #endif // !_LP64
 547 
 548   void push_IU_state();
 549   void pop_IU_state();
 550 
 551   void push_FPU_state();
 552   void pop_FPU_state();
 553 
 554   void push_CPU_state();
 555   void pop_CPU_state();
 556 
 557   void push_cont_fastpath();
 558   void pop_cont_fastpath();
 559 
 560   void inc_held_monitor_count();
 561   void dec_held_monitor_count();
 562 
 563   DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)
 564 
 565   // Round reg up to a multiple of modulus (which must be a power of two)
 566   void round_to(Register reg, int modulus);
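  // A sketch of the usual lowering (an assumption, not a contract):
  //   addptr(reg, modulus - 1);
  //   andptr(reg, -modulus);
  // which is why modulus must be a power of two.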
 567 
 568 private:
 569   // General purpose and XMM registers potentially clobbered by native code; there
 570   // is no need for FPU or AVX opmask related methods because C1/interpreter code
 571   // - always saves/restores the FPU state as a whole
 572   // - does not care about the AVX-512 opmask registers
 573   static RegSet call_clobbered_gp_registers();
 574   static XMMRegSet call_clobbered_xmm_registers();
 575 
 576   void push_set(XMMRegSet set, int offset);
 577   void pop_set(XMMRegSet set, int offset);
 578 
 579 public:
 580   void push_set(RegSet set, int offset = -1);
 581   void pop_set(RegSet set, int offset = -1);
 582 
 583   // Push and pop everything that might be clobbered by a native
 584   // runtime call.
 585   // Only save the lower 64 bits of each vector register.
 586   // Additional registers can be excluded in a passed RegSet.
 587   void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
 588   void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);
 589 
 590   void push_call_clobbered_registers(bool save_fpu = true) {
 591     push_call_clobbered_registers_except(RegSet(), save_fpu);
 592   }
 593   void pop_call_clobbered_registers(bool restore_fpu = true) {
 594     pop_call_clobbered_registers_except(RegSet(), restore_fpu);
 595   }
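  // Usage sketch around a native helper (whether to save FPU/XMM state is situational;
  // the callee below is hypothetical):
  //   push_call_clobbered_registers();
  //   call(RuntimeAddress(CAST_FROM_FN_PTR(address, some_c_helper)));
  //   pop_call_clobbered_registers();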
 596 
 597   // allocation
 598   void tlab_allocate(
 599     Register thread,                   // Current thread
 600     Register obj,                      // result: pointer to object after successful allocation
 601     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
 602     int      con_size_in_bytes,        // object size in bytes if   known at compile time
 603     Register t1,                       // temp register
 604     Register t2,                       // temp register
 605     Label&   slow_case                 // continuation point if fast allocation fails
 606   );
 607   void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
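  // Allocation fast-path sketch (register choices and the size constant are illustrative):
  //   tlab_allocate(r15_thread, rax, noreg, instance_size_in_bytes, rbx, rcx, slow_case);
  //   // rax now points at the new, still uninitialized object; execution falls through
  //   // on success and jumps to slow_case when the TLAB cannot satisfy the request.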
 608 
 609   // interface method calling
 610   void lookup_interface_method(Register recv_klass,
 611                                Register intf_klass,
 612                                RegisterOrConstant itable_index,
 613                                Register method_result,
 614                                Register scan_temp,
 615                                Label& no_such_interface,
 616                                bool return_method = true);
 617 
 618   void lookup_interface_method_stub(Register recv_klass,
 619                                     Register holder_klass,
 620                                     Register resolved_klass,
 621                                     Register method_result,
 622                                     Register scan_temp,
 623                                     Register temp_reg2,
 624                                     Register receiver,
 625                                     int itable_index,
 626                                     Label& L_no_such_interface);
 627 
 628   // virtual method calling
 629   void lookup_virtual_method(Register recv_klass,
 630                              RegisterOrConstant vtable_index,
 631                              Register method_result);
 632 
 633   // Test sub_klass against super_klass, with fast and slow paths.
 634 
 635   // The fast path produces a tri-state answer: yes / no / maybe-slow.
 636   // One of the three labels can be null, meaning take the fall-through.
 637   // If super_check_offset is -1, the value is loaded up from super_klass.
 638   // No registers are killed, except temp_reg.
 639   void check_klass_subtype_fast_path(Register sub_klass,
 640                                      Register super_klass,
 641                                      Register temp_reg,
 642                                      Label* L_success,
 643                                      Label* L_failure,
 644                                      Label* L_slow_path,
 645                 RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
 646 
 647   // The rest of the type check; must be wired to a corresponding fast path.
 648   // It does not repeat the fast path logic, so don't use it standalone.
 649   // The temp_reg and temp2_reg can be noreg, if no temps are available.
 650   // Updates the sub's secondary super cache as necessary.
 651   // If set_cond_codes, condition codes will be Z on success, NZ on failure.
 652   void check_klass_subtype_slow_path(Register sub_klass,
 653                                      Register super_klass,
 654                                      Register temp_reg,
 655                                      Register temp2_reg,
 656                                      Label* L_success,
 657                                      Label* L_failure,
 658                                      bool set_cond_codes = false);
 659 
 660   // Simplified, combined version, good for typical uses.
 661   // Falls through on failure.
 662   void check_klass_subtype(Register sub_klass,
 663                            Register super_klass,
 664                            Register temp_reg,
 665                            Label& L_success);
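  // Usage sketch (registers are illustrative): a typical "is subtype, otherwise fall
  // through to the failure handling" check:
  //   Label L_ok;
  //   check_klass_subtype(rsi /* sub */, rax /* super */, rcx /* temp */, L_ok);
  //   // failure falls through to here
  //   ... handle the failure ...
  //   bind(L_ok);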
 666 
 667   void clinit_barrier(Register klass,
 668                       Register thread,
 669                       Label* L_fast_path = nullptr,
 670                       Label* L_slow_path = nullptr);
 671 
 672   // method handles (JSR 292)
 673   Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
 674 
 675   // Debugging
 676 
 677   // only if +VerifyOops
 678   void _verify_oop(Register reg, const char* s, const char* file, int line);
 679   void _verify_oop_addr(Address addr, const char* s, const char* file, int line);
 680 
 681   void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
 682     if (VerifyOops) {
 683       _verify_oop(reg, s, file, line);
 684     }
 685   }
 686   void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
 687     if (VerifyOops) {
 688       _verify_oop_addr(reg, s, file, line);
 689     }
 690   }
 691 
 692   // TODO: verify method and klass metadata (compare against vptr?)
 693   void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
 694   void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}
 695 
 696 #define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
 697 #define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
 698 #define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
 699 #define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
 700 #define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
 701 
 702   // Verify or restore cpu control state after JNI call
 703   void restore_cpu_control_state_after_jni(Register rscratch);
 704 
 705   // prints msg, dumps registers and stops execution
 706   void stop(const char* msg);
 707 
 708   // prints msg and continues
 709   void warn(const char* msg);
 710 
 711   // dumps registers and other state
 712   void print_state();
 713 
 714   static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
 715   static void debug64(char* msg, int64_t pc, int64_t regs[]);
 716   static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
 717   static void print_state64(int64_t pc, int64_t regs[]);
 718 
 719   void os_breakpoint();
 720 
 721   void untested()                                { stop("untested"); }
 722 
 723   void unimplemented(const char* what = "");
 724 
 725   void should_not_reach_here()                   { stop("should not reach here"); }
 726 
 727   void print_CPU_state();
 728 
 729   // Stack overflow checking
 730   void bang_stack_with_offset(int offset) {
 731     // stack grows down, caller passes positive offset
 732     assert(offset > 0, "must bang with negative offset");
 733     movl(Address(rsp, (-offset)), rax);
 734   }
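  // Example (sketch): bang_stack_with_offset(os::vm_page_size()) stores rax at
  // [rsp - page_size]; callers probe with increasing offsets so that every page of
  // the required stack area (including the shadow pages) is touched and will fault
  // early if unavailable.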
 735 
 736   // Writes to successive stack pages, up to the given size, to check for
 737   // stack overflow + shadow pages.  Also clobbers tmp.
 738   void bang_stack_size(Register size, Register tmp);
 739 
 740   // Check for reserved stack access in method being exited (for JIT)
 741   void reserved_stack_check();
 742 
 743   void safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod);
 744 
 745   void verify_tlab();
 746 
 747   static Condition negate_condition(Condition cond);
 748 
 749   // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
 750   // operands. In general the names are modified to avoid hiding the instruction in Assembler
 751   // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
 752   // here in MacroAssembler. The major exception to this rule is call.
 753 
 754   // Arithmetics
 755 
 756 
 757   void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
 758   void addptr(Address dst, Register src);
 759 
 760   void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
 761   void addptr(Register dst, int32_t src);
 762   void addptr(Register dst, Register src);
 763   void addptr(Register dst, RegisterOrConstant src) {
 764     if (src.is_constant()) addptr(dst, checked_cast<int>(src.as_constant()));
 765     else                   addptr(dst, src.as_register());
 766   }
 767 
 768   void andptr(Register dst, int32_t src);
 769   void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }
 770 
 771 #ifdef _LP64
 772   using Assembler::andq;
 773   void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
 774 #endif
 775 
 776   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 777 
 778   // renamed to drag out the casting of address to int32_t/intptr_t
 779   void cmp32(Register src1, int32_t imm);
 780 
 781   void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
 782   // compare reg - mem, or reg - &mem
 783   void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);
 784 
 785   void cmp32(Register src1, Address src2);
 786 
 787 #ifndef _LP64
 788   void cmpklass(Address dst, Metadata* obj);
 789   void cmpklass(Register dst, Metadata* obj);
 790   void cmpoop(Address dst, jobject obj);
 791 #endif // !_LP64
 792 
 793   void cmpoop(Register src1, Register src2);
 794   void cmpoop(Register src1, Address src2);
 795   void cmpoop(Register dst, jobject obj, Register rscratch);
 796 
 797   // NOTE: src2 must be the lval. This is NOT a mem-mem compare
 798   void cmpptr(Address src1, AddressLiteral src2, Register rscratch);
 799 
 800   void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);
 801 
 802   void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 803   void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 804   // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 805 
 806   void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 807   void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 808 
 809   // cmp64 to avoid hiding cmpq
 810   void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);
 811 
 812   void cmpxchgptr(Register reg, Address adr);
 813 
 814   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 815 
 816   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
 817   void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }
 818 
 819 
 820   void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }
 821 
 822   void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }
 823 
 824   void shlptr(Register dst, int32_t shift);
 825   void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }
 826 
 827   void shrptr(Register dst, int32_t shift);
 828   void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }
 829 
 830   void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
 831   void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }
 832 
 833   void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 834 
 835   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 836   void subptr(Register dst, int32_t src);
 837   // Force generation of a 4 byte immediate value even if it fits into 8bit
 838   void subptr_imm32(Register dst, int32_t src);
 839   void subptr(Register dst, Register src);
 840   void subptr(Register dst, RegisterOrConstant src) {
 841     if (src.is_constant()) subptr(dst, (int) src.as_constant());
 842     else                   subptr(dst,       src.as_register());
 843   }
 844 
 845   void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 846   void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 847 
 848   void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 849   void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 850 
 851   void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }
 852 
 853 
 854 
 855   // Helper functions for statistics gathering.
 856   // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
 857   void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
 858   // Unconditional atomic increment.
 859   void atomic_incl(Address counter_addr);
 860   void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
 861 #ifdef _LP64
 862   void atomic_incq(Address counter_addr);
 863   void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
 864 #endif
 865   void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { LP64_ONLY(atomic_incq(counter_addr, rscratch)) NOT_LP64(atomic_incl(counter_addr, rscratch)) ; }
 866   void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
 867 
 868   void lea(Register dst, Address        adr) { Assembler::lea(dst, adr); }
 869   void lea(Register dst, AddressLiteral adr);
 870   void lea(Address  dst, AddressLiteral adr, Register rscratch);
 871 
 872   void leal32(Register dst, Address src) { leal(dst, src); }
 873 
 874   // Import other testl() methods from the parent class or else
 875   // they will be hidden by the following overriding declaration.
 876   using Assembler::testl;
 877   void testl(Address dst, int32_t imm32);
 878   void testl(Register dst, int32_t imm32);
 879   void testl(Register dst, AddressLiteral src); // requires reachable address
 880   using Assembler::testq;
 881   void testq(Address dst, int32_t imm32);
 882   void testq(Register dst, int32_t imm32);
 883 
 884   void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 885   void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 886   void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 887   void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }
 888 
 889   void testptr(Register src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 890   void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
 891   void testptr(Address src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 892   void testptr(Register src1, Register src2);
 893 
 894   void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 895   void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 896 
 897   // Calls
 898 
 899   void call(Label& L, relocInfo::relocType rtype);
 900   void call(Register entry);
 901   void call(Address addr) { Assembler::call(addr); }
 902 
 903   // NOTE: this call transfers to the effective address of entry NOT
 904   // the address contained by entry. This is because this is more natural
 905   // for jumps/calls.
 906   void call(AddressLiteral entry, Register rscratch = rax);
 907 
 908   // Emit the CompiledIC call idiom
 909   void ic_call(address entry, jint method_index = 0);
 910   static int ic_check_size();
 911   int ic_check(int end_alignment);
 912 
 913   void emit_static_call_stub();
 914 
 915   // Jumps
 916 
 917   // NOTE: these jumps transfer to the effective address of dst NOT
 918   // the address contained by dst. This is because this is more natural
 919   // for jumps/calls.
 920   void jump(AddressLiteral dst, Register rscratch = noreg);
 921 
 922   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
 923 
 924   // 32bit can do a case table jump in one instruction but we no longer allow the base
 925   // to be installed in the Address class. This jump will transfer to the address
 926   // contained in the location described by entry (not the address of entry)
 927   void jump(ArrayAddress entry, Register rscratch);
 928 
 929   // Floating
 930 
 931   void push_f(XMMRegister r);
 932   void pop_f(XMMRegister r);
 933   void push_d(XMMRegister r);
 934   void pop_d(XMMRegister r);
 935 
 936   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
 937   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
 938   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 939 
 940   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
 941   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
 942   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 943 
 944   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
 945   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
 946   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 947 
 948   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
 949   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
 950   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 951 
 952 #ifndef _LP64
 953   void fadd_s(Address        src) { Assembler::fadd_s(src); }
 954   void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }
 955 
 956   void fldcw(Address        src) { Assembler::fldcw(src); }
 957   void fldcw(AddressLiteral src);
 958 
 959   void fld_s(int index)          { Assembler::fld_s(index); }
 960   void fld_s(Address        src) { Assembler::fld_s(src); }
 961   void fld_s(AddressLiteral src);
 962 
 963   void fld_d(Address        src) { Assembler::fld_d(src); }
 964   void fld_d(AddressLiteral src);
 965 
 966   void fld_x(Address        src) { Assembler::fld_x(src); }
 967   void fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); }
 968 
 969   void fmul_s(Address        src) { Assembler::fmul_s(src); }
 970   void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
 971 #endif // !_LP64
 972 
 973   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
 974   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
 975 
 976 #ifdef _LP64
 977  private:
 978   void sha256_AVX2_one_round_compute(
 979     Register  reg_old_h,
 980     Register  reg_a,
 981     Register  reg_b,
 982     Register  reg_c,
 983     Register  reg_d,
 984     Register  reg_e,
 985     Register  reg_f,
 986     Register  reg_g,
 987     Register  reg_h,
 988     int iter);
 989   void sha256_AVX2_four_rounds_compute_first(int start);
 990   void sha256_AVX2_four_rounds_compute_last(int start);
 991   void sha256_AVX2_one_round_and_sched(
 992         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
 993         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
 994         XMMRegister xmm_2,     /* ymm6 */
 995         XMMRegister xmm_3,     /* ymm7 */
 996         Register    reg_a,      /* == eax on 0 iteration, then rotate 8 register right on each next iteration */
 997         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
 998         Register    reg_c,      /* edi */
 999         Register    reg_d,      /* esi */
1000         Register    reg_e,      /* r8d */
1001         Register    reg_f,      /* r9d */
1002         Register    reg_g,      /* r10d */
1003         Register    reg_h,      /* r11d */
1004         int iter);
1005 
1006   void addm(int disp, Register r1, Register r2);
1007 
1008   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1009                                      Register e, Register f, Register g, Register h, int iteration);
1010 
1011   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1012                                           Register a, Register b, Register c, Register d, Register e, Register f,
1013                                           Register g, Register h, int iteration);
1014 
1015   void addmq(int disp, Register r1, Register r2);
1016  public:
1017   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1018                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1019                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1020                    bool multi_block, XMMRegister shuf_mask);
1021   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1022                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1023                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1024                    XMMRegister shuf_mask);
1025 #endif // _LP64
1026 
1027   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1028                 bool multi_block);
1029 
1030   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1031                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1032                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1033                  bool multi_block);
1034 
1035 #ifdef _LP64
1036   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1037                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1038                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1039                    bool multi_block, XMMRegister shuf_mask);
1040 #else
1041   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1042                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1043                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1044                    bool multi_block);
1045 #endif
1046 
1047   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1048                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1049                 Register rax, Register rcx, Register rdx, Register tmp);
1050 
1051 #ifndef _LP64
1052  private:
1053   // Initialized in macroAssembler_x86_constants.cpp
1054   static address ONES;
1055   static address L_2IL0FLOATPACKET_0;
1056   static address PI4_INV;
1057   static address PI4X3;
1058   static address PI4X4;
1059 
1060  public:
1061   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1062                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1063                 Register rax, Register rcx, Register rdx, Register tmp1);
1064 
1065   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1066                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1067                 Register rax, Register rcx, Register rdx, Register tmp);
1068 
1069   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1070                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1071                 Register rdx, Register tmp);
1072 
1073   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1074                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1075                 Register rax, Register rbx, Register rdx);
1076 
1077   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1078                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1079                 Register rax, Register rcx, Register rdx, Register tmp);
1080 
1081   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1082                         Register edx, Register ebx, Register esi, Register edi,
1083                         Register ebp, Register esp);
1084 
1085   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1086                          Register esi, Register edi, Register ebp, Register esp);
1087 
1088   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1089                         Register edx, Register ebx, Register esi, Register edi,
1090                         Register ebp, Register esp);
1091 
1092   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1093                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1094                 Register rax, Register rcx, Register rdx, Register tmp);
1095 #endif // !_LP64
1096 
1097 private:
1098 
1099   // these are private because users should be doing movflt/movdbl
1100 
1101   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1102   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1103   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1104   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1105 
1106   void movlpd(XMMRegister dst, Address        src) { Assembler::movlpd(dst, src); }
1107   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1108 
1109 public:
1110 
1111   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1112   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1113   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1114 
1115   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1116   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1117   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1118 
1119   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1120   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1121   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1122 
1123   using Assembler::vbroadcastsd;
1124   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1125 
1126   using Assembler::vbroadcastss;
1127   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1128 
1129   // Vector float blend
1130   void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1131   void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1132 
1133   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1134   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1135   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1136 
1137   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1138   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1139   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1140 
1141   // Move Unaligned Double Quadword
1142   void movdqu(Address     dst, XMMRegister    src);
1143   void movdqu(XMMRegister dst, XMMRegister    src);
1144   void movdqu(XMMRegister dst, Address        src);
1145   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1146 
1147   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1148   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1149   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1150   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1151   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1152   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1153 
1154   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1155   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1156   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1157   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1158   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1159   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1160 
  // Safe mask move: uses 16-bit moves (kmovwl) on targets that support only the
  // AVX512F feature, and 64-bit moves (kmovql) on targets that also support AVX512BW.
1163   void kmov(Address  dst, KRegister src);
1164   void kmov(KRegister dst, Address src);
1165   void kmov(KRegister dst, KRegister src);
1166   void kmov(Register dst, KRegister src);
1167   void kmov(KRegister dst, Register src);
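  //
  // Sketch of the expected dispatch (illustrative only; the actual
  // implementation lives in macroAssembler_x86.cpp):
  //   if (VM_Version::supports_avx512bw()) kmovql(dst, src); // 64-bit mask move
  //   else                                 kmovwl(dst, src); // 16-bit mask move (AVX512F only)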
1168 
1169   using Assembler::movddup;
1170   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1171 
1172   using Assembler::vmovddup;
1173   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1174 
1175   // AVX Unaligned forms
1176   void vmovdqu(Address     dst, XMMRegister    src);
1177   void vmovdqu(XMMRegister dst, Address        src);
1178   void vmovdqu(XMMRegister dst, XMMRegister    src);
1179   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1180   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1181 
  // AVX512 Unaligned forms
1183   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1184   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1185 
1186   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1187   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1188 
1189   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1190     if (dst->encoding() != src->encoding() || mask != k0)  {
1191       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1192     }
1193   }
1194   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1195   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1196   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1197 
1198   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1199   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1200 
1201   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1202     if (dst->encoding() != src->encoding() || mask != k0) {
1203       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1204     }
1205   }
1206   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1207   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1208   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1209 
1210   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1211      if (dst->encoding() != src->encoding()) {
1212        Assembler::evmovdqul(dst, src, vector_len);
1213      }
1214   }
1215   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1216   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1217 
1218   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1219     if (dst->encoding() != src->encoding() || mask != k0)  {
1220       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1221     }
1222   }
1223   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1224   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1225   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1226 
1227   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1228     if (dst->encoding() != src->encoding()) {
1229       Assembler::evmovdquq(dst, src, vector_len);
1230     }
1231   }
1232   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1233   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1234   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1235 
1236   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1237     if (dst->encoding() != src->encoding() || mask != k0) {
1238       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1239     }
1240   }
1241   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1242   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1243   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1244 
1245   // Move Aligned Double Quadword
1246   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1247   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1248   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1249 
1250   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1251   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1252   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1253   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1254 
1255   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1256   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1257   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1258 
1259   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1260   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1261   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1262 
1263   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1264   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1265   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1266 
1267   // Carry-Less Multiplication Quadword
1268   void pclmulldq(XMMRegister dst, XMMRegister src) {
1269     // 0x00 - multiply lower 64 bits [0:63]
1270     Assembler::pclmulqdq(dst, src, 0x00);
1271   }
1272   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1273     // 0x11 - multiply upper 64 bits [64:127]
1274     Assembler::pclmulqdq(dst, src, 0x11);
1275   }
1276 
1277   void pcmpeqb(XMMRegister dst, XMMRegister src);
1278   void pcmpeqw(XMMRegister dst, XMMRegister src);
1279 
1280   void pcmpestri(XMMRegister dst, Address src, int imm8);
1281   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1282 
1283   void pmovzxbw(XMMRegister dst, XMMRegister src);
1284   void pmovzxbw(XMMRegister dst, Address src);
1285 
1286   void pmovmskb(Register dst, XMMRegister src);
1287 
1288   void ptest(XMMRegister dst, XMMRegister src);
1289 
1290   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1291   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1292   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1293 
1294   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1295   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1296   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1297 
1298   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1299   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1300   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1301 
1302   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1303   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1304   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1305 
1306   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1307   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1308   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1309 
1310   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1311   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1312   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1313 
1314   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1315   void xorpd(XMMRegister dst, XMMRegister    src);
1316   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1317   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1318 
1319   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1320   void xorps(XMMRegister dst, XMMRegister    src);
1321   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1322   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1323 
1324   // Shuffle Bytes
1325   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1326   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
  void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);

  // AVX 3-operand instructions
1330   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1331   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1332   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1333 
1334   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1335   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1336   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1337 
1338   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1339   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1340 
1341   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1342   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1343   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1344 
1345   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1346   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1347 
1348   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1349   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1350   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1351 
1352   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1353   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1354   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1355 
1356   using Assembler::vpbroadcastd;
1357   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1358 
1359   using Assembler::vpbroadcastq;
1360   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1361 
1362   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1363 
1364   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1365   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1366 
1367   // Vector compares
1368   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1369     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1370   }
1371   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1372 
1373   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1374     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1375   }
1376   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1377 
1378   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1379     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1380   }
1381   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1382 
1383   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1384     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1385   }
1386   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1387 
1388   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1389 
1390   // Emit comparison instruction for the specified comparison predicate.
1391   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1392   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1393 
1394   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1395   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1396 
1397   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1398 
1399   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1400   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1401 
1402   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1403   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1404   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1405 
1406   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1407   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1408 
1409   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1410   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1411 
1412   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1413   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1414 
1415   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1416   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1417 
1418   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1419     if (!is_varshift) {
1420       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1421     } else {
1422       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1423     }
1424   }
1425   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1426     if (!is_varshift) {
1427       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1428     } else {
1429       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1430     }
1431   }
1432   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1433     if (!is_varshift) {
1434       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1435     } else {
1436       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1437     }
1438   }
1439   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1440     if (!is_varshift) {
1441       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1442     } else {
1443       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1444     }
1445   }
1446   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1447     if (!is_varshift) {
1448       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1449     } else {
1450       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1451     }
1452   }
1453   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1454     if (!is_varshift) {
1455       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1456     } else {
1457       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1458     }
1459   }
1460   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1461     if (!is_varshift) {
1462       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1463     } else {
1464       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1465     }
1466   }
1467   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1468     if (!is_varshift) {
1469       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1470     } else {
1471       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1472     }
1473   }
1474   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1475     if (!is_varshift) {
1476       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1477     } else {
1478       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1479     }
1480   }
1481 
1482   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1483   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1484   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1485   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1486 
1487   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1488   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1489 
1490   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1491   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1492 
1493   void vptest(XMMRegister dst, XMMRegister src);
1494   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1495 
1496   void punpcklbw(XMMRegister dst, XMMRegister src);
1497   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1498 
1499   void pshufd(XMMRegister dst, Address src, int mode);
1500   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1501 
1502   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1503   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1504 
1505   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1506   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1507   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1508 
1509   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1510   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1511   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1512 
1513   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1514 
1515   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1516   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1517   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1518 
1519   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1520   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1521   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1522 
1523   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1524   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1525   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1526 
1527   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1528   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1529   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1530 
1531   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1532   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1533   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1534 
1535   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1536   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1537   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1538 
1539   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1540   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1541 
1542   // AVX Vector instructions
1543 
1544   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1545   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1546   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1547 
1548   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1549   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1550   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1551 
  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
    if (UseAVX > 1 || (vector_len < 1)) // 256-bit integer vpxor requires AVX2
      Assembler::vpxor(dst, nds, src, vector_len);
    else
      Assembler::vxorpd(dst, nds, src, vector_len); // XOR is bitwise, so the FP encoding yields the same result
  }
  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
    if (UseAVX > 1 || (vector_len < 1)) // 256-bit integer vpxor requires AVX2
      Assembler::vpxor(dst, nds, src, vector_len);
    else
      Assembler::vxorpd(dst, nds, src, vector_len); // XOR is bitwise, so the FP encoding yields the same result
  }
1564   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1565 
1566   // Simple version for AVX2 256bit vectors
1567   void vpxor(XMMRegister dst, XMMRegister src) {
1568     assert(UseAVX >= 2, "Should be at least AVX2");
1569     Assembler::vpxor(dst, dst, src, AVX_256bit);
1570   }
1571   void vpxor(XMMRegister dst, Address src) {
1572     assert(UseAVX >= 2, "Should be at least AVX2");
1573     Assembler::vpxor(dst, dst, src, AVX_256bit);
1574   }
1575 
1576   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1577   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1578 
1579   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1580     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1581       Assembler::vinserti32x4(dst, nds, src, imm8);
1582     } else if (UseAVX > 1) {
1583       // vinserti128 is available only in AVX2
1584       Assembler::vinserti128(dst, nds, src, imm8);
1585     } else {
1586       Assembler::vinsertf128(dst, nds, src, imm8);
1587     }
1588   }
1589 
1590   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1591     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1592       Assembler::vinserti32x4(dst, nds, src, imm8);
1593     } else if (UseAVX > 1) {
1594       // vinserti128 is available only in AVX2
1595       Assembler::vinserti128(dst, nds, src, imm8);
1596     } else {
1597       Assembler::vinsertf128(dst, nds, src, imm8);
1598     }
1599   }
1600 
1601   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1602     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1603       Assembler::vextracti32x4(dst, src, imm8);
1604     } else if (UseAVX > 1) {
1605       // vextracti128 is available only in AVX2
1606       Assembler::vextracti128(dst, src, imm8);
1607     } else {
1608       Assembler::vextractf128(dst, src, imm8);
1609     }
1610   }
1611 
1612   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1613     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1614       Assembler::vextracti32x4(dst, src, imm8);
1615     } else if (UseAVX > 1) {
1616       // vextracti128 is available only in AVX2
1617       Assembler::vextracti128(dst, src, imm8);
1618     } else {
1619       Assembler::vextractf128(dst, src, imm8);
1620     }
1621   }
1622 
1623   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1624   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1625     vinserti128(dst, dst, src, 1);
1626   }
1627   void vinserti128_high(XMMRegister dst, Address src) {
1628     vinserti128(dst, dst, src, 1);
1629   }
1630   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1631     vextracti128(dst, src, 1);
1632   }
1633   void vextracti128_high(Address dst, XMMRegister src) {
1634     vextracti128(dst, src, 1);
1635   }
1636 
1637   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1638     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1639       Assembler::vinsertf32x4(dst, dst, src, 1);
1640     } else {
1641       Assembler::vinsertf128(dst, dst, src, 1);
1642     }
1643   }
1644 
1645   void vinsertf128_high(XMMRegister dst, Address src) {
1646     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1647       Assembler::vinsertf32x4(dst, dst, src, 1);
1648     } else {
1649       Assembler::vinsertf128(dst, dst, src, 1);
1650     }
1651   }
1652 
1653   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1654     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1655       Assembler::vextractf32x4(dst, src, 1);
1656     } else {
1657       Assembler::vextractf128(dst, src, 1);
1658     }
1659   }
1660 
1661   void vextractf128_high(Address dst, XMMRegister src) {
1662     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1663       Assembler::vextractf32x4(dst, src, 1);
1664     } else {
1665       Assembler::vextractf128(dst, src, 1);
1666     }
1667   }
1668 
1669   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1670   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1671     Assembler::vinserti64x4(dst, dst, src, 1);
1672   }
1673   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1674     Assembler::vinsertf64x4(dst, dst, src, 1);
1675   }
1676   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1677     Assembler::vextracti64x4(dst, src, 1);
1678   }
1679   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1680     Assembler::vextractf64x4(dst, src, 1);
1681   }
1682   void vextractf64x4_high(Address dst, XMMRegister src) {
1683     Assembler::vextractf64x4(dst, src, 1);
1684   }
1685   void vinsertf64x4_high(XMMRegister dst, Address src) {
1686     Assembler::vinsertf64x4(dst, dst, src, 1);
1687   }
1688 
1689   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1690   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1691     vinserti128(dst, dst, src, 0);
1692   }
1693   void vinserti128_low(XMMRegister dst, Address src) {
1694     vinserti128(dst, dst, src, 0);
1695   }
1696   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1697     vextracti128(dst, src, 0);
1698   }
1699   void vextracti128_low(Address dst, XMMRegister src) {
1700     vextracti128(dst, src, 0);
1701   }
1702 
1703   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1704     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1705       Assembler::vinsertf32x4(dst, dst, src, 0);
1706     } else {
1707       Assembler::vinsertf128(dst, dst, src, 0);
1708     }
1709   }
1710 
1711   void vinsertf128_low(XMMRegister dst, Address src) {
1712     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1713       Assembler::vinsertf32x4(dst, dst, src, 0);
1714     } else {
1715       Assembler::vinsertf128(dst, dst, src, 0);
1716     }
1717   }
1718 
1719   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1720     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1721       Assembler::vextractf32x4(dst, src, 0);
1722     } else {
1723       Assembler::vextractf128(dst, src, 0);
1724     }
1725   }
1726 
1727   void vextractf128_low(Address dst, XMMRegister src) {
1728     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1729       Assembler::vextractf32x4(dst, src, 0);
1730     } else {
1731       Assembler::vextractf128(dst, src, 0);
1732     }
1733   }
1734 
1735   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1736   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1737     Assembler::vinserti64x4(dst, dst, src, 0);
1738   }
1739   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1740     Assembler::vinsertf64x4(dst, dst, src, 0);
1741   }
1742   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1743     Assembler::vextracti64x4(dst, src, 0);
1744   }
1745   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1746     Assembler::vextractf64x4(dst, src, 0);
1747   }
1748   void vextractf64x4_low(Address dst, XMMRegister src) {
1749     Assembler::vextractf64x4(dst, src, 0);
1750   }
1751   void vinsertf64x4_low(XMMRegister dst, Address src) {
1752     Assembler::vinsertf64x4(dst, dst, src, 0);
1753   }
1754 
1755   // Carry-Less Multiplication Quadword
1756   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1757     // 0x00 - multiply lower 64 bits [0:63]
1758     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1759   }
1760   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1761     // 0x11 - multiply upper 64 bits [64:127]
1762     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1763   }
1764   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1765     // 0x10 - multiply nds[0:63] and src[64:127]
1766     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1767   }
1768   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x01 - multiply nds[64:127] and src[0:63]
1770     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1771   }
1772 
1773   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1774     // 0x00 - multiply lower 64 bits [0:63]
1775     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1776   }
1777   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1778     // 0x11 - multiply upper 64 bits [64:127]
1779     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1780   }
1781 
1782   // AVX-512 mask operations.
1783   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1784   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1785   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1786   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1787   void kortest(uint masklen, KRegister src1, KRegister src2);
1788   void ktest(uint masklen, KRegister src1, KRegister src2);
1789 
1790   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1791   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1792 
1793   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1794   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1795 
1796   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1797   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1798 
1799   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1800   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1801 
1802   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1803   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1804   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1805   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1806 
1807   using Assembler::evpandq;
1808   void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1809 
1810   using Assembler::evpaddq;
1811   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1812 
1813   using Assembler::evporq;
1814   void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1815 
1816   using Assembler::vpshufb;
1817   void vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1818 
1819   using Assembler::vpternlogq;
1820   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
1821 
1822   void cmov32( Condition cc, Register dst, Address  src);
1823   void cmov32( Condition cc, Register dst, Register src);
1824 
1825   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1826 
1827   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1828   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1829 
1830   void movoop(Register dst, jobject obj);
1831   void movoop(Address  dst, jobject obj, Register rscratch);
1832 
1833   void mov_metadata(Register dst, Metadata* obj);
1834   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
1835 
1836   void movptr(Register     dst, Register       src);
1837   void movptr(Register     dst, Address        src);
1838   void movptr(Register     dst, AddressLiteral src);
1839   void movptr(Register     dst, ArrayAddress   src);
1840   void movptr(Register     dst, intptr_t       src);
1841   void movptr(Address      dst, Register       src);
1842   void movptr(Address      dst, int32_t        imm);
1843   void movptr(Address      dst, intptr_t       src, Register rscratch);
1844   void movptr(ArrayAddress dst, Register       src, Register rscratch);
1845 
1846   void movptr(Register dst, RegisterOrConstant src) {
1847     if (src.is_constant()) movptr(dst, src.as_constant());
1848     else                   movptr(dst, src.as_register());
1849   }
1850 
1851 
1852   // to avoid hiding movl
1853   void mov32(Register       dst, AddressLiteral src);
1854   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
1855 
1856   // Import other mov() methods from the parent class or else
1857   // they will be hidden by the following overriding declaration.
1858   using Assembler::movdl;
1859   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1860 
1861   using Assembler::movq;
1862   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1863 
1864   // Can push value or effective address
1865   void pushptr(AddressLiteral src, Register rscratch);
1866 
1867   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
1868   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
1869 
1870   void pushoop(jobject obj, Register rscratch);
1871   void pushklass(Metadata* obj, Register rscratch);
1872 
  // sign-extend a 32-bit ('l') value to a pointer-sized element as needed
1874   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
1875   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
1876 
1877 
1878  public:
  // clear memory of size 'cnt' qwords, starting at 'base';
  // if 'is_large' is set, do not try to produce a short loop
1881   void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large, KRegister mask=knoreg);
1882 
  // memory-clearing (initialization) sequence for a constant size
1884   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1885 
1886   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1887   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
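  //
  // A hypothetical stub-generator call clearing 'cnt' qwords starting at 'base'
  // (register choices are illustrative only):
  //   __ clear_mem(rdi /*base*/, rcx /*cnt*/, rax /*rtmp*/, xmm0 /*xtmp*/, /*is_large*/ false);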
1888 
1889   // Fill primitive arrays
1890   void generate_fill(BasicType t, bool aligned,
1891                      Register to, Register value, Register count,
1892                      Register rtmp, XMMRegister xtmp);
1893 
1894   void encode_iso_array(Register src, Register dst, Register len,
1895                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1896                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1897 
1898 #ifdef _LP64
1899   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1900   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1901                              Register y, Register y_idx, Register z,
1902                              Register carry, Register product,
1903                              Register idx, Register kdx);
1904   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1905                               Register yz_idx, Register idx,
1906                               Register carry, Register product, int offset);
1907   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1908                                     Register carry, Register carry2,
1909                                     Register idx, Register jdx,
1910                                     Register yz_idx1, Register yz_idx2,
1911                                     Register tmp, Register tmp3, Register tmp4);
1912   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
1913                                Register yz_idx, Register idx, Register jdx,
1914                                Register carry, Register product,
1915                                Register carry2);
1916   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
1917                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
1918   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
1919                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1920   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
1921                             Register tmp2);
1922   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
1923                        Register rdxReg, Register raxReg);
1924   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
1925   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1926                        Register tmp3, Register tmp4);
1927   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1928                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1929 
1930   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
1931                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1932                Register raxReg);
1933   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
1934                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1935                Register raxReg);
1936   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
1937                            Register result, Register tmp1, Register tmp2,
1938                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
1939 #endif
1940 
1941   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
1942   void update_byte_crc32(Register crc, Register val, Register table);
1943   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
1944 
1945 
1946 #ifdef _LP64
1947   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
1948   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
1949                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
1950                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
1951 #endif // _LP64
1952 
1953   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
1954   // Note on a naming convention:
1955   // Prefix w = register only used on a Westmere+ architecture
1956   // Prefix n = register only used on a Nehalem architecture
1957 #ifdef _LP64
1958   void crc32c_ipl_alg4(Register in_out, uint32_t n,
1959                        Register tmp1, Register tmp2, Register tmp3);
1960 #else
1961   void crc32c_ipl_alg4(Register in_out, uint32_t n,
1962                        Register tmp1, Register tmp2, Register tmp3,
1963                        XMMRegister xtmp1, XMMRegister xtmp2);
1964 #endif
1965   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
1966                         Register in_out,
1967                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
1968                         XMMRegister w_xtmp2,
1969                         Register tmp1,
1970                         Register n_tmp2, Register n_tmp3);
1971   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
1972                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1973                        Register tmp1, Register tmp2,
1974                        Register n_tmp3);
1975   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
1976                          Register in_out1, Register in_out2, Register in_out3,
1977                          Register tmp1, Register tmp2, Register tmp3,
1978                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1979                          Register tmp4, Register tmp5,
1980                          Register n_tmp6);
1981   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
1982                             Register tmp1, Register tmp2, Register tmp3,
1983                             Register tmp4, Register tmp5, Register tmp6,
1984                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
1985                             bool is_pclmulqdq_supported);
1986   // Fold 128-bit data chunk
1987   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
1988   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
1989 #ifdef _LP64
1990   // Fold 512-bit data chunk
1991   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
1992 #endif // _LP64
1993   // Fold 8-bit data
1994   void fold_8bit_crc32(Register crc, Register table, Register tmp);
1995   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
1996 
1997   // Compress char[] array to byte[].
1998   void char_array_compress(Register src, Register dst, Register len,
1999                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
2000                            XMMRegister tmp4, Register tmp5, Register result,
2001                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
2002 
2003   // Inflate byte[] array to char[].
2004   void byte_array_inflate(Register src, Register dst, Register len,
2005                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
2006 
2007   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
2008                    Register length, Register temp, int vec_enc);
2009 
2010   void fill64_masked(uint shift, Register dst, int disp,
2011                          XMMRegister xmm, KRegister mask, Register length,
2012                          Register temp, bool use64byteVector = false);
2013 
2014   void fill32_masked(uint shift, Register dst, int disp,
2015                          XMMRegister xmm, KRegister mask, Register length,
2016                          Register temp);
2017 
2018   void fill32(Address dst, XMMRegister xmm);
2019 
2020   void fill32(Register dst, int disp, XMMRegister xmm);
2021 
2022   void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
2023 
  void fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector = false);
2025 
2026 #ifdef _LP64
2027   void convert_f2i(Register dst, XMMRegister src);
2028   void convert_d2i(Register dst, XMMRegister src);
2029   void convert_f2l(Register dst, XMMRegister src);
2030   void convert_d2l(Register dst, XMMRegister src);
2031   void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
2032   void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);
2033 
2034   void cache_wb(Address line);
2035   void cache_wbsync(bool is_pre);
2036 
2037 #ifdef COMPILER2_OR_JVMCI
2038   void generate_fill_avx3(BasicType type, Register to, Register value,
2039                           Register count, Register rtmp, XMMRegister xtmp);
2040 #endif // COMPILER2_OR_JVMCI
2041 #endif // _LP64
2042 
2043   void vallones(XMMRegister dst, int vector_len);
2044 
2045   void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);
2046 
2047   void lightweight_lock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow);
2048   void lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow);
2049 };
2050 
2051 /**
2052  * class SkipIfEqual:
2053  *
2054  * Instantiating this class will result in assembly code being output that will
2055  * jump around any code emitted between the creation of the instance and it's
2056  * automatic destruction at the end of a scope block, depending on the value of
2057  * the flag passed to the constructor, which will be checked at run-time.
2058  */
2059 class SkipIfEqual {
2060  private:
2061   MacroAssembler* _masm;
2062   Label _label;
2063 
2064  public:
  SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value, Register rscratch);
  ~SkipIfEqual();
2067 };
2068 
2069 #endif // CPU_X86_MACROASSEMBLER_X86_HPP