/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_MACROASSEMBLER_X86_HPP
#define CPU_X86_MACROASSEMBLER_X86_HPP

#include "asm/assembler.hpp"
#include "asm/register.hpp"
#include "code/vmreg.inline.hpp"
#include "compiler/oopMap.hpp"
#include "utilities/macros.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/checkedCast.hpp"

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;
  friend class Runtime1;      // as_Address()

 public:
  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments        // the number of arguments to pop after the call
  );

 protected:
  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // call_VM_base returns the register which contains the thread upon return.
  // If no last_java_sp is specified (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

  // helpers for FPU flag access
  // tmp is a temporary register, if none is available use noreg
  void save_rax   (Register tmp);
  void restore_rax(Register tmp);

 public:
  MacroAssembler(CodeBuffer* code) : Assembler(code) {}

  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe();
  virtual void check_and_handle_earlyret();

  Address as_Address(AddressLiteral adr);
  Address as_Address(ArrayAddress adr, Register rscratch);

  // Support for null-checks
  //
  // Generates code that causes a null OS exception if the content of reg is null.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
  static bool uses_implicit_null_check(void* address);
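
  // For example (illustrative): null_check(rbx, 8) emits no explicit test, because a
  // null rbx makes the access to M[rbx + 8] fault within the first page and the
  // implicit (OS-level) null exception is used instead; a large or unknown offset
  // falls back to an explicit compare of reg against zero.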

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  void pd_patch_instruction(address branch, address target, const char* file, int line) {
    unsigned char op = branch[0];
    assert(op == 0xE8 /* call */ ||
        op == 0xE9 /* jmp */ ||
        op == 0xEB /* short jmp */ ||
        (op & 0xF0) == 0x70 /* short jcc */ ||
        (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
        (op == 0xC7 && branch[1] == 0xF8) /* xbegin */ ||
        (op == 0x8D) /* lea */,
        "Invalid opcode at patch point");

    if (op == 0xEB || (op & 0xF0) == 0x70) {
      // short offset operators (jmp and jcc)
      char* disp = (char*) &branch[1];
      int imm8 = checked_cast<int>(target - (address) &disp[1]);
      guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
                file == nullptr ? "<null>" : file, line);
      *disp = (char)imm8;
    } else {
      int* disp = (int*) &branch[(op == 0x0F || op == 0xC7 || op == 0x8D) ? 2 : 1];
      int imm32 = checked_cast<int>(target - (address) &disp[1]);
      *disp = imm32;
    }
  }

  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value

  void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
  void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
  void increment(Address dst, int value = 1)  { LP64_ONLY(incrementq(dst, value)) NOT_LP64(incrementl(dst, value)) ; }
  void decrement(Address dst, int value = 1)  { LP64_ONLY(decrementq(dst, value)) NOT_LP64(decrementl(dst, value)) ; }
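
  // A sketch of the assumed selection policy (see the .cpp implementations for the
  // authoritative version): a value of 0 emits nothing, 1 can use inc/dec (when
  // UseIncDec), other values use add/sub, and negative values delegate to the
  // opposite operation.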

  void decrementl(Address dst, int value = 1);
  void decrementl(Register reg, int value = 1);

  void decrementq(Register reg, int value = 1);
  void decrementq(Address dst, int value = 1);

  void incrementl(Address dst, int value = 1);
  void incrementl(Register reg, int value = 1);

  void incrementq(Register reg, int value = 1);
  void incrementq(Address dst, int value = 1);

  void incrementl(AddressLiteral dst, Register rscratch = noreg);
  void incrementl(ArrayAddress   dst, Register rscratch);

  void incrementq(AddressLiteral dst, Register rscratch = noreg);

  // Support optimal SSE move instructions.
  void movflt(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
    else                       { movss (dst, src); return; }
  }
  void movflt(XMMRegister dst, Address src) { movss(dst, src); }
  void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
  void movflt(Address dst, XMMRegister src) { movss(dst, src); }

  // Move with zero extension
  void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }

  void movdbl(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
    else                       { movsd (dst, src); return; }
  }

  void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);

  void movdbl(XMMRegister dst, Address src) {
    if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
    else                         { movlpd(dst, src); return; }
  }
  void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }

  void flt_to_flt16(Register dst, XMMRegister src, XMMRegister tmp) {
    // Use a separate tmp XMM register because the caller may
    // require the src XMM register to be unchanged (as in x86.ad).
    vcvtps2ph(tmp, src, 0x04, Assembler::AVX_128bit);
    movdl(dst, tmp);
    movswl(dst, dst);
  }

  void flt16_to_flt(XMMRegister dst, Register src) {
    movdl(dst, src);
    vcvtph2ps(dst, dst, Assembler::AVX_128bit);
  }

  // Alignment
  void align32();
  void align64();
  void align(uint modulus);
  void align(uint modulus, uint target);

  void post_call_nop();
  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  void fat_nop();

  // Stack frame creation/removal
  void enter();
  void leave();

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information).
  // The pointer will be loaded into the thread register. This is a slow version that does a native call.
  // Normally, the JavaThread pointer is available in r15_thread; use it where possible.
  void get_thread_slow(Register thread);

#ifdef _LP64
  // Support for argument shuffling

  // bias in bytes
  void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void move_ptr(VMRegPair src, VMRegPair dst);
  void object_move(OopMap* map,
                   int oop_handle_offset,
                   int framesize_in_slots,
                   VMRegPair src,
                   VMRegPair dst,
                   bool is_receiver,
                   int* receiver_offset);
#endif // _LP64

  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is set up correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
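
  // Illustrative use (hypothetical runtime entry point, not taken from this file):
  //   call_VM(rax, CAST_FROM_FN_PTR(address, SomeRuntime::some_entry), c_rarg1);
  // passes one register argument, expects an oop result in rax, and checks for
  // pending exceptions on return (check_exceptions defaults to true).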


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  void get_vm_result  (Register oop_result);
  void get_vm_result_2(Register metadata_result);

  // These always tightly bind to MacroAssembler::call_VM_base
  // bypassing the virtual implementation
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf0(address entry_point);
  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);

  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // These always tightly bind to MacroAssembler::call_VM_leaf_base
  // bypassing the virtual implementation
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address  last_java_pc,
                           Register rscratch);

  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           Label &last_java_pc,
                           Register scratch);

  void reset_last_Java_frame(bool clear_fp);

  // jobjects
  void clear_jobject_tag(Register possibly_non_local);
  void resolve_jobject(Register value, Register tmp);
  void resolve_global_jobject(Register value, Register tmp);

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  // C++ bool manipulation

  void movbool(Register dst, Address src);
  void movbool(Address dst, bool boolconst);
  void movbool(Address dst, Register src);
  void testbool(Register dst);

  void resolve_oop_handle(Register result, Register tmp);
  void resolve_weak_handle(Register result, Register tmp);
  void load_mirror(Register mirror, Register method, Register tmp);
  void load_method_holder_cld(Register rresult, Register rmethod);

  void load_method_holder(Register holder, Register method);

  // oop manipulations
#ifdef _LP64
  void load_narrow_klass_compact(Register dst, Register src);
#endif
  void load_klass(Register dst, Register src, Register tmp);
  void store_klass(Register dst, Register src, Register tmp);

  // Compares the Klass pointer of an object to a given Klass (which might be narrow,
  // depending on UseCompressedClassPointers).
  void cmp_klass(Register klass, Register obj, Register tmp);

  // Compares the Klass pointer of two objects obj1 and obj2. Result is in the condition flags.
  // Uses tmp1 and tmp2 as temporary registers.
  void cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2);

  void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                      Register tmp1, Register thread_tmp);
  void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
                       Register tmp1, Register tmp2, Register tmp3);

  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
                     Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
                              Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void store_heap_oop(Address dst, Register val, Register tmp1 = noreg,
                      Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);

  // Used for storing null. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst);

#ifdef _LP64
  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like null) into a Register by giving
  // the compiler two choices it can't resolve

  void store_heap_oop(Address dst, void* dummy);

  void encode_heap_oop(Register r);
  void decode_heap_oop(Register r);
  void encode_heap_oop_not_null(Register r);
  void decode_heap_oop_not_null(Register r);
  void encode_heap_oop_not_null(Register dst, Register src);
  void decode_heap_oop_not_null(Register dst, Register src);

  void set_narrow_oop(Register dst, jobject obj);
  void set_narrow_oop(Address dst, jobject obj);
  void cmp_narrow_oop(Register dst, jobject obj);
  void cmp_narrow_oop(Address dst, jobject obj);

  void encode_klass_not_null(Register r, Register tmp);
  void decode_klass_not_null(Register r, Register tmp);
  void encode_and_move_klass_not_null(Register dst, Register src);
  void decode_and_move_klass_not_null(Register dst, Register src);
  void set_narrow_klass(Register dst, Klass* k);
  void set_narrow_klass(Address dst, Klass* k);
  void cmp_narrow_klass(Register dst, Klass* k);
  void cmp_narrow_klass(Address dst, Klass* k);

  // if heap base register is used - reinit it with the correct value
  void reinit_heapbase();

  DEBUG_ONLY(void verify_heapbase(const char* msg);)

#endif // _LP64

  // Int division/remainder for Java
  // (as idivl, but checks for special case as described in JVM spec.)
  // returns idivl instruction offset for implicit exception handling
  int corrected_idivl(Register reg);

  // Long division/remainder for Java
  // (as idivq, but checks for special case as described in JVM spec.)
  // returns idivq instruction offset for implicit exception handling
  int corrected_idivq(Register reg);
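
  // The special case referred to above is MIN_VALUE / -1 (for both int and long),
  // where a plain idiv would raise #DE; the JVM spec requires the result MIN_VALUE
  // with a remainder of 0, so that case is handled separately.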

  void int3();

  // Long operation macros for a 32bit cpu
  // Long negation for Java
  void lneg(Register hi, Register lo);

  // Long multiplication for Java
  // (destroys contents of eax, ebx, ecx and edx)
  void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y

  // Long shifts for Java
  // (semantics as described in JVM spec.)
  void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
  void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)

  // Long compare for Java
  // (semantics as described in JVM spec.)
  void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)


  // misc

  // Sign extension
  void sign_extend_short(Register reg);
  void sign_extend_byte(Register reg);

  // Division by power of 2, rounding towards 0
  void division_with_shift(Register reg, int shift_value);
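
  // Note: an arithmetic right shift alone rounds towards negative infinity
  // (e.g. -7 >> 1 == -4), so for Java semantics the generated code has to adjust
  // negative dividends first so that e.g. -7 / 2 == -3.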

#ifndef _LP64
  // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
  //
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
  void fcmp(Register tmp);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp(Register tmp, int index, bool pop_left, bool pop_right);

  // Floating-point comparison for Java
  // Compares the top-most stack entries on the FPU stack and stores the result in dst.
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // (semantics as described in JVM spec.)
  void fcmp2int(Register dst, bool unordered_is_less);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);

  // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
  // tmp is a temporary register, if none is available use noreg
  void fremr(Register tmp);

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
#endif // !LP64

  // dst = c = a * b + c
  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);

  void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);


  // same as fcmp2int, but using SSE2
  void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
  void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);

  // branch to L if FPU flag C2 is set/not set
  // tmp is a temporary register, if none is available use noreg
  void jC2 (Register tmp, Label& L);
  void jnC2(Register tmp, Label& L);

  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_float(Address src);

  // Store float value to 'address'. If UseSSE >= 1, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_float(Address dst);

  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_double(Address src);

  // Store double value to 'address'. If UseSSE >= 2, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_double(Address dst);

#ifndef _LP64
  // Pop ST (ffree & fincstp combined)
  void fpop();

  void empty_FPU_stack();
#endif // !_LP64

  void push_IU_state();
  void pop_IU_state();

  void push_FPU_state();
  void pop_FPU_state();

  void push_CPU_state();
  void pop_CPU_state();

  void push_cont_fastpath();
  void pop_cont_fastpath();

  void inc_held_monitor_count();
  void dec_held_monitor_count();

  DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)

  // Round reg up to a multiple of modulus (which must be a power of two)
  void round_to(Register reg, int modulus);
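
  // A minimal sketch of the intended effect (assuming the usual implementation):
  //   addptr(reg, modulus - 1);
  //   andptr(reg, -modulus);
  // e.g. reg == 13, modulus == 8 yields 16.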

private:
  // General purpose and XMM registers potentially clobbered by native code; there
  // is no need for FPU or AVX opmask related methods because C1 and the interpreter
  // - always save/restore the FPU state as a whole
  // - do not care about the AVX-512 opmask registers
  static RegSet call_clobbered_gp_registers();
  static XMMRegSet call_clobbered_xmm_registers();

  void push_set(XMMRegSet set, int offset);
  void pop_set(XMMRegSet set, int offset);

public:
  void push_set(RegSet set, int offset = -1);
  void pop_set(RegSet set, int offset = -1);

  // Push and pop everything that might be clobbered by a native
  // runtime call.
  // Only save the lower 64 bits of each vector register.
  // Additional registers can be excluded in a passed RegSet.
  void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
  void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);

  void push_call_clobbered_registers(bool save_fpu = true) {
    push_call_clobbered_registers_except(RegSet(), save_fpu);
  }
  void pop_call_clobbered_registers(bool restore_fpu = true) {
    pop_call_clobbered_registers_except(RegSet(), restore_fpu);
  }

  // allocation
  void tlab_allocate(
    Register thread,                   // Current thread
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);

  void population_count(Register dst, Register src, Register scratch1, Register scratch2);

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface,
                               bool return_method = true);

  void lookup_interface_method_stub(Register recv_klass,
                                    Register holder_klass,
                                    Register resolved_klass,
                                    Register method_result,
                                    Register scan_temp,
                                    Register temp_reg2,
                                    Register receiver,
                                    int itable_index,
                                    Label& L_no_such_interface);

  // virtual method calling
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be null, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

#ifdef _LP64
  // The 64-bit version, which may do a hashed subclass lookup.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Register temp3_reg,
                                     Register temp4_reg,
                                     Label* L_success,
                                     Label* L_failure);
#endif

  // Three parts of a hashed subclass lookup: a simple linear search,
  // a table lookup, and a fallback that does linear probing in the
  // event of a hash collision.
  void check_klass_subtype_slow_path_linear(Register sub_klass,
                                            Register super_klass,
                                            Register temp_reg,
                                            Register temp2_reg,
                                            Label* L_success,
                                            Label* L_failure,
                                            bool set_cond_codes = false);
  void check_klass_subtype_slow_path_table(Register sub_klass,
                                           Register super_klass,
                                           Register temp_reg,
                                           Register temp2_reg,
                                           Register temp3_reg,
                                           Register result_reg,
                                           Label* L_success,
                                           Label* L_failure);
  void hashed_check_klass_subtype_slow_path(Register sub_klass,
                                            Register super_klass,
                                            Register temp_reg,
                                            Label* L_success,
                                            Label* L_failure);

  // As above, but with a constant super_klass.
  // The result is in Register result, not the condition codes.
  void lookup_secondary_supers_table_const(Register sub_klass,
                                           Register super_klass,
                                           Register temp1,
                                           Register temp2,
                                           Register temp3,
                                           Register temp4,
                                           Register result,
                                           u1 super_klass_slot);

#ifdef _LP64
  using Assembler::salq;
  void salq(Register dest, Register count);
  using Assembler::rorq;
  void rorq(Register dest, Register count);
  void lookup_secondary_supers_table_var(Register sub_klass,
                                         Register super_klass,
                                         Register temp1,
                                         Register temp2,
                                         Register temp3,
                                         Register temp4,
                                         Register result);

  void lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                               Register r_array_base,
                                               Register r_array_index,
                                               Register r_bitmap,
                                               Register temp1,
                                               Register temp2,
                                               Label* L_success,
                                               Label* L_failure = nullptr);

  void verify_secondary_supers_table(Register r_sub_klass,
                                     Register r_super_klass,
                                     Register expected,
                                     Register temp1,
                                     Register temp2,
                                     Register temp3);
#endif

  void repne_scanq(Register addr, Register value, Register count, Register limit,
                   Label* L_success,
                   Label* L_failure = nullptr);

  // If r is valid, return r.
  // If r is invalid, remove a register r2 from available_regs, add r2
  // to regs_to_push, then return r2.
  Register allocate_if_noreg(const Register r,
                             RegSetIterator<Register> &available_regs,
                             RegSet &regs_to_push);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);
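
  // Roughly (an illustrative sketch, not a contract): it runs
  // check_klass_subtype_fast_path() and, when that answers "maybe", falls into
  // check_klass_subtype_slow_path(); L_success is taken on a positive answer and
  // the code falls through on failure.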

  void clinit_barrier(Register klass,
                      Label* L_fast_path = nullptr,
                      Label* L_slow_path = nullptr);

  // method handles (JSR 292)
  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

  // Debugging

  // only if +VerifyOops
  void _verify_oop(Register reg, const char* s, const char* file, int line);
  void _verify_oop_addr(Address addr, const char* s, const char* file, int line);

  void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
    if (VerifyOops) {
      _verify_oop(reg, s, file, line);
    }
  }
  void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
    if (VerifyOops) {
      _verify_oop_addr(reg, s, file, line);
    }
  }

  // TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
  void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}

#define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
#define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
#define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

  // Verify or restore cpu control state after JNI call
  void restore_cpu_control_state_after_jni(Register rscratch);

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  // prints msg and continues
  void warn(const char* msg);

  // dumps registers and other state
  void print_state();

  static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
  static void debug64(char* msg, int64_t pc, int64_t regs[]);
  static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
  static void print_state64(int64_t pc, int64_t regs[]);

  void os_breakpoint();

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "");

  void should_not_reach_here()                   { stop("should not reach here"); }

  void print_CPU_state();

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with positive offset");
    movl(Address(rsp, (-offset)), rax);
  }

  // Writes to successive stack pages until the given offset is reached, to check for
  // stack overflow + shadow pages.  Also clobbers tmp.
  void bang_stack_size(Register size, Register tmp);

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  void safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod);

  void verify_tlab();

  static Condition negate_condition(Condition cond);

  // Instructions that use AddressLiteral operands. These instructions can handle 32-bit/64-bit
  // operands. In general the names are modified to avoid hiding the instruction in Assembler
  // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
  // here in MacroAssembler. The major exception to this rule is call.

  // Arithmetics


  void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
  void addptr(Address dst, Register src);

  void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
  void addptr(Register dst, int32_t src);
  void addptr(Register dst, Register src);
  void addptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) addptr(dst, checked_cast<int>(src.as_constant()));
    else                   addptr(dst, src.as_register());
  }

  void andptr(Register dst, int32_t src);
  void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }

#ifdef _LP64
  using Assembler::andq;
  void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
#endif

  void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);

  // renamed to drag out the casting of address to int32_t/intptr_t
  void cmp32(Register src1, int32_t imm);

  void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
  // compare reg - mem, or reg - &mem
  void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);

  void cmp32(Register src1, Address src2);

#ifndef _LP64
  void cmpklass(Address dst, Metadata* obj);
  void cmpklass(Register dst, Metadata* obj);
  void cmpoop(Address dst, jobject obj);
#endif // !_LP64

  void cmpoop(Register src1, Register src2);
  void cmpoop(Register src1, Address src2);
  void cmpoop(Register dst, jobject obj, Register rscratch);

  // NOTE: src2 must be the lval. This is NOT a mem-mem compare
  void cmpptr(Address src1, AddressLiteral src2, Register rscratch);

  void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);

  void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  // cmp64 to avoid hiding cmpq
  void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);

  void cmpxchgptr(Register reg, Address adr);

  void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);

  void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
  void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }


  void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }

  void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }

  void shlptr(Register dst, int32_t shift);
  void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }

  void shrptr(Register dst, int32_t shift);
  void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }

  void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
  void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }

  void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }

  void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
  void subptr(Register dst, int32_t src);
  // Force generation of a 4-byte immediate value even if it fits into 8 bits
  void subptr_imm32(Register dst, int32_t src);
  void subptr(Register dst, Register src);
  void subptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) subptr(dst, (int) src.as_constant());
    else                   subptr(dst,       src.as_register());
  }

  void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
  void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }

  void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
  void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }

  void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }



  // Helper functions for statistics gathering.
  // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
  void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
  // Unconditional atomic increment.
  void atomic_incl(Address counter_addr);
  void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
#ifdef _LP64
  void atomic_incq(Address counter_addr);
  void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
#endif
  void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { LP64_ONLY(atomic_incq(counter_addr, rscratch)) NOT_LP64(atomic_incl(counter_addr, rscratch)) ; }
  void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }

  using Assembler::lea;
  void lea(Register dst, AddressLiteral adr);
  void lea(Address  dst, AddressLiteral adr, Register rscratch);

  void leal32(Register dst, Address src) { leal(dst, src); }

  // Import other testl() methods from the parent class or else
  // they will be hidden by the following overriding declaration.
  using Assembler::testl;
  void testl(Address dst, int32_t imm32);
  void testl(Register dst, int32_t imm32);
  void testl(Register dst, AddressLiteral src); // requires reachable address
  using Assembler::testq;
  void testq(Address dst, int32_t imm32);
  void testq(Register dst, int32_t imm32);

  void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }

  void testptr(Register src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
  void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
  void testptr(Address src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
  void testptr(Register src1, Register src2);

  void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
  void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }

  // Calls

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register entry);
  void call(Address addr) { Assembler::call(addr); }

  // NOTE: this call transfers to the effective address of entry, NOT to
  // the address contained at entry, because that is more natural
  // for jumps/calls.
  void call(AddressLiteral entry, Register rscratch = rax);

  // Emit the CompiledIC call idiom
  void ic_call(address entry, jint method_index = 0);
  static int ic_check_size();
  int ic_check(int end_alignment);

  void emit_static_call_stub();

  // Jumps

  // NOTE: these jumps transfer to the effective address of dst, NOT to
  // the address contained at dst, because that is more natural
  // for jumps/calls.
1001   void jump(AddressLiteral dst, Register rscratch = noreg);
1002 
1003   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
1004 
1005   // 32bit can do a case table jump in one instruction but we no longer allow the base
1006   // to be installed in the Address class. This jump will transfer to the address
1007   // contained in the location described by entry (not the address of entry)
1008   void jump(ArrayAddress entry, Register rscratch);
1009 
1010   // Adding more natural conditional jump instructions
1011   void ALWAYSINLINE jo(Label& L, bool maybe_short = true) { jcc(Assembler::overflow, L, maybe_short); }
1012   void ALWAYSINLINE jno(Label& L, bool maybe_short = true) { jcc(Assembler::noOverflow, L, maybe_short); }
1013   void ALWAYSINLINE js(Label& L, bool maybe_short = true) { jcc(Assembler::negative, L, maybe_short); }
1014   void ALWAYSINLINE jns(Label& L, bool maybe_short = true) { jcc(Assembler::positive, L, maybe_short); }
1015   void ALWAYSINLINE je(Label& L, bool maybe_short = true) { jcc(Assembler::equal, L, maybe_short); }
1016   void ALWAYSINLINE jz(Label& L, bool maybe_short = true) { jcc(Assembler::zero, L, maybe_short); }
1017   void ALWAYSINLINE jne(Label& L, bool maybe_short = true) { jcc(Assembler::notEqual, L, maybe_short); }
1018   void ALWAYSINLINE jnz(Label& L, bool maybe_short = true) { jcc(Assembler::notZero, L, maybe_short); }
1019   void ALWAYSINLINE jb(Label& L, bool maybe_short = true) { jcc(Assembler::below, L, maybe_short); }
1020   void ALWAYSINLINE jnae(Label& L, bool maybe_short = true) { jcc(Assembler::below, L, maybe_short); }
1021   void ALWAYSINLINE jc(Label& L, bool maybe_short = true) { jcc(Assembler::carrySet, L, maybe_short); }
1022   void ALWAYSINLINE jnb(Label& L, bool maybe_short = true) { jcc(Assembler::aboveEqual, L, maybe_short); }
1023   void ALWAYSINLINE jae(Label& L, bool maybe_short = true) { jcc(Assembler::aboveEqual, L, maybe_short); }
1024   void ALWAYSINLINE jnc(Label& L, bool maybe_short = true) { jcc(Assembler::carryClear, L, maybe_short); }
1025   void ALWAYSINLINE jbe(Label& L, bool maybe_short = true) { jcc(Assembler::belowEqual, L, maybe_short); }
1026   void ALWAYSINLINE jna(Label& L, bool maybe_short = true) { jcc(Assembler::belowEqual, L, maybe_short); }
1027   void ALWAYSINLINE ja(Label& L, bool maybe_short = true) { jcc(Assembler::above, L, maybe_short); }
1028   void ALWAYSINLINE jnbe(Label& L, bool maybe_short = true) { jcc(Assembler::above, L, maybe_short); }
1029   void ALWAYSINLINE jl(Label& L, bool maybe_short = true) { jcc(Assembler::less, L, maybe_short); }
1030   void ALWAYSINLINE jnge(Label& L, bool maybe_short = true) { jcc(Assembler::less, L, maybe_short); }
1031   void ALWAYSINLINE jge(Label& L, bool maybe_short = true) { jcc(Assembler::greaterEqual, L, maybe_short); }
1032   void ALWAYSINLINE jnl(Label& L, bool maybe_short = true) { jcc(Assembler::greaterEqual, L, maybe_short); }
1033   void ALWAYSINLINE jle(Label& L, bool maybe_short = true) { jcc(Assembler::lessEqual, L, maybe_short); }
1034   void ALWAYSINLINE jng(Label& L, bool maybe_short = true) { jcc(Assembler::lessEqual, L, maybe_short); }
1035   void ALWAYSINLINE jg(Label& L, bool maybe_short = true) { jcc(Assembler::greater, L, maybe_short); }
1036   void ALWAYSINLINE jnle(Label& L, bool maybe_short = true) { jcc(Assembler::greater, L, maybe_short); }
1037   void ALWAYSINLINE jp(Label& L, bool maybe_short = true) { jcc(Assembler::parity, L, maybe_short); }
1038   void ALWAYSINLINE jpe(Label& L, bool maybe_short = true) { jcc(Assembler::parity, L, maybe_short); }
1039   void ALWAYSINLINE jnp(Label& L, bool maybe_short = true) { jcc(Assembler::noParity, L, maybe_short); }
1040   void ALWAYSINLINE jpo(Label& L, bool maybe_short = true) { jcc(Assembler::noParity, L, maybe_short); }
1041   // * No condition for this *  void ALWAYSINLINE jcxz(Label& L, bool maybe_short = true) { jcc(Assembler::cxz, L, maybe_short); }
1042   // * No condition for this *  void ALWAYSINLINE jecxz(Label& L, bool maybe_short = true) { jcc(Assembler::cxz, L, maybe_short); }
1043 
1044   // Short versions of the above
1045   void ALWAYSINLINE jo_b(Label& L) { jccb(Assembler::overflow, L); }
1046   void ALWAYSINLINE jno_b(Label& L) { jccb(Assembler::noOverflow, L); }
1047   void ALWAYSINLINE js_b(Label& L) { jccb(Assembler::negative, L); }
1048   void ALWAYSINLINE jns_b(Label& L) { jccb(Assembler::positive, L); }
1049   void ALWAYSINLINE je_b(Label& L) { jccb(Assembler::equal, L); }
1050   void ALWAYSINLINE jz_b(Label& L) { jccb(Assembler::zero, L); }
1051   void ALWAYSINLINE jne_b(Label& L) { jccb(Assembler::notEqual, L); }
1052   void ALWAYSINLINE jnz_b(Label& L) { jccb(Assembler::notZero, L); }
1053   void ALWAYSINLINE jb_b(Label& L) { jccb(Assembler::below, L); }
1054   void ALWAYSINLINE jnae_b(Label& L) { jccb(Assembler::below, L); }
1055   void ALWAYSINLINE jc_b(Label& L) { jccb(Assembler::carrySet, L); }
1056   void ALWAYSINLINE jnb_b(Label& L) { jccb(Assembler::aboveEqual, L); }
1057   void ALWAYSINLINE jae_b(Label& L) { jccb(Assembler::aboveEqual, L); }
1058   void ALWAYSINLINE jnc_b(Label& L) { jccb(Assembler::carryClear, L); }
1059   void ALWAYSINLINE jbe_b(Label& L) { jccb(Assembler::belowEqual, L); }
1060   void ALWAYSINLINE jna_b(Label& L) { jccb(Assembler::belowEqual, L); }
1061   void ALWAYSINLINE ja_b(Label& L) { jccb(Assembler::above, L); }
1062   void ALWAYSINLINE jnbe_b(Label& L) { jccb(Assembler::above, L); }
1063   void ALWAYSINLINE jl_b(Label& L) { jccb(Assembler::less, L); }
1064   void ALWAYSINLINE jnge_b(Label& L) { jccb(Assembler::less, L); }
1065   void ALWAYSINLINE jge_b(Label& L) { jccb(Assembler::greaterEqual, L); }
1066   void ALWAYSINLINE jnl_b(Label& L) { jccb(Assembler::greaterEqual, L); }
1067   void ALWAYSINLINE jle_b(Label& L) { jccb(Assembler::lessEqual, L); }
1068   void ALWAYSINLINE jng_b(Label& L) { jccb(Assembler::lessEqual, L); }
1069   void ALWAYSINLINE jg_b(Label& L) { jccb(Assembler::greater, L); }
1070   void ALWAYSINLINE jnle_b(Label& L) { jccb(Assembler::greater, L); }
1071   void ALWAYSINLINE jp_b(Label& L) { jccb(Assembler::parity, L); }
1072   void ALWAYSINLINE jpe_b(Label& L) { jccb(Assembler::parity, L); }
1073   void ALWAYSINLINE jnp_b(Label& L) { jccb(Assembler::noParity, L); }
1074   void ALWAYSINLINE jpo_b(Label& L) { jccb(Assembler::noParity, L); }
1075   // * No condition for this *  void ALWAYSINLINE jcxz_b(Label& L) { jccb(Assembler::cxz, L); }
1076   // * No condition for this *  void ALWAYSINLINE jecxz_b(Label& L) { jccb(Assembler::cxz, L); }
1077 
1078   // Floating
1079 
1080   void push_f(XMMRegister r);
1081   void pop_f(XMMRegister r);
1082   void push_d(XMMRegister r);
1083   void pop_d(XMMRegister r);
1084 
1085   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
1086   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
1087   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1088 
1089   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
1090   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
1091   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1092 
1093   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
1094   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
1095   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1096 
1097   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
1098   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
1099   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1100 
1101 #ifndef _LP64
1102   void fadd_s(Address        src) { Assembler::fadd_s(src); }
1103   void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }
1104 
1105   void fldcw(Address        src) { Assembler::fldcw(src); }
1106   void fldcw(AddressLiteral src);
1107 
1108   void fld_s(int index)          { Assembler::fld_s(index); }
1109   void fld_s(Address        src) { Assembler::fld_s(src); }
1110   void fld_s(AddressLiteral src);
1111 
1112   void fld_d(Address        src) { Assembler::fld_d(src); }
1113   void fld_d(AddressLiteral src);
1114 
1115   void fld_x(Address        src) { Assembler::fld_x(src); }
1116   void fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); }
1117 
1118   void fmul_s(Address        src) { Assembler::fmul_s(src); }
1119   void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
1120 #endif // !_LP64
1121 
1122   void cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch = noreg);
1123   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
1124   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
1125 
1126 #ifdef _LP64
1127  private:
1128   void sha256_AVX2_one_round_compute(
1129     Register  reg_old_h,
1130     Register  reg_a,
1131     Register  reg_b,
1132     Register  reg_c,
1133     Register  reg_d,
1134     Register  reg_e,
1135     Register  reg_f,
1136     Register  reg_g,
1137     Register  reg_h,
1138     int iter);
1139   void sha256_AVX2_four_rounds_compute_first(int start);
1140   void sha256_AVX2_four_rounds_compute_last(int start);
1141   void sha256_AVX2_one_round_and_sched(
1142         XMMRegister xmm_0,     /* == ymm4 on iterations 0..3; the four registers then rotate left on iterations 4, 8, 12 */
1143         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
1144         XMMRegister xmm_2,     /* ymm6 */
1145         XMMRegister xmm_3,     /* ymm7 */
1146         Register    reg_a,      /* == eax on iteration 0; the eight registers then rotate right on each subsequent iteration */
1147         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
1148         Register    reg_c,      /* edi */
1149         Register    reg_d,      /* esi */
1150         Register    reg_e,      /* r8d */
1151         Register    reg_f,      /* r9d */
1152         Register    reg_g,      /* r10d */
1153         Register    reg_h,      /* r11d */
1154         int iter);
1155 
1156   void addm(int disp, Register r1, Register r2);
1157 
1158   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1159                                      Register e, Register f, Register g, Register h, int iteration);
1160 
1161   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1162                                           Register a, Register b, Register c, Register d, Register e, Register f,
1163                                           Register g, Register h, int iteration);
1164 
1165   void addmq(int disp, Register r1, Register r2);
1166  public:
1167   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1168                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1169                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1170                    bool multi_block, XMMRegister shuf_mask);
1171   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1172                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1173                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1174                    XMMRegister shuf_mask);
1175   void sha512_update_ni_x1(Register arg_hash, Register arg_msg, Register ofs, Register limit, bool multi_block);
1176 #endif // _LP64
1177 
1178   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1179                 bool multi_block);
1180 
1181   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1182                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1183                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1184                  bool multi_block);
1185 
1186 #ifdef _LP64
1187   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1188                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1189                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1190                    bool multi_block, XMMRegister shuf_mask);
1191 #else
1192   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1193                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1194                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1195                    bool multi_block);
1196 #endif
1197 
1198   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1199                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1200                 Register rax, Register rcx, Register rdx, Register tmp);
1201 
1202 #ifndef _LP64
1203  private:
1204   // Initialized in macroAssembler_x86_constants.cpp
1205   static address ONES;
1206   static address L_2IL0FLOATPACKET_0;
1207   static address PI4_INV;
1208   static address PI4X3;
1209   static address PI4X4;
1210 
1211  public:
1212   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1213                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1214                 Register rax, Register rcx, Register rdx, Register tmp1);
1215 
1216   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1217                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1218                 Register rax, Register rcx, Register rdx, Register tmp);
1219 
1220   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1221                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1222                 Register rdx, Register tmp);
1223 
1224   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1225                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1226                 Register rax, Register rbx, Register rdx);
1227 
1228   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1229                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1230                 Register rax, Register rcx, Register rdx, Register tmp);
1231 
1232   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1233                         Register edx, Register ebx, Register esi, Register edi,
1234                         Register ebp, Register esp);
1235 
1236   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1237                          Register esi, Register edi, Register ebp, Register esp);
1238 
1239   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1240                         Register edx, Register ebx, Register esi, Register edi,
1241                         Register ebp, Register esp);
1242 
1243   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1244                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1245                 Register rax, Register rcx, Register rdx, Register tmp);
1246 #endif // !_LP64
1247 
1248 private:
1249 
1250   // these are private because users should use movflt/movdbl instead
1251 
1252   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1253   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1254   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1255   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1256 
1257   void movlpd(XMMRegister dst, Address        src) {Assembler::movlpd(dst, src); }
1258   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1259 
1260 public:
1261 
1262   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1263   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1264   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1265 
1266   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1267   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1268   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1269 
1270   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1271   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1272   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1273 
1274   using Assembler::vbroadcasti128;
1275   void vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1276 
1277   using Assembler::vbroadcastsd;
1278   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1279 
1280   using Assembler::vbroadcastss;
1281   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1282 
1283   // Vector float blend
1284   void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1285   void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1286 
1287   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1288   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1289   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1290 
1291   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1292   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1293   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1294 
1295   // Move Unaligned Double Quadword
1296   void movdqu(Address     dst, XMMRegister    src);
1297   void movdqu(XMMRegister dst, XMMRegister    src);
1298   void movdqu(XMMRegister dst, Address        src);
1299   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1300 
1301   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1302   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1303   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1304   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1305   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1306   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1307 
1308   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1309   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1310   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1311   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1312   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1313   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1314 
1315   // Safe move operation: lowers to 16-bit moves on targets supporting the
1316   // AVX512F feature and to 64-bit moves on targets supporting the AVX512BW feature.
1317   void kmov(Address  dst, KRegister src);
1318   void kmov(KRegister dst, Address src);
1319   void kmov(KRegister dst, KRegister src);
1320   void kmov(Register dst, KRegister src);
1321   void kmov(KRegister dst, Register src);
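       //
       // An illustrative (hypothetical) use of the width-agnostic kmov helpers:
       // saving and restoring an opmask in a general-purpose register around
       // code that may clobber it. Register choices are examples only.
       //
       //   kmov(rax, k2);   // save the mask
       //   ...              // code that may clobber k2
       //   kmov(k2, rax);   // restore the mask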
1322 
1323   using Assembler::movddup;
1324   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1325 
1326   using Assembler::vmovddup;
1327   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1328 
1329   // AVX Unaligned forms
1330   void vmovdqu(Address     dst, XMMRegister    src);
1331   void vmovdqu(XMMRegister dst, Address        src);
1332   void vmovdqu(XMMRegister dst, XMMRegister    src);
1333   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1334   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1335   void vmovdqu(XMMRegister dst, XMMRegister    src, int vector_len);
1336   void vmovdqu(XMMRegister dst, Address        src, int vector_len);
1337   void vmovdqu(Address     dst, XMMRegister    src, int vector_len);
1338 
1339   // AVX Aligned forms
1340   using Assembler::vmovdqa;
1341   void vmovdqa(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1342   void vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1343 
1344   // AVX512 Unaligned
1345   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1346   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1347   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len);
1348 
1349   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1350   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1351 
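       // Several register-to-register wrappers below elide the instruction when
       // the destination and source are the same register (and, for the masked
       // forms, when no real mask other than k0 is supplied), since such a move
       // would be a no-op.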
1352   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1353     if (dst->encoding() != src->encoding() || mask != k0)  {
1354       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1355     }
1356   }
1357   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1358   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1359   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1360 
1361   void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1362   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1363   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1364 
1365   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1366     if (dst->encoding() != src->encoding() || mask != k0) {
1367       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1368     }
1369   }
1370   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1371   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1372   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1373 
1374   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1375      if (dst->encoding() != src->encoding()) {
1376        Assembler::evmovdqul(dst, src, vector_len);
1377      }
1378   }
1379   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1380   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1381 
1382   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1383     if (dst->encoding() != src->encoding() || mask != k0)  {
1384       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1385     }
1386   }
1387   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1388   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1389   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1390 
1391   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1392     if (dst->encoding() != src->encoding()) {
1393       Assembler::evmovdquq(dst, src, vector_len);
1394     }
1395   }
1396   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1397   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1398   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1399   void evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1400 
1401   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1402     if (dst->encoding() != src->encoding() || mask != k0) {
1403       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1404     }
1405   }
1406   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1407   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1408   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1409   void evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1410 
1411   // Move Aligned Double Quadword
1412   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1413   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1414   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1415 
1416   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1417   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1418   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1419   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1420 
1421   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1422   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1423   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1424 
1425   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1426   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1427   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1428 
1429   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1430   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1431   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1432 
1433   // Carry-Less Multiplication Quadword
1434   void pclmulldq(XMMRegister dst, XMMRegister src) {
1435     // 0x00 - multiply lower 64 bits [0:63]
1436     Assembler::pclmulqdq(dst, src, 0x00);
1437   }
1438   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1439     // 0x11 - multiply upper 64 bits [64:127]
1440     Assembler::pclmulqdq(dst, src, 0x11);
1441   }
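       //
       // For reference, a brief sketch of the pclmulqdq immediate encoding the
       // two wrappers above rely on (register choices are examples only):
       // bit 0 of imm8 picks the quadword of the first source, bit 4 picks the
       // quadword of the second source.
       //
       //   pclmulqdq(xmm0, xmm1, 0x00);  // xmm0[63:0]   * xmm1[63:0]
       //   pclmulqdq(xmm0, xmm1, 0x11);  // xmm0[127:64] * xmm1[127:64]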
1442 
1443   void pcmpeqb(XMMRegister dst, XMMRegister src);
1444   void pcmpeqw(XMMRegister dst, XMMRegister src);
1445 
1446   void pcmpestri(XMMRegister dst, Address src, int imm8);
1447   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1448 
1449   void pmovzxbw(XMMRegister dst, XMMRegister src);
1450   void pmovzxbw(XMMRegister dst, Address src);
1451 
1452   void pmovmskb(Register dst, XMMRegister src);
1453 
1454   void ptest(XMMRegister dst, XMMRegister src);
1455 
1456   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1457   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1458   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1459 
1460   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1461   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1462   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1463 
1464   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1465   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1466   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1467 
1468   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1469   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1470   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1471 
1472   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1473   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1474   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1475 
1476   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1477   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1478   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1479 
1480   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1481   void xorpd(XMMRegister dst, XMMRegister    src);
1482   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1483   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1484 
1485   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1486   void xorps(XMMRegister dst, XMMRegister    src);
1487   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1488   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1489 
1490   // Shuffle Bytes
1491   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1492   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
1493   void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1494   // AVX 3-operand instructions
1495 
1496   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1497   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1498   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1499 
1500   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1501   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1502   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1503 
1504   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1505   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1506 
1507   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1508   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1509   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1510 
1511   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1512   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1513 
1514   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1515   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1516   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1517 
1518   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1519   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1520   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1521 
1522   using Assembler::vpbroadcastd;
1523   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1524 
1525   using Assembler::vpbroadcastq;
1526   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1527 
1528   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1529   void vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
1530 
1531   void vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1532   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1533   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1534 
1535   // Vector compares
1536   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1537     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1538   }
1539   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1540 
1541   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1542     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1543   }
1544   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1545 
1546   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1547     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1548   }
1549   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1550 
1551   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1552     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1553   }
1554   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1555 
1556   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1557 
1558   // Emit a comparison instruction for the specified comparison predicate.
1559   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1560   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1561 
1562   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1563   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1564 
1565   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1566 
1567   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1568   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1569 
1570   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1571   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1572   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1573 
1574   void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmuldq(dst, nds, src, vector_len); }
1575 
1576   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1577   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1578 
1579   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1580   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1581 
1582   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1583   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1584 
1585   void evpsrad(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1586   void evpsrad(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1587 
1588   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1589   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1590 
1591   using Assembler::evpsllw;
1592   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1593     if (!is_varshift) {
1594       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1595     } else {
1596       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1597     }
1598   }
1599   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1600     if (!is_varshift) {
1601       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1602     } else {
1603       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1604     }
1605   }
1606   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1607     if (!is_varshift) {
1608       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1609     } else {
1610       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1611     }
1612   }
1613   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1614     if (!is_varshift) {
1615       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1616     } else {
1617       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1618     }
1619   }
1620   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1621     if (!is_varshift) {
1622       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1623     } else {
1624       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1625     }
1626   }
1627 
1628   using Assembler::evpsrlq;
1629   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1630     if (!is_varshift) {
1631       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1632     } else {
1633       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1634     }
1635   }
1636   using Assembler::evpsraw;
1637   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1638     if (!is_varshift) {
1639       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1640     } else {
1641       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1642     }
1643   }
1644   using Assembler::evpsrad;
1645   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1646     if (!is_varshift) {
1647       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1648     } else {
1649       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1650     }
1651   }
1652   using Assembler::evpsraq;
1653   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1654     if (!is_varshift) {
1655       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1656     } else {
1657       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1658     }
1659   }
1660 
1661   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1662   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1663   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1664   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1665 
1666   void evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1667   void evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1668   void evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1669   void evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1670 
1671   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1672   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1673 
1674   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1675   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1676 
1677   void vptest(XMMRegister dst, XMMRegister src);
1678   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1679 
1680   void punpcklbw(XMMRegister dst, XMMRegister src);
1681   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1682 
1683   void pshufd(XMMRegister dst, Address src, int mode);
1684   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1685 
1686   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1687   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1688 
1689   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1690   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1691   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1692 
1693   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1694   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1695   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1696 
1697   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1698 
1699   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1700   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1701   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1702 
1703   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1704   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1705   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1706 
1707   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1708   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1709   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1710 
1711   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1712   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1713   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1714 
1715   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1716   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1717   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1718 
1719   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1720   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1721   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1722 
1723   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1724   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1725 
1726   // AVX Vector instructions
1727 
1728   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1729   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1730   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1731 
1732   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1733   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1734   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1735 
1736   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1737     if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor requires AVX2; 128-bit vpxor is available on AVX1
1738       Assembler::vpxor(dst, nds, src, vector_len);
1739     else
1740       Assembler::vxorpd(dst, nds, src, vector_len);
1741   }
1742   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1743     if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor requires AVX2; 128-bit vpxor is available on AVX1
1744       Assembler::vpxor(dst, nds, src, vector_len);
1745     else
1746       Assembler::vxorpd(dst, nds, src, vector_len);
1747   }
1748   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1749 
1750   // Simple two-operand versions for AVX2 256-bit vectors
1751   void vpxor(XMMRegister dst, XMMRegister src) {
1752     assert(UseAVX >= 2, "Should be at least AVX2");
1753     Assembler::vpxor(dst, dst, src, AVX_256bit);
1754   }
1755   void vpxor(XMMRegister dst, Address src) {
1756     assert(UseAVX >= 2, "Should be at least AVX2");
1757     Assembler::vpxor(dst, dst, src, AVX_256bit);
1758   }
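       //
       // A common idiom (illustrative only): xor-ing a register with itself via
       // the two-operand form clears the whole 256-bit register, e.g.
       //
       //   vpxor(xmm0, xmm0);   // ymm0 := 0 (requires UseAVX >= 2)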
1759 
1760   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1761   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1762 
1763   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1764     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1765       Assembler::vinserti32x4(dst, nds, src, imm8);
1766     } else if (UseAVX > 1) {
1767       // vinserti128 is available only in AVX2
1768       Assembler::vinserti128(dst, nds, src, imm8);
1769     } else {
1770       Assembler::vinsertf128(dst, nds, src, imm8);
1771     }
1772   }
1773 
1774   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1775     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1776       Assembler::vinserti32x4(dst, nds, src, imm8);
1777     } else if (UseAVX > 1) {
1778       // vinserti128 is available only in AVX2
1779       Assembler::vinserti128(dst, nds, src, imm8);
1780     } else {
1781       Assembler::vinsertf128(dst, nds, src, imm8);
1782     }
1783   }
1784 
1785   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1786     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1787       Assembler::vextracti32x4(dst, src, imm8);
1788     } else if (UseAVX > 1) {
1789       // vextracti128 is available only in AVX2
1790       Assembler::vextracti128(dst, src, imm8);
1791     } else {
1792       Assembler::vextractf128(dst, src, imm8);
1793     }
1794   }
1795 
1796   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1797     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1798       Assembler::vextracti32x4(dst, src, imm8);
1799     } else if (UseAVX > 1) {
1800       // vextracti128 is available only in AVX2
1801       Assembler::vextracti128(dst, src, imm8);
1802     } else {
1803       Assembler::vextractf128(dst, src, imm8);
1804     }
1805   }
1806 
1807   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1808   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1809     vinserti128(dst, dst, src, 1);
1810   }
1811   void vinserti128_high(XMMRegister dst, Address src) {
1812     vinserti128(dst, dst, src, 1);
1813   }
1814   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1815     vextracti128(dst, src, 1);
1816   }
1817   void vextracti128_high(Address dst, XMMRegister src) {
1818     vextracti128(dst, src, 1);
1819   }
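       //
       // Illustrative usage of the high-half helpers above (register choices
       // are examples only):
       //
       //   vextracti128_high(xmm1, xmm2);  // xmm1[127:0]   := ymm2[255:128]
       //   vinserti128_high(xmm2, xmm1);   // ymm2[255:128] := xmm1[127:0]
       //
       // The helpers dispatch to vinserti32x4/vinserti128/vinsertf128 (and the
       // corresponding extracts) according to the available AVX level.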
1820 
1821   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1822     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1823       Assembler::vinsertf32x4(dst, dst, src, 1);
1824     } else {
1825       Assembler::vinsertf128(dst, dst, src, 1);
1826     }
1827   }
1828 
1829   void vinsertf128_high(XMMRegister dst, Address src) {
1830     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1831       Assembler::vinsertf32x4(dst, dst, src, 1);
1832     } else {
1833       Assembler::vinsertf128(dst, dst, src, 1);
1834     }
1835   }
1836 
1837   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1838     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1839       Assembler::vextractf32x4(dst, src, 1);
1840     } else {
1841       Assembler::vextractf128(dst, src, 1);
1842     }
1843   }
1844 
1845   void vextractf128_high(Address dst, XMMRegister src) {
1846     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1847       Assembler::vextractf32x4(dst, src, 1);
1848     } else {
1849       Assembler::vextractf128(dst, src, 1);
1850     }
1851   }
1852 
1853   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1854   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1855     Assembler::vinserti64x4(dst, dst, src, 1);
1856   }
1857   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1858     Assembler::vinsertf64x4(dst, dst, src, 1);
1859   }
1860   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1861     Assembler::vextracti64x4(dst, src, 1);
1862   }
1863   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1864     Assembler::vextractf64x4(dst, src, 1);
1865   }
1866   void vextractf64x4_high(Address dst, XMMRegister src) {
1867     Assembler::vextractf64x4(dst, src, 1);
1868   }
1869   void vinsertf64x4_high(XMMRegister dst, Address src) {
1870     Assembler::vinsertf64x4(dst, dst, src, 1);
1871   }
1872 
1873   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1874   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1875     vinserti128(dst, dst, src, 0);
1876   }
1877   void vinserti128_low(XMMRegister dst, Address src) {
1878     vinserti128(dst, dst, src, 0);
1879   }
1880   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1881     vextracti128(dst, src, 0);
1882   }
1883   void vextracti128_low(Address dst, XMMRegister src) {
1884     vextracti128(dst, src, 0);
1885   }
1886 
1887   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1888     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1889       Assembler::vinsertf32x4(dst, dst, src, 0);
1890     } else {
1891       Assembler::vinsertf128(dst, dst, src, 0);
1892     }
1893   }
1894 
1895   void vinsertf128_low(XMMRegister dst, Address src) {
1896     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1897       Assembler::vinsertf32x4(dst, dst, src, 0);
1898     } else {
1899       Assembler::vinsertf128(dst, dst, src, 0);
1900     }
1901   }
1902 
1903   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1904     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1905       Assembler::vextractf32x4(dst, src, 0);
1906     } else {
1907       Assembler::vextractf128(dst, src, 0);
1908     }
1909   }
1910 
1911   void vextractf128_low(Address dst, XMMRegister src) {
1912     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1913       Assembler::vextractf32x4(dst, src, 0);
1914     } else {
1915       Assembler::vextractf128(dst, src, 0);
1916     }
1917   }
1918 
1919   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1920   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1921     Assembler::vinserti64x4(dst, dst, src, 0);
1922   }
1923   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1924     Assembler::vinsertf64x4(dst, dst, src, 0);
1925   }
1926   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1927     Assembler::vextracti64x4(dst, src, 0);
1928   }
1929   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1930     Assembler::vextractf64x4(dst, src, 0);
1931   }
1932   void vextractf64x4_low(Address dst, XMMRegister src) {
1933     Assembler::vextractf64x4(dst, src, 0);
1934   }
1935   void vinsertf64x4_low(XMMRegister dst, Address src) {
1936     Assembler::vinsertf64x4(dst, dst, src, 0);
1937   }
1938 
1939   // Carry-Less Multiplication Quadword
1940   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1941     // 0x00 - multiply lower 64 bits [0:63]
1942     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1943   }
1944   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1945     // 0x11 - multiply upper 64 bits [64:127]
1946     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1947   }
1948   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1949     // 0x10 - multiply nds[0:63] and src[64:127]
1950     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1951   }
1952   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1953     // 0x01 - multiply nds[64:127] and src[0:63]
1954     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1955   }
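       //
       // Illustrative sketch (not a HotSpot routine): the four wrappers above
       // cover every quadword pairing, so a full 128x128-bit carry-less product
       // a * b can be assembled schoolbook-style from the partial products
       //
       //   lo  = clmul(a[63:0],   b[63:0])      // vpclmulldq
       //   hi  = clmul(a[127:64], b[127:64])    // vpclmulhdq
       //   mid = clmul(a[63:0],   b[127:64]) ^ clmul(a[127:64], b[63:0])
       //                                        // vpclmullqhqdq ^ vpclmulhqlqdq
       //
       // with the 256-bit result being (hi:lo) ^ (mid << 64).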
1956 
1957   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1958     // 0x00 - multiply lower 64 bits [0:63]
1959     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1960   }
1961   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1962     // 0x11 - multiply upper 64 bits [64:127]
1963     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1964   }
1965 
1966   // AVX-512 mask operations.
1967   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1968   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1969   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1970   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1971   void kortest(uint masklen, KRegister src1, KRegister src2);
1972   void ktest(uint masklen, KRegister src1, KRegister src2);
1973 
1974   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1975   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1976 
1977   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1978   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1979 
1980   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1981   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1982 
1983   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1984   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1985 
1986   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1987   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1988   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1989   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1990 
1991   using Assembler::evpandq;
1992   void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1993 
1994   using Assembler::evpaddq;
1995   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1996 
1997   using Assembler::evporq;
1998   void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1999 
2000   using Assembler::vpshufb;
2001   void vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
2002 
2003   using Assembler::vpor;
2004   void vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
2005 
2006   using Assembler::vpternlogq;
2007   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
2008 
2009   void cmov32( Condition cc, Register dst, Address  src);
2010   void cmov32( Condition cc, Register dst, Register src);
2011 
2012   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
2013 
2014   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
2015   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
2016 
2017   void movoop(Register dst, jobject obj);
2018   void movoop(Address  dst, jobject obj, Register rscratch);
2019 
2020   void mov_metadata(Register dst, Metadata* obj);
2021   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
2022 
2023   void movptr(Register     dst, Register       src);
2024   void movptr(Register     dst, Address        src);
2025   void movptr(Register     dst, AddressLiteral src);
2026   void movptr(Register     dst, ArrayAddress   src);
2027   void movptr(Register     dst, intptr_t       src);
2028   void movptr(Address      dst, Register       src);
2029   void movptr(Address      dst, int32_t        imm);
2030   void movptr(Address      dst, intptr_t       src, Register rscratch);
2031   void movptr(ArrayAddress dst, Register       src, Register rscratch);
2032 
2033   void movptr(Register dst, RegisterOrConstant src) {
2034     if (src.is_constant()) movptr(dst, src.as_constant());
2035     else                   movptr(dst, src.as_register());
2036   }
2037 
2038 
2039   // named mov32 to avoid hiding Assembler::movl
2040   void mov32(Register       dst, AddressLiteral src);
2041   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
2042 
2043   // Import the other mov() overloads from the parent class, or else
2044   // they would be hidden by the following declarations.
2045   using Assembler::movdl;
2046   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
2047 
2048   using Assembler::movq;
2049   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
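       //
       // (Illustrative C++ note: declaring an overload of movdl/movq here hides
       //  every base-class overload of the same name, so without the
       //  using-declarations a call such as movdl(xmm0, rax) would no longer
       //  resolve to the inherited Assembler::movdl(XMMRegister, Register).)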
2050 
2051   // Can push a value or an effective address
2052   void pushptr(AddressLiteral src, Register rscratch);
2053 
2054   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
2055   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
2056 
2057   void pushoop(jobject obj, Register rscratch);
2058   void pushklass(Metadata* obj, Register rscratch);
2059 
2060   // sign-extend a 32-bit ('l') value to a pointer-sized element as needed
2061   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
2062   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
2063 
2064 
2065  public:
2066   // clear memory of size 'cnt' qwords, starting at 'base';
2067   // if 'is_large' is set, do not try to produce a short loop
2068   void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large, KRegister mask=knoreg);
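       //
       // Illustrative call (hypothetical register assignment): zero 'rcx' qwords
       // starting at 'rdi' without forcing the large-block path:
       //
       //   clear_mem(rdi, rcx, rax, xmm0, /*is_large*/ false);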
2069 
2070   // memory-clearing initialization sequence for a constant size 'cnt';
2071   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
2072 
2073   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
2074   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
2075 
2076   // Fill primitive arrays
2077   void generate_fill(BasicType t, bool aligned,
2078                      Register to, Register value, Register count,
2079                      Register rtmp, XMMRegister xtmp);
2080 
2081   void encode_iso_array(Register src, Register dst, Register len,
2082                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
2083                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
2084 
2085 #ifdef _LP64
2086   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
2087   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2088                              Register y, Register y_idx, Register z,
2089                              Register carry, Register product,
2090                              Register idx, Register kdx);
2091   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
2092                               Register yz_idx, Register idx,
2093                               Register carry, Register product, int offset);
2094   void multiply_128_x_128_bmi2_loop(Register y, Register z,
2095                                     Register carry, Register carry2,
2096                                     Register idx, Register jdx,
2097                                     Register yz_idx1, Register yz_idx2,
2098                                     Register tmp, Register tmp3, Register tmp4);
2099   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
2100                                Register yz_idx, Register idx, Register jdx,
2101                                Register carry, Register product,
2102                                Register carry2);
2103   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
2104                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
2105   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
2106                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
2107   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
2108                             Register tmp2);
2109   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
2110                        Register rdxReg, Register raxReg);
2111   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
2112   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
2113                        Register tmp3, Register tmp4);
2114   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
2115                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
2116 
2117   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
2118                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
2119                Register raxReg);
2120   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
2121                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
2122                Register raxReg);
2123   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
2124                            Register result, Register tmp1, Register tmp2,
2125                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
2126 #endif
2127 
2128   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
2129   void update_byte_crc32(Register crc, Register val, Register table);
2130   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
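
  // Illustrative call shape only (register assignments are the caller's choice):
  //   __ kernel_crc32(crc, buf, len, table, tmp);
  // where 'crc' holds the incoming CRC value, 'buf'/'len' describe the byte range,
  // 'table' points at the CRC lookup/constant data, and 'tmp' is scratch.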
2131 
2132 
2133 #ifdef _LP64
2134   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
2135   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
2136                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
2137                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
2138 #endif // _LP64
2139 
2140   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
  // Note on the naming convention:
2142   // Prefix w = register only used on a Westmere+ architecture
2143   // Prefix n = register only used on a Nehalem architecture
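  // Roughly speaking, the 'w'-prefixed registers below are only live on the
  // PCLMULQDQ path (is_pclmulqdq_supported == true), while the 'n'-prefixed
  // registers are only live on the fallback path.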
2144 #ifdef _LP64
2145   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2146                        Register tmp1, Register tmp2, Register tmp3);
2147 #else
2148   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2149                        Register tmp1, Register tmp2, Register tmp3,
2150                        XMMRegister xtmp1, XMMRegister xtmp2);
2151 #endif
2152   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
2153                         Register in_out,
2154                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
2155                         XMMRegister w_xtmp2,
2156                         Register tmp1,
2157                         Register n_tmp2, Register n_tmp3);
2158   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
2159                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2160                        Register tmp1, Register tmp2,
2161                        Register n_tmp3);
2162   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
2163                          Register in_out1, Register in_out2, Register in_out3,
2164                          Register tmp1, Register tmp2, Register tmp3,
2165                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2166                          Register tmp4, Register tmp5,
2167                          Register n_tmp6);
2168   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
2169                             Register tmp1, Register tmp2, Register tmp3,
2170                             Register tmp4, Register tmp5, Register tmp6,
2171                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2172                             bool is_pclmulqdq_supported);
2173   // Fold 128-bit data chunk
2174   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
2175   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
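
  // Informal background: 128-bit folding combines the running remainder with the
  // next data chunk via carry-less multiplication (PCLMULQDQ). Each 64-bit half of
  // 'xcrc' is multiplied by a precomputed constant from 'xK' (a power of x modulo
  // the CRC polynomial) and the results are xor-ed together with the new data.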
2176 #ifdef _LP64
2177   // Fold 512-bit data chunk
2178   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
2179 #endif // _LP64
2180   // Fold 8-bit data
2181   void fold_8bit_crc32(Register crc, Register table, Register tmp);
2182   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
2183 
2184   // Compress char[] array to byte[].
2185   void char_array_compress(Register src, Register dst, Register len,
2186                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
2187                            XMMRegister tmp4, Register tmp5, Register result,
2188                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
2189 
2190   // Inflate byte[] array to char[].
2191   void byte_array_inflate(Register src, Register dst, Register len,
2192                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
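
  // Rough call shape only (parameter names as declared above; the trailing mask
  // registers default to knoreg when AVX-512 masking is not used):
  //   __ char_array_compress(src, dst, len, tmp1, tmp2, tmp3, tmp4, tmp5, result);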
2193 
2194   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
2195                    Register length, Register temp, int vec_enc);
2196 
2197   void fill64_masked(uint shift, Register dst, int disp,
2198                          XMMRegister xmm, KRegister mask, Register length,
2199                          Register temp, bool use64byteVector = false);
2200 
2201   void fill32_masked(uint shift, Register dst, int disp,
2202                          XMMRegister xmm, KRegister mask, Register length,
2203                          Register temp);
2204 
2205   void fill32(Address dst, XMMRegister xmm);
2206 
2207   void fill32(Register dst, int disp, XMMRegister xmm);
2208 
2209   void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
2210 
  void fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector = false);
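
  // Illustrative only ('xmm0' is assumed to already hold the replicated fill value):
  //   __ fill32(rdi, 0, xmm0);                           // store 32 bytes at [rdi]
  //   __ fill64(rdi, 0, xmm0, /*use64byteVector=*/true); // single 64-byte store when 512-bit vectors are enabled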
2212 
2213 #ifdef _LP64
2214   void convert_f2i(Register dst, XMMRegister src);
2215   void convert_d2i(Register dst, XMMRegister src);
2216   void convert_f2l(Register dst, XMMRegister src);
2217   void convert_d2l(Register dst, XMMRegister src);
2218   void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
2219   void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);
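
  // These back the Java-semantics floating-point conversions (NaN maps to 0,
  // out-of-range values saturate), e.g. a minimal sketch:
  //   __ convert_d2l(rax, xmm0);   // rax = (jlong) value in xmm0, with the special cases handled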
2220 
2221   void cache_wb(Address line);
2222   void cache_wbsync(bool is_pre);
2223 
2224 #ifdef COMPILER2_OR_JVMCI
2225   void generate_fill_avx3(BasicType type, Register to, Register value,
2226                           Register count, Register rtmp, XMMRegister xtmp);
2227 #endif // COMPILER2_OR_JVMCI
2228 #endif // _LP64
2229 
2230   void vallones(XMMRegister dst, int vector_len);
2231 
2232   void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);
2233 
2234   void lightweight_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow);
2235   void lightweight_unlock(Register obj, Register reg_rax, Register tmp, Label& slow);
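
  // A minimal sketch of the fast-path use ('box', 'obj' and 'tmp' are placeholder
  // registers; the reg_rax argument must be rax itself, as the name suggests):
  //   Label slow;
  //   __ lightweight_lock(box, obj, rax, tmp, slow);
  //   // fall through on success; 'slow' branches to the runtime locking path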
2236 
2237 #ifdef _LP64
2238   void save_legacy_gprs();
2239   void restore_legacy_gprs();
2240   void load_aotrc_address(Register reg, address a);
2241   void setcc(Assembler::Condition comparison, Register dst);
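
  // Rough sketch (assuming setcc zero-extends the flag byte into the full register):
  //   __ cmpl(rax, rbx);
  //   __ setcc(Assembler::less, rcx);   // rcx = (rax < rbx) ? 1 : 0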
2242 #endif
2243 };
2244 
2245 #endif // CPU_X86_MACROASSEMBLER_X86_HPP