1 /*
   2  * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef CPU_X86_MACROASSEMBLER_X86_HPP
  26 #define CPU_X86_MACROASSEMBLER_X86_HPP
  27 
  28 #include "asm/assembler.hpp"
  29 #include "asm/register.hpp"
  30 #include "code/vmreg.inline.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "utilities/macros.hpp"
  33 #include "runtime/signature.hpp"
  34 #include "runtime/vm_version.hpp"
  35 #include "utilities/checkedCast.hpp"
  36 
  37 class ciInlineKlass;
  38 
  39 // MacroAssembler extends Assembler by frequently used macros.
  40 //
  41 // Instructions for which a 'better' code sequence exists depending
  42 // on arguments should also go in here.
  43 
  44 class MacroAssembler: public Assembler {
  45   friend class LIR_Assembler;
  46   friend class Runtime1;      // as_Address()
  47 
  48  public:
  49   // Support for VM calls
  50   //
  51   // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  52   // may customize this version by overriding it for its purposes (e.g., to save/restore
  53   // additional registers when doing a VM call).
  54 
  55   virtual void call_VM_leaf_base(
  56     address entry_point,               // the entry point
  57     int     number_of_arguments        // the number of arguments to pop after the call
  58   );
  59 
  60  protected:
  61   // This is the base routine called by the different versions of call_VM. The interpreter
  62   // may customize this version by overriding it for its purposes (e.g., to save/restore
  63   // additional registers when doing a VM call).
  64   //
  65   // call_VM_base returns the register which contains the thread upon return.
  66   // If no last_java_sp is specified (noreg) then rsp will be used instead.
  67   virtual void call_VM_base(           // returns the register containing the thread upon return
  68     Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
  69     Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
  70     address  entry_point,              // the entry point
  71     int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
  72     bool     check_exceptions          // whether to check for pending exceptions after return
  73   );
  74 
  75   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
  76 
  77  public:
  78   MacroAssembler(CodeBuffer* code) : Assembler(code) {}
  79 
  80   // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  81   // The implementation is only non-empty for the InterpreterMacroAssembler,
  82   // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  83   virtual void check_and_handle_popframe();
  84   virtual void check_and_handle_earlyret();
  85 
  86   Address as_Address(AddressLiteral adr);
  87   Address as_Address(ArrayAddress adr, Register rscratch);
  88 
  89   // Support for null-checks
  90   //
  91   // Generates code that causes a null OS exception if the content of reg is null.
  92   // If the accessed location is M[reg + offset] and the offset is known, provide the
  93   // offset. No explicit code generation is needed if the offset is within a certain
  94   // range (0 <= offset <= page_size).
  95 
  96   void null_check(Register reg, int offset = -1);
  97   static bool needs_explicit_null_check(intptr_t offset);
  98   static bool uses_implicit_null_check(void* address);
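       // Usage sketch (illustrative, not a definition from this file): before the
       // generated code dereferences M[recv + field_offset], it can emit
       //
       //   __ null_check(recv, field_offset);
       //
       // which produces no code when the offset is small enough that the access
       // itself traps on a null recv, and an explicit probe of M[recv] otherwise.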
  99 
 100   // markWord tests, kills markWord reg
 101   void test_markword_is_inline_type(Register markword, Label& is_inline_type);
 102 
 103   // inlineKlass queries, kills temp_reg
 104   void test_klass_is_inline_type(Register klass, Register temp_reg, Label& is_inline_type);
 105   void test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type);
 106 
 107   void test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free);
 108   void test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free);
 109   void test_field_is_flat(Register flags, Register temp_reg, Label& is_flat);
 110   void test_field_has_null_marker(Register flags, Register temp_reg, Label& has_null_marker);
 111 
 112   // Check oops for special arrays, i.e. flat arrays and/or null-free arrays
 113   void test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label);
 114   void test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array);
 115   void test_non_flat_array_oop(Register oop, Register temp_reg, Label& is_non_flat_array);
 116   void test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array);
 117   void test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array);
 118 
 119   // Check array klass layout helper for flat or null-free arrays...
 120   void test_flat_array_layout(Register lh, Label& is_flat_array);
 121   void test_non_flat_array_layout(Register lh, Label& is_non_flat_array);
 122 
 123   // Required platform-specific helpers for Label::patch_instructions.
 124   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
 125   void pd_patch_instruction(address branch, address target, const char* file, int line) {
 126     unsigned char op = branch[0];
 127     assert(op == 0xE8 /* call */ ||
 128         op == 0xE9 /* jmp */ ||
 129         op == 0xEB /* short jmp */ ||
 130         (op & 0xF0) == 0x70 /* short jcc */ ||
 131         (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
 132         (op == 0xC7 && branch[1] == 0xF8) /* xbegin */ ||
 133         (op == 0x8D) /* lea */,
 134         "Invalid opcode at patch point");
 135 
 136     if (op == 0xEB || (op & 0xF0) == 0x70) {
 137       // short offset operators (jmp and jcc)
 138       char* disp = (char*) &branch[1];
 139       int imm8 = checked_cast<int>(target - (address) &disp[1]);
 140       guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
 141                 file == nullptr ? "<null>" : file, line);
 142       *disp = (char)imm8;
 143     } else {
 144       int* disp = (int*) &branch[(op == 0x0F || op == 0xC7 || op == 0x8D) ? 2 : 1];
 145       int imm32 = checked_cast<int>(target - (address) &disp[1]);
 146       *disp = imm32;
 147     }
 148   }
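       // Worked example (for orientation only): for a rel32 jmp (opcode 0xE9) at
       // address B, the displacement bytes start at B + 1, so the patched value is
       //
       //   disp32 = target - (B + 5)
       //
       // i.e. the branch is relative to the end of the instruction, which is exactly
       // what &disp[1] computes above for both the rel8 and rel32 cases.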
 149 
 150   // The following 4 methods return the offset of the appropriate move instruction
 151 
 152   // Support for fast byte/short loading with zero extension (depending on particular CPU)
 153   int load_unsigned_byte(Register dst, Address src);
 154   int load_unsigned_short(Register dst, Address src);
 155 
 156   // Support for fast byte/short loading with sign extension (depending on particular CPU)
 157   int load_signed_byte(Register dst, Address src);
 158   int load_signed_short(Register dst, Address src);
 159 
 160   // Support for sign-extension (hi:lo = extend_sign(lo))
 161   void extend_sign(Register hi, Register lo);
 162 
 163   // Load and store values by size and signed-ness
 164   void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
 165   void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);
 166 
 167   // Support for inc/dec with optimal instruction selection depending on value
 168 
 169   void increment(Register reg, int value = 1) { incrementq(reg, value); }
 170   void decrement(Register reg, int value = 1) { decrementq(reg, value); }
 171   void increment(Address dst, int value = 1)  { incrementq(dst, value); }
 172   void decrement(Address dst, int value = 1)  { decrementq(dst, value); }
 173 
 174   void decrementl(Address dst, int value = 1);
 175   void decrementl(Register reg, int value = 1);
 176 
 177   void decrementq(Register reg, int value = 1);
 178   void decrementq(Address dst, int value = 1);
 179 
 180   void incrementl(Address dst, int value = 1);
 181   void incrementl(Register reg, int value = 1);
 182 
 183   void incrementq(Register reg, int value = 1);
 184   void incrementq(Address dst, int value = 1);
 185 
 186   void incrementl(AddressLiteral dst, Register rscratch = noreg);
 187   void incrementl(ArrayAddress   dst, Register rscratch);
 188 
 189   void incrementq(AddressLiteral dst, Register rscratch = noreg);
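       // Behaviour sketch (illustrative, no particular encoding is guaranteed):
       //
       //   __ increment(rbx);      // may emit "incq rbx"
       //   __ increment(rbx, 8);   // may emit "addq rbx, 8"
       //   __ increment(rbx, 0);   // may emit nothing at all
       //
       // so callers should not depend on which instruction is actually produced.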
 190 
 191   // Support optimal SSE move instructions.
 192   void movflt(XMMRegister dst, XMMRegister src) {
 193     if (dst->encoding() == src->encoding()) return;
 194     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
 195     else                       { movss (dst, src); return; }
 196   }
 197   void movflt(XMMRegister dst, Address src) { movss(dst, src); }
 198   void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 199   void movflt(Address dst, XMMRegister src) { movss(dst, src); }
 200 
 201   // Move with zero extension
 202   void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }
 203 
 204   void movdbl(XMMRegister dst, XMMRegister src) {
 205     if (dst->encoding() == src->encoding()) return;
 206     if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
 207     else                       { movsd (dst, src); return; }
 208   }
 209 
 210   void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 211 
 212   void movdbl(XMMRegister dst, Address src) {
 213     if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
 214     else                         { movlpd(dst, src); return; }
 215   }
 216   void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }
 217 
 218   void flt_to_flt16(Register dst, XMMRegister src, XMMRegister tmp) {
 219     // Use a separate tmp XMM register because the caller may
 220     // require the src XMM register to be unchanged (as in x86.ad).
 221     vcvtps2ph(tmp, src, 0x04, Assembler::AVX_128bit);
 222     movdl(dst, tmp);
 223     movswl(dst, dst);
 224   }
 225 
 226   void flt16_to_flt(XMMRegister dst, Register src) {
 227     movdl(dst, src);
 228     vcvtph2ps(dst, dst, Assembler::AVX_128bit);
 229   }
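       // Note: flt_to_flt16 leaves the binary16 value sign-extended into the low
       // 32 bits of dst (the final movswl), matching Java's Float.floatToFloat16
       // which returns a short; flt16_to_flt widens it back via vcvtph2ps.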
 230 
 231   // Alignment
 232   void align32();
 233   void align64();
 234   void align(uint modulus);
 235   void align(uint modulus, uint target);
 236 
 237   void post_call_nop();
 238   // A 5 byte nop that is safe for patching (see patch_verified_entry)
 239   void fat_nop();
 240 
 241   // Stack frame creation/removal
 242   void enter();
 243   void leave();
 244 
 245   // Support for getting the JavaThread pointer (i.e., a reference to thread-local information).
 246   // The pointer will be loaded into the thread register. This is a slow version that does a native call.
 247   // Normally, the JavaThread pointer is already available in r15_thread; use that where possible.
 248   void get_thread_slow(Register thread);
 249 
 250   // Support for argument shuffling
 251 
 252   // bias in bytes
 253   void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 254   void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 255   void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 256   void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 257   void move_ptr(VMRegPair src, VMRegPair dst);
 258   void object_move(OopMap* map,
 259                    int oop_handle_offset,
 260                    int framesize_in_slots,
 261                    VMRegPair src,
 262                    VMRegPair dst,
 263                    bool is_receiver,
 264                    int* receiver_offset);
 265 
 266   // Support for VM calls
 267   //
 268   // It is imperative that all calls into the VM are handled via the call_VM macros.
 269   // They make sure that the stack linkage is set up correctly. call_VM's correspond
 270   // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
 271 
 272 
 273   void call_VM(Register oop_result,
 274                address entry_point,
 275                bool check_exceptions = true);
 276   void call_VM(Register oop_result,
 277                address entry_point,
 278                Register arg_1,
 279                bool check_exceptions = true);
 280   void call_VM(Register oop_result,
 281                address entry_point,
 282                Register arg_1, Register arg_2,
 283                bool check_exceptions = true);
 284   void call_VM(Register oop_result,
 285                address entry_point,
 286                Register arg_1, Register arg_2, Register arg_3,
 287                bool check_exceptions = true);
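       // Usage sketch (illustrative): a runtime call that returns an oop. The entry
       // point and argument registers below are examples only, not requirements of
       // this interface:
       //
       //   __ call_VM(rax,
       //              CAST_FROM_FN_PTR(address, InterpreterRuntime::some_entry),  // hypothetical entry
       //              c_rarg1, c_rarg2);
       //
       // The macro sets up the last_Java_frame linkage, passes the current thread as
       // the implicit first C argument, and (by default) checks for a pending
       // exception when the call returns.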
 288 
 289   // Overloadings with last_Java_sp
 290   void call_VM(Register oop_result,
 291                Register last_java_sp,
 292                address entry_point,
 293                int number_of_arguments = 0,
 294                bool check_exceptions = true);
 295   void call_VM(Register oop_result,
 296                Register last_java_sp,
 297                address entry_point,
 298                Register arg_1,
 299                bool check_exceptions = true);
 300   void call_VM(Register oop_result,
 301                Register last_java_sp,
 302                address entry_point,
 303                Register arg_1, Register arg_2,
 304                bool check_exceptions = true);
 305   void call_VM(Register oop_result,
 306                Register last_java_sp,
 307                address entry_point,
 308                Register arg_1, Register arg_2, Register arg_3,
 309                bool check_exceptions = true);
 310 
 311   void get_vm_result_oop(Register oop_result);
 312   void get_vm_result_metadata(Register metadata_result);
 313 
 314   // These always tightly bind to MacroAssembler::call_VM_base
 315   // bypassing the virtual implementation
 316   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
 317   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
 318   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
 319   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
 320   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);
 321 
 322   void call_VM_leaf0(address entry_point);
 323   void call_VM_leaf(address entry_point,
 324                     int number_of_arguments = 0);
 325   void call_VM_leaf(address entry_point,
 326                     Register arg_1);
 327   void call_VM_leaf(address entry_point,
 328                     Register arg_1, Register arg_2);
 329   void call_VM_leaf(address entry_point,
 330                     Register arg_1, Register arg_2, Register arg_3);
 331 
 332   void call_VM_leaf(address entry_point,
 333                     Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 334 
 335   // These always tightly bind to MacroAssembler::call_VM_leaf_base
 336   // bypassing the virtual implementation
 337   void super_call_VM_leaf(address entry_point);
 338   void super_call_VM_leaf(address entry_point, Register arg_1);
 339   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
 340   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
 341   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 342 
 343   void set_last_Java_frame(Register last_java_sp,
 344                            Register last_java_fp,
 345                            address  last_java_pc,
 346                            Register rscratch);
 347 
 348   void set_last_Java_frame(Register last_java_sp,
 349                            Register last_java_fp,
 350                            Label &last_java_pc,
 351                            Register scratch);
 352 
 353   void reset_last_Java_frame(bool clear_fp);
 354 
 355   // jobjects
 356   void clear_jobject_tag(Register possibly_non_local);
 357   void resolve_jobject(Register value, Register tmp);
 358   void resolve_global_jobject(Register value, Register tmp);
 359 
 360   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
 361   void c2bool(Register x);
 362 
 363   // C++ bool manipulation
 364 
 365   void movbool(Register dst, Address src);
 366   void movbool(Address dst, bool boolconst);
 367   void movbool(Address dst, Register src);
 368   void testbool(Register dst);
 369 
 370   void resolve_oop_handle(Register result, Register tmp);
 371   void resolve_weak_handle(Register result, Register tmp);
 372   void load_mirror(Register mirror, Register method, Register tmp);
 373   void load_method_holder_cld(Register rresult, Register rmethod);
 374 
 375   void load_method_holder(Register holder, Register method);
 376 
 377   // oop manipulations
 378 
 379   // Load oopDesc._metadata without decode (useful for direct Klass* compare from oops)
 380   void load_metadata(Register dst, Register src);
 381   void load_narrow_klass_compact(Register dst, Register src);
 382   void load_klass(Register dst, Register src, Register tmp);
 383   void store_klass(Register dst, Register src, Register tmp);
 384 
 385   // Compares the Klass pointer of an object to a given Klass (which might be narrow,
 386   // depending on UseCompressedClassPointers).
 387   void cmp_klass(Register klass, Register obj, Register tmp);
 388 
 389   // Compares the Klass pointer of two objects obj1 and obj2. Result is in the condition flags.
 390   // Uses tmp1 and tmp2 as temporary registers.
 391   void cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2);
 392 
 393   void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
 394                       Register tmp1);
 395   void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
 396                        Register tmp1, Register tmp2, Register tmp3);
 397 
 398   void flat_field_copy(DecoratorSet decorators, Register src, Register dst, Register inline_layout_info);
 399 
 400   // inline type data payload offsets...
 401   void payload_offset(Register inline_klass, Register offset);
 402   void payload_addr(Register oop, Register data, Register inline_klass);
 403   // get data payload ptr of a flat value array at index; kills rcx and index
 404   void data_for_value_array_index(Register array, Register array_klass,
 405                                   Register index, Register data);
 406 
 407   void load_heap_oop(Register dst, Address src, Register tmp1 = noreg, DecoratorSet decorators = 0);
 408   void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg, DecoratorSet decorators = 0);
 409   void store_heap_oop(Address dst, Register val, Register tmp1 = noreg,
 410                       Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
 411 
 412   // Used for storing null. All other oop constants should be
 413   // stored using routines that take a jobject.
 414   void store_heap_oop_null(Address dst);
 415 
 416   void load_prototype_header(Register dst, Register src, Register tmp);
 417 
 418   void store_klass_gap(Register dst, Register src);
 419 
 420   // This dummy is to prevent a call to store_heap_oop from
 421   // converting a zero (like null) into a Register by giving
 422   // the compiler two choices it can't resolve
 423 
 424   void store_heap_oop(Address dst, void* dummy);
 425 
 426   void encode_heap_oop(Register r);
 427   void decode_heap_oop(Register r);
 428   void encode_heap_oop_not_null(Register r);
 429   void decode_heap_oop_not_null(Register r);
 430   void encode_heap_oop_not_null(Register dst, Register src);
 431   void decode_heap_oop_not_null(Register dst, Register src);
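       // Background sketch (assuming the usual compressed-oops scheme): with the
       // current narrow-oop base and shift, encode/decode amount to
       //
       //   narrow = (oop - base) >> shift;
       //   oop    = base + ((uintptr_t)narrow << shift);
       //
       // The *_not_null variants may skip the null handling and can therefore use a
       // shorter instruction sequence.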
 432 
 433   void set_narrow_oop(Register dst, jobject obj);
 434   void set_narrow_oop(Address dst, jobject obj);
 435   void cmp_narrow_oop(Register dst, jobject obj);
 436   void cmp_narrow_oop(Address dst, jobject obj);
 437 
 438   void encode_klass_not_null(Register r, Register tmp);
 439   void decode_klass_not_null(Register r, Register tmp);
 440   void encode_and_move_klass_not_null(Register dst, Register src);
 441   void decode_and_move_klass_not_null(Register dst, Register src);
 442   void set_narrow_klass(Register dst, Klass* k);
 443   void set_narrow_klass(Address dst, Klass* k);
 444   void cmp_narrow_klass(Register dst, Klass* k);
 445   void cmp_narrow_klass(Address dst, Klass* k);
 446 
 447   // If the heap base register is used, reinitialize it with the correct value
 448   void reinit_heapbase();
 449 
 450   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 451 
 452   // Int division/remainder for Java
 453   // (as idivl, but checks for special case as described in JVM spec.)
 454   // returns idivl instruction offset for implicit exception handling
 455   int corrected_idivl(Register reg);
 456 
 457   // Long division/remainder for Java
 458   // (as idivq, but checks for special case as described in JVM spec.)
 459   // returns idivq instruction offset for implicit exception handling
 460   int corrected_idivq(Register reg);
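       // The special case referred to above: the JVM spec requires
       // min_jint / -1 == min_jint (and min_jlong / -1 == min_jlong) with a
       // remainder of 0, whereas a raw idivl/idivq would raise #DE. One possible
       // shape of the guard (sketch only; dividend in rax, remainder in rdx):
       //
       //   cmpl(rax, min_jint);  jcc(notEqual, normal);
       //   xorl(rdx, rdx);       cmpl(reg, -1);  jcc(equal, done);
       //   bind(normal);  cdql();  idivl(reg);
       //   bind(done);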
 461 
 462   void int3();
 463 
 464   // Long operation macros for a 32bit cpu
 465   // Long negation for Java
 466   void lneg(Register hi, Register lo);
 467 
 468   // Long multiplication for Java
 469   // (destroys contents of eax, ebx, ecx and edx)
 470   void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y
 471 
 472   // Long shifts for Java
 473   // (semantics as described in JVM spec.)
 474   void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
 475   void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)
 476 
 477   // Long compare for Java
 478   // (semantics as described in JVM spec.)
 479   void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)
 480 
 481 
 482   // misc
 483 
 484   // Sign extension
 485   void sign_extend_short(Register reg);
 486   void sign_extend_byte(Register reg);
 487 
 488   // Division by power of 2, rounding towards 0
 489   void division_with_shift(Register reg, int shift_value);
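       // Why a plain arithmetic shift is not enough: sar rounds toward negative
       // infinity while Java division rounds toward zero. E.g. -7 / 4 must be -1,
       // but -7 >> 2 is -2; adding a bias of (2^shift - 1) to negative dividends
       // first fixes this: (-7 + 3) >> 2 == -1.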
 490 
 491   // dst = c = a * b + c
 492   void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 493   void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 494 
 495   void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 496   void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 497   void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 498   void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 499 
 500 
 501   // same as fcmp2int, but using SSE2
 502   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 503   void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 504 
 505   void push_IU_state();
 506   void pop_IU_state();
 507 
 508   void push_FPU_state();
 509   void pop_FPU_state();
 510 
 511   void push_CPU_state();
 512   void pop_CPU_state();
 513 
 514   void push_cont_fastpath();
 515   void pop_cont_fastpath();
 516 
 517   void inc_held_monitor_count();
 518   void dec_held_monitor_count();
 519 
 520   DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)
 521 
 522   // Round reg up to a multiple of modulus (modulus must be a power of two)
 523   void round_to(Register reg, int modulus);
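       // The usual idiom behind this (sketch, assuming a power-of-two modulus):
       //
       //   addptr(reg, modulus - 1);
       //   andptr(reg, -modulus);
       //
       // e.g. rounding 13 up to a multiple of 8: (13 + 7) & -8 == 16.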
 524 
 525 private:
 526   // General purpose and XMM registers potentially clobbered by native code; there
 527   // is no need for FPU or AVX opmask related methods because C1 and the interpreter
 528   // - always save/restore the FPU state as a whole
 529   // - do not care about the AVX-512 opmask registers
 530   static RegSet call_clobbered_gp_registers();
 531   static XMMRegSet call_clobbered_xmm_registers();
 532 
 533   void push_set(XMMRegSet set, int offset);
 534   void pop_set(XMMRegSet set, int offset);
 535 
 536 public:
 537   void push_set(RegSet set, int offset = -1);
 538   void pop_set(RegSet set, int offset = -1);
 539 
 540   // Push and pop everything that might be clobbered by a native
 541   // runtime call.
 542   // Only save the lower 64 bits of each vector register.
 543   // Additional registers can be excluded in a passed RegSet.
 544   void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
 545   void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);
 546 
 547   void push_call_clobbered_registers(bool save_fpu = true) {
 548     push_call_clobbered_registers_except(RegSet(), save_fpu);
 549   }
 550   void pop_call_clobbered_registers(bool restore_fpu = true) {
 551     pop_call_clobbered_registers_except(RegSet(), restore_fpu);
 552   }
 553 
 554   // allocation
 555 
 556   // Object / value buffer allocation...
 557   // Allocate an instance of klass; assumes klass was initialized by the caller
 558   // new_obj prefers to be rax
 559   // Kills t1 and t2; preserves klass; returns the allocation in new_obj (rsi on LP64)
 560   void allocate_instance(Register klass, Register new_obj,
 561                          Register t1, Register t2,
 562                          bool clear_fields, Label& alloc_failed);
 563 
 564   void tlab_allocate(
 565     Register obj,                      // result: pointer to object after successful allocation
 566     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
 567     int      con_size_in_bytes,        // object size in bytes if   known at compile time
 568     Register t1,                       // temp register
 569     Register t2,                       // temp register
 570     Label&   slow_case                 // continuation point if fast allocation fails
 571   );
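       // Conceptual sketch of the fast path (C-like pseudocode for orientation only):
       //
       //   obj = thread->tlab_top();
       //   end = obj + size_in_bytes;
       //   if (end > thread->tlab_end()) goto slow_case;
       //   thread->set_tlab_top(end);        // bump-pointer allocation
       //
       // i.e. a couple of loads, an add, a compare and a store; TLAB refill, heap
       // allocation and GC all live behind slow_case.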
 572   void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
 573 
 574   // For field "index" within "klass", return inline_klass ...
 575   void get_inline_type_field_klass(Register klass, Register index, Register inline_klass);
 576 
 577   void inline_layout_info(Register klass, Register index, Register layout_info);
 578 
 579   void population_count(Register dst, Register src, Register scratch1, Register scratch2);
 580 
 581   // interface method calling
 582   void lookup_interface_method(Register recv_klass,
 583                                Register intf_klass,
 584                                RegisterOrConstant itable_index,
 585                                Register method_result,
 586                                Register scan_temp,
 587                                Label& no_such_interface,
 588                                bool return_method = true);
 589 
 590   void lookup_interface_method_stub(Register recv_klass,
 591                                     Register holder_klass,
 592                                     Register resolved_klass,
 593                                     Register method_result,
 594                                     Register scan_temp,
 595                                     Register temp_reg2,
 596                                     Register receiver,
 597                                     int itable_index,
 598                                     Label& L_no_such_interface);
 599 
 600   // virtual method calling
 601   void lookup_virtual_method(Register recv_klass,
 602                              RegisterOrConstant vtable_index,
 603                              Register method_result);
 604 
 605   // Test sub_klass against super_klass, with fast and slow paths.
 606 
 607   // The fast path produces a tri-state answer: yes / no / maybe-slow.
 608   // One of the three labels can be null, meaning take the fall-through.
 609   // If super_check_offset is -1, the value is loaded up from super_klass.
 610   // No registers are killed, except temp_reg.
 611   void check_klass_subtype_fast_path(Register sub_klass,
 612                                      Register super_klass,
 613                                      Register temp_reg,
 614                                      Label* L_success,
 615                                      Label* L_failure,
 616                                      Label* L_slow_path,
 617                 RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
 618 
 619   // The rest of the type check; must be wired to a corresponding fast path.
 620   // It does not repeat the fast path logic, so don't use it standalone.
 621   // The temp_reg and temp2_reg can be noreg, if no temps are available.
 622   // Updates the sub's secondary super cache as necessary.
 623   // If set_cond_codes, condition codes will be Z on success, NZ on failure.
 624   void check_klass_subtype_slow_path(Register sub_klass,
 625                                      Register super_klass,
 626                                      Register temp_reg,
 627                                      Register temp2_reg,
 628                                      Label* L_success,
 629                                      Label* L_failure,
 630                                      bool set_cond_codes = false);
 631 
 632   // The 64-bit version, which may do a hashed subclass lookup.
 633   void check_klass_subtype_slow_path(Register sub_klass,
 634                                      Register super_klass,
 635                                      Register temp_reg,
 636                                      Register temp2_reg,
 637                                      Register temp3_reg,
 638                                      Register temp4_reg,
 639                                      Label* L_success,
 640                                      Label* L_failure);
 641 
 642   // Three parts of a hashed subclass lookup: a simple linear search,
 643   // a table lookup, and a fallback that does linear probing in the
 644   // event of a hash collision.
 645   void check_klass_subtype_slow_path_linear(Register sub_klass,
 646                                             Register super_klass,
 647                                             Register temp_reg,
 648                                             Register temp2_reg,
 649                                             Label* L_success,
 650                                             Label* L_failure,
 651                                             bool set_cond_codes = false);
 652   void check_klass_subtype_slow_path_table(Register sub_klass,
 653                                            Register super_klass,
 654                                            Register temp_reg,
 655                                            Register temp2_reg,
 656                                            Register temp3_reg,
 657                                            Register result_reg,
 658                                            Label* L_success,
 659                                            Label* L_failure);
 660   void hashed_check_klass_subtype_slow_path(Register sub_klass,
 661                                             Register super_klass,
 662                                             Register temp_reg,
 663                                             Label* L_success,
 664                                             Label* L_failure);
 665 
 666   // As above, but with a constant super_klass.
 667   // The result is in Register result, not the condition codes.
 668   void lookup_secondary_supers_table_const(Register sub_klass,
 669                                            Register super_klass,
 670                                            Register temp1,
 671                                            Register temp2,
 672                                            Register temp3,
 673                                            Register temp4,
 674                                            Register result,
 675                                            u1 super_klass_slot);
 676 
 677   using Assembler::salq;
 678   void salq(Register dest, Register count);
 679   using Assembler::rorq;
 680   void rorq(Register dest, Register count);
 681   void lookup_secondary_supers_table_var(Register sub_klass,
 682                                          Register super_klass,
 683                                          Register temp1,
 684                                          Register temp2,
 685                                          Register temp3,
 686                                          Register temp4,
 687                                          Register result);
 688 
 689   void lookup_secondary_supers_table_slow_path(Register r_super_klass,
 690                                                Register r_array_base,
 691                                                Register r_array_index,
 692                                                Register r_bitmap,
 693                                                Register temp1,
 694                                                Register temp2,
 695                                                Label* L_success,
 696                                                Label* L_failure = nullptr);
 697 
 698   void verify_secondary_supers_table(Register r_sub_klass,
 699                                      Register r_super_klass,
 700                                      Register expected,
 701                                      Register temp1,
 702                                      Register temp2,
 703                                      Register temp3);
 704 
 705   void repne_scanq(Register addr, Register value, Register count, Register limit,
 706                    Label* L_success,
 707                    Label* L_failure = nullptr);
 708 
 709   // If r is valid, return r.
 710   // If r is invalid, remove a register r2 from available_regs, add r2
 711   // to regs_to_push, then return r2.
 712   Register allocate_if_noreg(const Register r,
 713                              RegSetIterator<Register> &available_regs,
 714                              RegSet &regs_to_push);
 715 
 716   // Simplified, combined version, good for typical uses.
 717   // Falls through on failure.
 718   void check_klass_subtype(Register sub_klass,
 719                            Register super_klass,
 720                            Register temp_reg,
 721                            Label& L_success);
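       // Sketch of how the two pieces are typically wired together (illustrative):
       //
       //   Label ok, not_ok;
       //   check_klass_subtype_fast_path(sub, super, tmp, &ok, &not_ok, nullptr);
       //   // fall through here means "maybe": ask the slow path
       //   check_klass_subtype_slow_path(sub, super, tmp, noreg, &ok, nullptr);
       //   bind(not_ok);   // slow-path failure also falls through to here
       //   ... failure code ...
       //   bind(ok);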
 722 
 723   void clinit_barrier(Register klass,
 724                       Label* L_fast_path = nullptr,
 725                       Label* L_slow_path = nullptr);
 726 
 727   // method handles (JSR 292)
 728   Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
 729 
 730   // Debugging
 731 
 732   // only if +VerifyOops
 733   void _verify_oop(Register reg, const char* s, const char* file, int line);
 734   void _verify_oop_addr(Address addr, const char* s, const char* file, int line);
 735 
 736   void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
 737     if (VerifyOops) {
 738       _verify_oop(reg, s, file, line);
 739     }
 740   }
 741   void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
 742     if (VerifyOops) {
 743       _verify_oop_addr(reg, s, file, line);
 744     }
 745   }
 746 
 747   // TODO: verify method and klass metadata (compare against vptr?)
 748   void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
 749   void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}
 750 
 751 #define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
 752 #define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
 753 #define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
 754 #define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
 755 #define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
 756 
 757   // Verify or restore cpu control state after JNI call
 758   void restore_cpu_control_state_after_jni(Register rscratch);
 759 
 760   // prints msg, dumps registers and stops execution
 761   void stop(const char* msg);
 762 
 763   // prints msg and continues
 764   void warn(const char* msg);
 765 
 766   // dumps registers and other state
 767   void print_state();
 768 
 769   static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
 770   static void debug64(char* msg, int64_t pc, int64_t regs[]);
 771   static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
 772   static void print_state64(int64_t pc, int64_t regs[]);
 773 
 774   void os_breakpoint();
 775 
 776   void untested()                                { stop("untested"); }
 777 
 778   void unimplemented(const char* what = "");
 779 
 780   void should_not_reach_here()                   { stop("should not reach here"); }
 781 
 782   void print_CPU_state();
 783 
 784   // Stack overflow checking
 785   void bang_stack_with_offset(int offset) {
 786     // stack grows down, caller passes positive offset
 787     assert(offset > 0, "must bang with negative offset");
 788     movl(Address(rsp, (-offset)), rax);
 789   }
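       // Usage sketch (illustrative): a caller that wants to touch every page of a
       // new frame plus the shadow zone might emit, assuming a page-size helper:
       //
       //   for (int off = page_size; off <= total_bang_size; off += page_size) {
       //     bang_stack_with_offset(off);
       //   }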
 790 
 791   // Writes to successive stack pages, until the given offset is reached, to check for
 792   // stack overflow + shadow pages. Also clobbers tmp.
 793   void bang_stack_size(Register size, Register tmp);
 794 
 795   // Check for reserved stack access in method being exited (for JIT)
 796   void reserved_stack_check();
 797 
 798   void safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod);
 799 
 800   void verify_tlab();
 801 
 802   static Condition negate_condition(Condition cond);
 803 
 804   // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
 805   // operands. In general the names are modified to avoid hiding the instruction in Assembler,
 806   // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
 807   // here in MacroAssembler. The major exception to this rule is call.
 808 
 809   // Arithmetic
 810 
 811 
 812   void addptr(Address dst, int32_t src) { addq(dst, src); }
 813   void addptr(Address dst, Register src);
 814 
 815   void addptr(Register dst, Address src) { addq(dst, src); }
 816   void addptr(Register dst, int32_t src);
 817   void addptr(Register dst, Register src);
 818   void addptr(Register dst, RegisterOrConstant src) {
 819     if (src.is_constant()) addptr(dst, checked_cast<int>(src.as_constant()));
 820     else                   addptr(dst, src.as_register());
 821   }
 822 
 823   void andptr(Register dst, int32_t src);
 824   void andptr(Register src1, Register src2) { andq(src1, src2); }
 825   void andptr(Register dst, Address src) { andq(dst, src); }
 826 
 827   using Assembler::andq;
 828   void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
 829 
 830   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 831 
 832   // renamed to drag out the casting of address to int32_t/intptr_t
 833   void cmp32(Register src1, int32_t imm);
 834 
 835   void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
 836   // compare reg - mem, or reg - &mem
 837   void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);
 838 
 839   void cmp32(Register src1, Address src2);
 840 
 841   void cmpoop(Register src1, Register src2);
 842   void cmpoop(Register src1, Address src2);
 843   void cmpoop(Register dst, jobject obj, Register rscratch);
 844 
 845   // NOTE: src2 must be the lval. This is NOT a mem-mem compare.
 846   void cmpptr(Address src1, AddressLiteral src2, Register rscratch);
 847 
 848   void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);
 849 
 850   void cmpptr(Register src1, Register src2) { cmpq(src1, src2); }
 851   void cmpptr(Register src1, Address src2) { cmpq(src1, src2); }
 852 
 853   void cmpptr(Register src1, int32_t src2) { cmpq(src1, src2); }
 854   void cmpptr(Address src1, int32_t src2) { cmpq(src1, src2); }
 855 
 856   // cmp64 to avoid hiding cmpq
 857   void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);
 858 
 859   void cmpxchgptr(Register reg, Address adr);
 860 
 861   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 862 
 863   void imulptr(Register dst, Register src) { imulq(dst, src); }
 864   void imulptr(Register dst, Register src, int imm32) { imulq(dst, src, imm32); }
 865 
 866 
 867   void negptr(Register dst) { negq(dst); }
 868 
 869   void notptr(Register dst) { notq(dst); }
 870 
 871   void shlptr(Register dst, int32_t shift);
 872   void shlptr(Register dst) { shlq(dst); }
 873 
 874   void shrptr(Register dst, int32_t shift);
 875   void shrptr(Register dst) { shrq(dst); }
 876 
 877   void sarptr(Register dst) { sarq(dst); }
 878   void sarptr(Register dst, int32_t src) { sarq(dst, src); }
 879 
 880   void subptr(Address dst, int32_t src) { subq(dst, src); }
 881 
 882   void subptr(Register dst, Address src) { subq(dst, src); }
 883   void subptr(Register dst, int32_t src);
 884   // Force generation of a 4 byte immediate value even if it fits into 8 bits
 885   void subptr_imm32(Register dst, int32_t src);
 886   void subptr(Register dst, Register src);
 887   void subptr(Register dst, RegisterOrConstant src) {
 888     if (src.is_constant()) subptr(dst, (int) src.as_constant());
 889     else                   subptr(dst,       src.as_register());
 890   }
 891 
 892   void sbbptr(Address dst, int32_t src) { sbbq(dst, src); }
 893   void sbbptr(Register dst, int32_t src) { sbbq(dst, src); }
 894 
 895   void xchgptr(Register src1, Register src2) { xchgq(src1, src2); }
 896   void xchgptr(Register src1, Address src2) { xchgq(src1, src2); }
 897 
 898   void xaddptr(Address src1, Register src2) { xaddq(src1, src2); }
 899 
 900 
 901 
 902   // Helper functions for statistics gathering.
 903   // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
 904   void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
 905   // Unconditional atomic increment.
 906   void atomic_incl(Address counter_addr);
 907   void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
 908   void atomic_incq(Address counter_addr);
 909   void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
 910   void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { atomic_incq(counter_addr, rscratch); }
 911   void atomic_incptr(Address counter_addr) { atomic_incq(counter_addr); }
 912 
 913   using Assembler::lea;
 914   void lea(Register dst, AddressLiteral adr);
 915   void lea(Address  dst, AddressLiteral adr, Register rscratch);
 916 
 917   void leal32(Register dst, Address src) { leal(dst, src); }
 918 
 919   // Import other testl() methods from the parent class or else
 920   // they will be hidden by the following overriding declaration.
 921   using Assembler::testl;
 922   void testl(Address dst, int32_t imm32);
 923   void testl(Register dst, int32_t imm32);
 924   void testl(Register dst, AddressLiteral src); // requires reachable address
 925   using Assembler::testq;
 926   void testq(Address dst, int32_t imm32);
 927   void testq(Register dst, int32_t imm32);
 928 
 929   void orptr(Register dst, Address src) { orq(dst, src); }
 930   void orptr(Register dst, Register src) { orq(dst, src); }
 931   void orptr(Register dst, int32_t src) { orq(dst, src); }
 932   void orptr(Address dst, int32_t imm32) { orq(dst, imm32); }
 933 
 934   void testptr(Register src, int32_t imm32) { testq(src, imm32); }
 935   void testptr(Register src1, Address src2) { testq(src1, src2); }
 936   void testptr(Address src, int32_t imm32) { testq(src, imm32); }
 937   void testptr(Register src1, Register src2);
 938 
 939   void xorptr(Register dst, Register src) { xorq(dst, src); }
 940   void xorptr(Register dst, Address src) { xorq(dst, src); }
 941 
 942   // Calls
 943 
 944   void call(Label& L, relocInfo::relocType rtype);
 945   void call(Register entry);
 946   void call(Address addr) { Assembler::call(addr); }
 947 
 948   // NOTE: this call transfers to the effective address of entry, NOT
 949   // the address contained at entry, because that is more natural
 950   // for jumps/calls.
 951   void call(AddressLiteral entry, Register rscratch = rax);
 952 
 953   // Emit the CompiledIC call idiom
 954   void ic_call(address entry, jint method_index = 0);
 955   static int ic_check_size();
 956   int ic_check(int end_alignment);
 957 
 958   void emit_static_call_stub();
 959 
 960   // Jumps
 961 
 962   // NOTE: these jumps transfer to the effective address of dst, NOT
 963   // the address contained at dst, because that is more natural
 964   // for jumps/calls.
 965   void jump(AddressLiteral dst, Register rscratch = noreg);
 966 
 967   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
 968 
 969   // 32bit can do a case table jump in one instruction but we no longer allow the base
 970   // to be installed in the Address class. This jump will transfer to the address
 971   // contained in the location described by entry (not the address of entry)
 972   void jump(ArrayAddress entry, Register rscratch);
 973 
 974   // Adding more natural conditional jump instructions
 975   void ALWAYSINLINE jo(Label& L, bool maybe_short = true) { jcc(Assembler::overflow, L, maybe_short); }
 976   void ALWAYSINLINE jno(Label& L, bool maybe_short = true) { jcc(Assembler::noOverflow, L, maybe_short); }
 977   void ALWAYSINLINE js(Label& L, bool maybe_short = true) { jcc(Assembler::negative, L, maybe_short); }
 978   void ALWAYSINLINE jns(Label& L, bool maybe_short = true) { jcc(Assembler::positive, L, maybe_short); }
 979   void ALWAYSINLINE je(Label& L, bool maybe_short = true) { jcc(Assembler::equal, L, maybe_short); }
 980   void ALWAYSINLINE jz(Label& L, bool maybe_short = true) { jcc(Assembler::zero, L, maybe_short); }
 981   void ALWAYSINLINE jne(Label& L, bool maybe_short = true) { jcc(Assembler::notEqual, L, maybe_short); }
 982   void ALWAYSINLINE jnz(Label& L, bool maybe_short = true) { jcc(Assembler::notZero, L, maybe_short); }
 983   void ALWAYSINLINE jb(Label& L, bool maybe_short = true) { jcc(Assembler::below, L, maybe_short); }
 984   void ALWAYSINLINE jnae(Label& L, bool maybe_short = true) { jcc(Assembler::below, L, maybe_short); }
 985   void ALWAYSINLINE jc(Label& L, bool maybe_short = true) { jcc(Assembler::carrySet, L, maybe_short); }
 986   void ALWAYSINLINE jnb(Label& L, bool maybe_short = true) { jcc(Assembler::aboveEqual, L, maybe_short); }
 987   void ALWAYSINLINE jae(Label& L, bool maybe_short = true) { jcc(Assembler::aboveEqual, L, maybe_short); }
 988   void ALWAYSINLINE jnc(Label& L, bool maybe_short = true) { jcc(Assembler::carryClear, L, maybe_short); }
 989   void ALWAYSINLINE jbe(Label& L, bool maybe_short = true) { jcc(Assembler::belowEqual, L, maybe_short); }
 990   void ALWAYSINLINE jna(Label& L, bool maybe_short = true) { jcc(Assembler::belowEqual, L, maybe_short); }
 991   void ALWAYSINLINE ja(Label& L, bool maybe_short = true) { jcc(Assembler::above, L, maybe_short); }
 992   void ALWAYSINLINE jnbe(Label& L, bool maybe_short = true) { jcc(Assembler::above, L, maybe_short); }
 993   void ALWAYSINLINE jl(Label& L, bool maybe_short = true) { jcc(Assembler::less, L, maybe_short); }
 994   void ALWAYSINLINE jnge(Label& L, bool maybe_short = true) { jcc(Assembler::less, L, maybe_short); }
 995   void ALWAYSINLINE jge(Label& L, bool maybe_short = true) { jcc(Assembler::greaterEqual, L, maybe_short); }
 996   void ALWAYSINLINE jnl(Label& L, bool maybe_short = true) { jcc(Assembler::greaterEqual, L, maybe_short); }
 997   void ALWAYSINLINE jle(Label& L, bool maybe_short = true) { jcc(Assembler::lessEqual, L, maybe_short); }
 998   void ALWAYSINLINE jng(Label& L, bool maybe_short = true) { jcc(Assembler::lessEqual, L, maybe_short); }
 999   void ALWAYSINLINE jg(Label& L, bool maybe_short = true) { jcc(Assembler::greater, L, maybe_short); }
1000   void ALWAYSINLINE jnle(Label& L, bool maybe_short = true) { jcc(Assembler::greater, L, maybe_short); }
1001   void ALWAYSINLINE jp(Label& L, bool maybe_short = true) { jcc(Assembler::parity, L, maybe_short); }
1002   void ALWAYSINLINE jpe(Label& L, bool maybe_short = true) { jcc(Assembler::parity, L, maybe_short); }
1003   void ALWAYSINLINE jnp(Label& L, bool maybe_short = true) { jcc(Assembler::noParity, L, maybe_short); }
1004   void ALWAYSINLINE jpo(Label& L, bool maybe_short = true) { jcc(Assembler::noParity, L, maybe_short); }
1005   // * No condition for this *  void ALWAYSINLINE jcxz(Label& L, bool maybe_short = true) { jcc(Assembler::cxz, L, maybe_short); }
1006   // * No condition for this *  void ALWAYSINLINE jecxz(Label& L, bool maybe_short = true) { jcc(Assembler::cxz, L, maybe_short); }
1007 
1008   // Short versions of the above
1009   void ALWAYSINLINE jo_b(Label& L) { jccb(Assembler::overflow, L); }
1010   void ALWAYSINLINE jno_b(Label& L) { jccb(Assembler::noOverflow, L); }
1011   void ALWAYSINLINE js_b(Label& L) { jccb(Assembler::negative, L); }
1012   void ALWAYSINLINE jns_b(Label& L) { jccb(Assembler::positive, L); }
1013   void ALWAYSINLINE je_b(Label& L) { jccb(Assembler::equal, L); }
1014   void ALWAYSINLINE jz_b(Label& L) { jccb(Assembler::zero, L); }
1015   void ALWAYSINLINE jne_b(Label& L) { jccb(Assembler::notEqual, L); }
1016   void ALWAYSINLINE jnz_b(Label& L) { jccb(Assembler::notZero, L); }
1017   void ALWAYSINLINE jb_b(Label& L) { jccb(Assembler::below, L); }
1018   void ALWAYSINLINE jnae_b(Label& L) { jccb(Assembler::below, L); }
1019   void ALWAYSINLINE jc_b(Label& L) { jccb(Assembler::carrySet, L); }
1020   void ALWAYSINLINE jnb_b(Label& L) { jccb(Assembler::aboveEqual, L); }
1021   void ALWAYSINLINE jae_b(Label& L) { jccb(Assembler::aboveEqual, L); }
1022   void ALWAYSINLINE jnc_b(Label& L) { jccb(Assembler::carryClear, L); }
1023   void ALWAYSINLINE jbe_b(Label& L) { jccb(Assembler::belowEqual, L); }
1024   void ALWAYSINLINE jna_b(Label& L) { jccb(Assembler::belowEqual, L); }
1025   void ALWAYSINLINE ja_b(Label& L) { jccb(Assembler::above, L); }
1026   void ALWAYSINLINE jnbe_b(Label& L) { jccb(Assembler::above, L); }
1027   void ALWAYSINLINE jl_b(Label& L) { jccb(Assembler::less, L); }
1028   void ALWAYSINLINE jnge_b(Label& L) { jccb(Assembler::less, L); }
1029   void ALWAYSINLINE jge_b(Label& L) { jccb(Assembler::greaterEqual, L); }
1030   void ALWAYSINLINE jnl_b(Label& L) { jccb(Assembler::greaterEqual, L); }
1031   void ALWAYSINLINE jle_b(Label& L) { jccb(Assembler::lessEqual, L); }
1032   void ALWAYSINLINE jng_b(Label& L) { jccb(Assembler::lessEqual, L); }
1033   void ALWAYSINLINE jg_b(Label& L) { jccb(Assembler::greater, L); }
1034   void ALWAYSINLINE jnle_b(Label& L) { jccb(Assembler::greater, L); }
1035   void ALWAYSINLINE jp_b(Label& L) { jccb(Assembler::parity, L); }
1036   void ALWAYSINLINE jpe_b(Label& L) { jccb(Assembler::parity, L); }
1037   void ALWAYSINLINE jnp_b(Label& L) { jccb(Assembler::noParity, L); }
1038   void ALWAYSINLINE jpo_b(Label& L) { jccb(Assembler::noParity, L); }
1039   // * No condition for this *  void ALWAYSINLINE jcxz_b(Label& L) { jccb(Assembler::cxz, L); }
1040   // * No condition for this *  void ALWAYSINLINE jecxz_b(Label& L) { jccb(Assembler::cxz, L); }
1041 
1042   // Floating
1043 
1044   void push_f(XMMRegister r);
1045   void pop_f(XMMRegister r);
1046   void push_d(XMMRegister r);
1047   void pop_d(XMMRegister r);
1048 
1049   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
1050   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
1051   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1052 
1053   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
1054   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
1055   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1056 
1057   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
1058   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
1059   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1060 
1061   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
1062   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
1063   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1064 
1065   void cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch = noreg);
1066   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
1067   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
1068 
1069  private:
1070   void sha256_AVX2_one_round_compute(
1071     Register  reg_old_h,
1072     Register  reg_a,
1073     Register  reg_b,
1074     Register  reg_c,
1075     Register  reg_d,
1076     Register  reg_e,
1077     Register  reg_f,
1078     Register  reg_g,
1079     Register  reg_h,
1080     int iter);
1081   void sha256_AVX2_four_rounds_compute_first(int start);
1082   void sha256_AVX2_four_rounds_compute_last(int start);
1083   void sha256_AVX2_one_round_and_sched(
1084         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
1085         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
1086         XMMRegister xmm_2,     /* ymm6 */
1087         XMMRegister xmm_3,     /* ymm7 */
1088         Register    reg_a,      /* == eax on iteration 0, then rotate 8 registers right on each following iteration */
1089         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
1090         Register    reg_c,      /* edi */
1091         Register    reg_d,      /* esi */
1092         Register    reg_e,      /* r8d */
1093         Register    reg_f,      /* r9d */
1094         Register    reg_g,      /* r10d */
1095         Register    reg_h,      /* r11d */
1096         int iter);
1097 
1098   void addm(int disp, Register r1, Register r2);
1099 
1100   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1101                                      Register e, Register f, Register g, Register h, int iteration);
1102 
1103   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1104                                           Register a, Register b, Register c, Register d, Register e, Register f,
1105                                           Register g, Register h, int iteration);
1106 
1107   void addmq(int disp, Register r1, Register r2);
1108  public:
1109   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1110                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1111                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1112                    bool multi_block, XMMRegister shuf_mask);
1113   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1114                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1115                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1116                    XMMRegister shuf_mask);
1117   void sha512_update_ni_x1(Register arg_hash, Register arg_msg, Register ofs, Register limit, bool multi_block);
1118 
1119   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1120                 bool multi_block);
1121 
1122   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1123                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1124                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1125                  bool multi_block);
1126 
1127   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1128                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1129                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1130                    bool multi_block, XMMRegister shuf_mask);
1131 
1132   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1133                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1134                 Register rax, Register rcx, Register rdx, Register tmp);
1135 
1136 private:
1137 
1138   // these are private because users should be doing movflt/movdbl
1139 
1140   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1141   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1142   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1143   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1144 
1145   void movlpd(XMMRegister dst, Address        src) { Assembler::movlpd(dst, src); }
1146   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1147 
1148 public:
1149 
1150   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1151   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1152   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1153 
1154   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1155   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1156   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1157 
1158   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1159   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1160   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1161 
1162   using Assembler::vbroadcasti128;
1163   void vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1164 
1165   using Assembler::vbroadcastsd;
1166   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1167 
1168   using Assembler::vbroadcastss;
1169   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1170 
1171   // Vector float blend
1172   void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1173   void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1174 
1175   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1176   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1177   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1178 
1179   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1180   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1181   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1182 
1183   // Move Unaligned Double Quadword
1184   void movdqu(Address     dst, XMMRegister    src);
1185   void movdqu(XMMRegister dst, XMMRegister    src);
1186   void movdqu(XMMRegister dst, Address        src);
1187   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1188 
1189   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1190   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1191   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1192   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1193   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1194   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1195 
1196   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1197   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1198   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1199   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1200   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1201   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1202 
1203   // Safe move operation: lowers to a 16-bit mask move on targets supporting only the
1204   // AVX512F feature, and to a 64-bit mask move on targets that also support AVX512BW.
1205   void kmov(Address  dst, KRegister src);
1206   void kmov(KRegister dst, Address src);
1207   void kmov(KRegister dst, KRegister src);
1208   void kmov(Register dst, KRegister src);
1209   void kmov(KRegister dst, Register src);
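  // Illustrative usage sketch (not part of this interface; register choices are hypothetical):
  // a stub that must preserve a mask register across clobbering code can let kmov() pick the
  // widest encodable move for the current CPU:
  //
  //   masm->kmov(rbx, k1);   // kmovwl on AVX512F-only CPUs, kmovql when AVX512BW is present
  //   // (code that clobbers k1)
  //   masm->kmov(k1, rbx);   // restore with the matching width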
1210 
1211   using Assembler::movddup;
1212   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1213 
1214   using Assembler::vmovddup;
1215   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1216 
1217   // AVX Unaligned forms
1218   void vmovdqu(Address     dst, XMMRegister    src);
1219   void vmovdqu(XMMRegister dst, Address        src);
1220   void vmovdqu(XMMRegister dst, XMMRegister    src);
1221   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1222   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1223   void vmovdqu(XMMRegister dst, XMMRegister    src, int vector_len);
1224   void vmovdqu(XMMRegister dst, Address        src, int vector_len);
1225   void vmovdqu(Address     dst, XMMRegister    src, int vector_len);
1226 
1227   // AVX Aligned forms
1228   using Assembler::vmovdqa;
1229   void vmovdqa(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1230   void vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1231 
1232   // AVX512 Unaligned
1233   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1234   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1235   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len);
1236 
1237   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1238   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1239 
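  // The register-to-register evmovdq* wrappers below elide the move when it would be a no-op,
  // i.e. when source and destination are the same register and (for the masked forms) no real
  // mask is supplied (k0).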
1240   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1241     if (dst->encoding() != src->encoding() || mask != k0)  {
1242       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1243     }
1244   }
1245   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1246   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1247   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1248 
1249   void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1250   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1251   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1252 
1253   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1254     if (dst->encoding() != src->encoding() || mask != k0) {
1255       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1256     }
1257   }
1258   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1259   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1260   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1261 
1262   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1263      if (dst->encoding() != src->encoding()) {
1264        Assembler::evmovdqul(dst, src, vector_len);
1265      }
1266   }
1267   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1268   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1269 
1270   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1271     if (dst->encoding() != src->encoding() || mask != k0)  {
1272       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1273     }
1274   }
1275   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1276   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1277   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1278 
1279   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1280     if (dst->encoding() != src->encoding()) {
1281       Assembler::evmovdquq(dst, src, vector_len);
1282     }
1283   }
1284   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1285   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1286   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1287   void evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1288 
1289   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1290     if (dst->encoding() != src->encoding() || mask != k0) {
1291       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1292     }
1293   }
1294   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1295   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1296   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1297   void evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1298 
1299   // Move Aligned Double Quadword
1300   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1301   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1302   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1303 
1304   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1305   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1306   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1307   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1308 
1309   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1310   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1311   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1312 
1313   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1314   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1315   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1316 
1317   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1318   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1319   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1320 
1321   // Carry-Less Multiplication Quadword
1322   void pclmulldq(XMMRegister dst, XMMRegister src) {
1323     // 0x00 - multiply lower 64 bits [0:63]
1324     Assembler::pclmulqdq(dst, src, 0x00);
1325   }
1326   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1327     // 0x11 - multiply upper 64 bits [64:127]
1328     Assembler::pclmulqdq(dst, src, 0x11);
1329   }
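  // Note on the imm8 selector: bit 0 picks the quadword of the first (dst) operand and bit 4
  // picks the quadword of the second (src) operand, so the four partial products of a full
  // 128x128-bit carry-less multiply are selected by 0x00, 0x01, 0x10 and 0x11. Because the
  // legacy SSE form overwrites dst, a caller typically works on copies. Illustrative sketch
  // (hypothetical register assignment, with a in xmm0 and b in xmm1):
  //
  //   movdqa(xmm2, xmm0);  pclmulldq(xmm2, xmm1);                    // lo(a) * lo(b)
  //   movdqa(xmm3, xmm0);  pclmulhdq(xmm3, xmm1);                    // hi(a) * hi(b)
  //   movdqa(xmm4, xmm0);  Assembler::pclmulqdq(xmm4, xmm1, 0x10);   // lo(a) * hi(b)
  //   movdqa(xmm5, xmm0);  Assembler::pclmulqdq(xmm5, xmm1, 0x01);   // hi(a) * lo(b)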
1330 
1331   void pcmpeqb(XMMRegister dst, XMMRegister src);
1332   void pcmpeqw(XMMRegister dst, XMMRegister src);
1333 
1334   void pcmpestri(XMMRegister dst, Address src, int imm8);
1335   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1336 
1337   void pmovzxbw(XMMRegister dst, XMMRegister src);
1338   void pmovzxbw(XMMRegister dst, Address src);
1339 
1340   void pmovmskb(Register dst, XMMRegister src);
1341 
1342   void ptest(XMMRegister dst, XMMRegister src);
1343 
1344   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1345   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1346   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1347 
1348   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1349   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1350   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1351 
1352   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1353   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1354   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1355 
1356   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1357   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1358   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1359 
1360   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1361   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1362   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1363 
1364   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1365   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1366   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1367 
1368   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1369   void xorpd(XMMRegister dst, XMMRegister    src);
1370   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1371   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1372 
1373   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1374   void xorps(XMMRegister dst, XMMRegister    src);
1375   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1376   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1377 
1378   // Shuffle Bytes
1379   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1380   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
1381   void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1382   // AVX 3-operand instructions
1383 
1384   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1385   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1386   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1387 
1388   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1389   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1390   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1391 
1392   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1393   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1394 
1395   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1396   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1397   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1398 
1399   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1400   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1401 
1402   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1403   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1404   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1405 
1406   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1407   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1408   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1409 
1410   using Assembler::vpbroadcastd;
1411   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1412 
1413   using Assembler::vpbroadcastq;
1414   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1415 
1416   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1417   void vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
1418 
1419   void vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1420   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1421   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1422 
1423   // Vector compares
1424   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1425     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1426   }
1427   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1428 
1429   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1430     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1431   }
1432   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1433 
1434   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1435     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1436   }
1437   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1438 
1439   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1440     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1441   }
1442   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1443 
1444   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1445 
1446   // Emit comparison instruction for the specified comparison predicate.
1447   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1448   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1449 
1450   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1451   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1452 
1453   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1454 
1455   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1456   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1457 
1458   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1459   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1460   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1461 
1462   void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmuldq(dst, nds, src, vector_len); }
1463 
1464   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1465   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1466 
1467   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1468   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1469 
1470   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1471   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1472 
1473   void evpsrad(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1474   void evpsrad(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1475 
1476   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1477   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1478 
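  // The masked shift wrappers below take the shift count in an XMM register; 'is_varshift'
  // selects between the uniform-count form (the scalar count in the low bits of 'src' is
  // applied to every element) and the per-element variable-shift form (evpsllv*/evpsrlv*/
  // evpsrav*).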
1479   using Assembler::evpsllw;
1480   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1481     if (!is_varshift) {
1482       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1483     } else {
1484       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1485     }
1486   }
1487   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1488     if (!is_varshift) {
1489       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1490     } else {
1491       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1492     }
1493   }
1494   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1495     if (!is_varshift) {
1496       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1497     } else {
1498       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1499     }
1500   }
1501   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1502     if (!is_varshift) {
1503       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1504     } else {
1505       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1506     }
1507   }
1508   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1509     if (!is_varshift) {
1510       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1511     } else {
1512       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1513     }
1514   }
1515 
1516   using Assembler::evpsrlq;
1517   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1518     if (!is_varshift) {
1519       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1520     } else {
1521       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1522     }
1523   }
1524   using Assembler::evpsraw;
1525   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1526     if (!is_varshift) {
1527       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1528     } else {
1529       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1530     }
1531   }
1532   using Assembler::evpsrad;
1533   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1534     if (!is_varshift) {
1535       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1536     } else {
1537       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1538     }
1539   }
1540   using Assembler::evpsraq;
1541   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1542     if (!is_varshift) {
1543       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1544     } else {
1545       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1546     }
1547   }
1548 
1549   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1550   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1551   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1552   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1553 
1554   void evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1555   void evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1556   void evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1557   void evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1558 
1559   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1560   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1561 
1562   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1563   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1564 
1565   void vptest(XMMRegister dst, XMMRegister src);
1566   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1567 
1568   void punpcklbw(XMMRegister dst, XMMRegister src);
1569   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1570 
1571   void pshufd(XMMRegister dst, Address src, int mode);
1572   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1573 
1574   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1575   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1576 
1577   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1578   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1579   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1580 
1581   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1582   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1583   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1584 
1585   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1586 
1587   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1588   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1589   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1590 
1591   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1592   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1593   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1594 
1595   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1596   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1597   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1598 
1599   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1600   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1601   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1602 
1603   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1604   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1605   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1606 
1607   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1608   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1609   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1610 
1611   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1612   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1613 
1614   // AVX Vector instructions
1615 
1616   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1617   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1618   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1619 
1620   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1621   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1622   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1623 
1624   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1625     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1626       Assembler::vpxor(dst, nds, src, vector_len);
1627     else
1628       Assembler::vxorpd(dst, nds, src, vector_len);
1629   }
1630   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1631     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1632       Assembler::vpxor(dst, nds, src, vector_len);
1633     else
1634       Assembler::vxorpd(dst, nds, src, vector_len);
1635   }
1636   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
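  // Illustrative note: a 256-bit integer vpxor is only encodable with AVX2, so on AVX1-only
  // targets the three-operand wrappers above substitute the FP-domain vxorpd, which produces
  // the same bit pattern (at worst with a data-domain bypass penalty).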
1637 
1638   // Simple version for AVX2 256bit vectors
1639   void vpxor(XMMRegister dst, XMMRegister src) {
1640     assert(UseAVX >= 2, "Should be at least AVX2");
1641     Assembler::vpxor(dst, dst, src, AVX_256bit);
1642   }
1643   void vpxor(XMMRegister dst, Address src) {
1644     assert(UseAVX >= 2, "Should be at least AVX2");
1645     Assembler::vpxor(dst, dst, src, AVX_256bit);
1646   }
1647 
1648   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1649   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1650 
1651   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1652     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1653       Assembler::vinserti32x4(dst, nds, src, imm8);
1654     } else if (UseAVX > 1) {
1655       // vinserti128 is available only in AVX2
1656       Assembler::vinserti128(dst, nds, src, imm8);
1657     } else {
1658       Assembler::vinsertf128(dst, nds, src, imm8);
1659     }
1660   }
1661 
1662   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1663     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1664       Assembler::vinserti32x4(dst, nds, src, imm8);
1665     } else if (UseAVX > 1) {
1666       // vinserti128 is available only in AVX2
1667       Assembler::vinserti128(dst, nds, src, imm8);
1668     } else {
1669       Assembler::vinsertf128(dst, nds, src, imm8);
1670     }
1671   }
1672 
1673   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1674     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1675       Assembler::vextracti32x4(dst, src, imm8);
1676     } else if (UseAVX > 1) {
1677       // vextracti128 is available only in AVX2
1678       Assembler::vextracti128(dst, src, imm8);
1679     } else {
1680       Assembler::vextractf128(dst, src, imm8);
1681     }
1682   }
1683 
1684   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1685     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1686       Assembler::vextracti32x4(dst, src, imm8);
1687     } else if (UseAVX > 1) {
1688       // vextracti128 is available only in AVX2
1689       Assembler::vextracti128(dst, src, imm8);
1690     } else {
1691       Assembler::vextractf128(dst, src, imm8);
1692     }
1693   }
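  // Illustrative note: the vinserti128/vextracti128 wrappers above dispatch on the available
  // ISA level; on AVX1-only targets the integer forms are not encodable, so the FP-domain
  // vinsertf128/vextractf128 are substituted, which move the same 128 bits.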
1694 
1695   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1696   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1697     vinserti128(dst, dst, src, 1);
1698   }
1699   void vinserti128_high(XMMRegister dst, Address src) {
1700     vinserti128(dst, dst, src, 1);
1701   }
1702   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1703     vextracti128(dst, src, 1);
1704   }
1705   void vextracti128_high(Address dst, XMMRegister src) {
1706     vextracti128(dst, src, 1);
1707   }
1708 
1709   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1710     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1711       Assembler::vinsertf32x4(dst, dst, src, 1);
1712     } else {
1713       Assembler::vinsertf128(dst, dst, src, 1);
1714     }
1715   }
1716 
1717   void vinsertf128_high(XMMRegister dst, Address src) {
1718     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1719       Assembler::vinsertf32x4(dst, dst, src, 1);
1720     } else {
1721       Assembler::vinsertf128(dst, dst, src, 1);
1722     }
1723   }
1724 
1725   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1726     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1727       Assembler::vextractf32x4(dst, src, 1);
1728     } else {
1729       Assembler::vextractf128(dst, src, 1);
1730     }
1731   }
1732 
1733   void vextractf128_high(Address dst, XMMRegister src) {
1734     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1735       Assembler::vextractf32x4(dst, src, 1);
1736     } else {
1737       Assembler::vextractf128(dst, src, 1);
1738     }
1739   }
1740 
1741   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1742   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1743     Assembler::vinserti64x4(dst, dst, src, 1);
1744   }
1745   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1746     Assembler::vinsertf64x4(dst, dst, src, 1);
1747   }
1748   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1749     Assembler::vextracti64x4(dst, src, 1);
1750   }
1751   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1752     Assembler::vextractf64x4(dst, src, 1);
1753   }
1754   void vextractf64x4_high(Address dst, XMMRegister src) {
1755     Assembler::vextractf64x4(dst, src, 1);
1756   }
1757   void vinsertf64x4_high(XMMRegister dst, Address src) {
1758     Assembler::vinsertf64x4(dst, dst, src, 1);
1759   }
1760 
1761   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1762   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1763     vinserti128(dst, dst, src, 0);
1764   }
1765   void vinserti128_low(XMMRegister dst, Address src) {
1766     vinserti128(dst, dst, src, 0);
1767   }
1768   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1769     vextracti128(dst, src, 0);
1770   }
1771   void vextracti128_low(Address dst, XMMRegister src) {
1772     vextracti128(dst, src, 0);
1773   }
1774 
1775   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1776     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1777       Assembler::vinsertf32x4(dst, dst, src, 0);
1778     } else {
1779       Assembler::vinsertf128(dst, dst, src, 0);
1780     }
1781   }
1782 
1783   void vinsertf128_low(XMMRegister dst, Address src) {
1784     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1785       Assembler::vinsertf32x4(dst, dst, src, 0);
1786     } else {
1787       Assembler::vinsertf128(dst, dst, src, 0);
1788     }
1789   }
1790 
1791   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1792     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1793       Assembler::vextractf32x4(dst, src, 0);
1794     } else {
1795       Assembler::vextractf128(dst, src, 0);
1796     }
1797   }
1798 
1799   void vextractf128_low(Address dst, XMMRegister src) {
1800     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1801       Assembler::vextractf32x4(dst, src, 0);
1802     } else {
1803       Assembler::vextractf128(dst, src, 0);
1804     }
1805   }
1806 
1807   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1808   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1809     Assembler::vinserti64x4(dst, dst, src, 0);
1810   }
1811   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1812     Assembler::vinsertf64x4(dst, dst, src, 0);
1813   }
1814   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1815     Assembler::vextracti64x4(dst, src, 0);
1816   }
1817   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1818     Assembler::vextractf64x4(dst, src, 0);
1819   }
1820   void vextractf64x4_low(Address dst, XMMRegister src) {
1821     Assembler::vextractf64x4(dst, src, 0);
1822   }
1823   void vinsertf64x4_low(XMMRegister dst, Address src) {
1824     Assembler::vinsertf64x4(dst, dst, src, 0);
1825   }
1826 
1827   // Carry-Less Multiplication Quadword
1828   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1829     // 0x00 - multiply lower 64 bits [0:63]
1830     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1831   }
1832   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1833     // 0x11 - multiply upper 64 bits [64:127]
1834     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1835   }
1836   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1837     // 0x10 - multiply nds[0:63] and src[64:127]
1838     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1839   }
1840   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1841     // 0x01 - multiply nds[64:127] and src[0:63]
1842     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1843   }
1844 
1845   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1846     // 0x00 - multiply lower 64 bits [0:63]
1847     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1848   }
1849   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1850     // 0x11 - multiply upper 64 bits [64:127]
1851     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1852   }
1853 
1854   // AVX-512 mask operations.
1855   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1856   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1857   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1858   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1859   void kortest(uint masklen, KRegister src1, KRegister src2);
1860   void ktest(uint masklen, KRegister src1, KRegister src2);
1861 
1862   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1863   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1864 
1865   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1866   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1867 
1868   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1869   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1870 
1871   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1872   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1873 
1874   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1875   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1876   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1877   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1878 
1879   using Assembler::evpandq;
1880   void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1881 
1882   using Assembler::evpaddq;
1883   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1884 
1885   using Assembler::evporq;
1886   void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1887 
1888   using Assembler::vpshufb;
1889   void vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1890 
1891   using Assembler::vpor;
1892   void vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1893 
1894   using Assembler::vpternlogq;
1895   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
1896 
1897   void cmov32( Condition cc, Register dst, Address  src);
1898   void cmov32( Condition cc, Register dst, Register src);
1899 
1900   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1901 
1902   void cmovptr(Condition cc, Register dst, Address  src) { cmovq(cc, dst, src); }
1903   void cmovptr(Condition cc, Register dst, Register src) { cmovq(cc, dst, src); }
1904 
1905   void movoop(Register dst, jobject obj);
1906   void movoop(Address  dst, jobject obj, Register rscratch);
1907 
1908   void mov_metadata(Register dst, Metadata* obj);
1909   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
1910 
1911   void movptr(Register     dst, Register       src);
1912   void movptr(Register     dst, Address        src);
1913   void movptr(Register     dst, AddressLiteral src);
1914   void movptr(Register     dst, ArrayAddress   src);
1915   void movptr(Register     dst, intptr_t       src);
1916   void movptr(Address      dst, Register       src);
1917   void movptr(Address      dst, int32_t        imm);
1918   void movptr(Address      dst, intptr_t       src, Register rscratch);
1919   void movptr(ArrayAddress dst, Register       src, Register rscratch);
1920 
1921   void movptr(Register dst, RegisterOrConstant src) {
1922     if (src.is_constant()) movptr(dst, src.as_constant());
1923     else                   movptr(dst, src.as_register());
1924   }
1925 
1926 
1927   // to avoid hiding movl
1928   void mov32(Register       dst, AddressLiteral src);
1929   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
1930 
1931   // Import other mov() methods from the parent class or else
1932   // they will be hidden by the following overriding declaration.
1933   using Assembler::movdl;
1934   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1935 
1936   using Assembler::movq;
1937   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1938 
1939   // Can push value or effective address
1940   void pushptr(AddressLiteral src, Register rscratch);
1941 
1942   void pushptr(Address src) { pushq(src); }
1943   void popptr(Address src) { popq(src); }
1944 
1945   void pushoop(jobject obj, Register rscratch);
1946   void pushklass(Metadata* obj, Register rscratch);
1947 
1948   // sign-extend an 'l' (32-bit) value to a ptr-sized element as needed
1949   void movl2ptr(Register dst, Address src) { movslq(dst, src); }
1950   void movl2ptr(Register dst, Register src) { movslq(dst, src); }
1951 
1952 
1953  public:
1954   // Inline type specific methods
1955   #include "asm/macroAssembler_common.hpp"
1956 
1957   int store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter = true);
1958   bool move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]);
1959   bool unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
1960                             VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
1961                             RegState reg_state[]);
1962   bool pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
1963                           VMRegPair* from, int from_count, int& from_index, VMReg to,
1964                           RegState reg_state[], Register val_array);
1965   int extend_stack_for_inline_args(int args_on_stack);
1966   void remove_frame(int initial_framesize, bool needs_stack_repair);
1967   VMReg spill_reg_for(VMReg reg);
1968 
1969   // clear memory of size 'cnt' qwords, starting at 'base';
1970   // if 'is_large' is set, do not try to produce a short loop
1971   void clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only, KRegister mask=knoreg);
1972 
1973   // clear memory of constant size 'cnt' qwords, starting at 'base'
1974   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1975 
1976   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1977   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
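  // Usage sketch (illustrative only, register choices are hypothetical): zeroing a fixed-size
  // block, e.g. the body of a freshly allocated object whose size is known at compile time:
  //
  //   masm->clear_mem(rdi /* base */, 8 /* qwords */, rbx /* rtmp */, xmm0 /* xtmp */);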
1978 
1979   // Fill primitive arrays
1980   void generate_fill(BasicType t, bool aligned,
1981                      Register to, Register value, Register count,
1982                      Register rtmp, XMMRegister xtmp);
1983 
1984   void encode_iso_array(Register src, Register dst, Register len,
1985                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1986                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1987 
1988   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1989   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1990                              Register y, Register y_idx, Register z,
1991                              Register carry, Register product,
1992                              Register idx, Register kdx);
1993   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1994                               Register yz_idx, Register idx,
1995                               Register carry, Register product, int offset);
1996   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1997                                     Register carry, Register carry2,
1998                                     Register idx, Register jdx,
1999                                     Register yz_idx1, Register yz_idx2,
2000                                     Register tmp, Register tmp3, Register tmp4);
2001   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
2002                                Register yz_idx, Register idx, Register jdx,
2003                                Register carry, Register product,
2004                                Register carry2);
2005   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
2006                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
2007   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
2008                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
2009   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
2010                             Register tmp2);
2011   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
2012                        Register rdxReg, Register raxReg);
2013   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
2014   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
2015                        Register tmp3, Register tmp4);
2016   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
2017                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
2018 
2019   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
2020                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
2021                Register raxReg);
2022   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
2023                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
2024                Register raxReg);
2025   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
2026                            Register result, Register tmp1, Register tmp2,
2027                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
2028 
2029   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
2030   void update_byte_crc32(Register crc, Register val, Register table);
2031   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
2032 
2033   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
2034   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
2035                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
2036                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
2037 
2038   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
2039   // Note on a naming convention:
2040   // Prefix w = register only used on a Westmere+ architecture
2041   // Prefix n = register only used on a Nehalem architecture
  void crc32c_ipl_alg4(Register in_out, uint32_t n,
                       Register tmp1, Register tmp2, Register tmp3);
  void crc32c_pclmulqdq(XMMRegister w_xtmp1,
                        Register in_out,
                        uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                        XMMRegister w_xtmp2,
                        Register tmp1,
                        Register n_tmp2, Register n_tmp3);
  void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                       Register tmp1, Register tmp2,
                       Register n_tmp3);
  void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                         Register in_out1, Register in_out2, Register in_out3,
                         Register tmp1, Register tmp2, Register tmp3,
                         XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                         Register tmp4, Register tmp5,
                         Register n_tmp6);
  void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                            Register tmp1, Register tmp2, Register tmp3,
                            Register tmp4, Register tmp5, Register tmp6,
                            XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                            bool is_pclmulqdq_supported);
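  // The fold_* helpers below implement CRC folding: the running CRC state is kept in an
  // XMM register and repeatedly "folded" over the next chunk of input via PCLMULQDQ
  // with precomputed constants (xK), deferring the final reduction to a 32-bit CRC
  // until the end. (Informal description; see the definitions for the exact constants.)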
  // Fold 128-bit data chunk
  void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
  void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
  // Fold 512-bit data chunk
  void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
  // Fold 8-bit data
  void fold_8bit_crc32(Register crc, Register table, Register tmp);
  void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);

  // Compress char[] array to byte[].
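  // Presumably used by the java.lang.StringUTF16::compress intrinsic: copies chars that
  // fit in a single byte (Latin-1) into dst, with 'result' reporting how far the
  // compression got.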
  void char_array_compress(Register src, Register dst, Register len,
                           XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                           XMMRegister tmp4, Register tmp5, Register result,
                           KRegister mask1 = knoreg, KRegister mask2 = knoreg);

  // Inflate byte[] array to char[].
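  // Presumably the counterpart used by the java.lang.StringLatin1::inflate intrinsic:
  // zero-extends each Latin-1 byte to a 16-bit char.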
  void byte_array_inflate(Register src, Register dst, Register len,
                          XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);

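  // Vectorized fill helpers. fill32/fill64 store 32/64 bytes of the pattern held in
  // 'xmm'; the *_masked variants presumably use an AVX-512 opmask (KRegister) so the
  // tail of the region can be written without a scalar cleanup loop.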
  void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
                   Register length, Register temp, int vec_enc);

  void fill64_masked(uint shift, Register dst, int disp,
                     XMMRegister xmm, KRegister mask, Register length,
                     Register temp, bool use64byteVector = false);

  void fill32_masked(uint shift, Register dst, int disp,
                     XMMRegister xmm, KRegister mask, Register length,
                     Register temp);

  void fill32(Address dst, XMMRegister xmm);

  void fill32(Register dst, int disp, XMMRegister xmm);

  void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
  void fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector = false);

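  // Float/double -> int/long conversions with Java semantics (informal sketch): the raw
  // cvttss2si/cvttsd2si result is presumably fixed up when it reports the "integer
  // indefinite" value, so NaN becomes 0 and out-of-range inputs saturate to MIN/MAX.
  // round_float/round_double presumably implement Math.round semantics.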
  void convert_f2i(Register dst, XMMRegister src);
  void convert_d2i(Register dst, XMMRegister src);
  void convert_f2l(Register dst, XMMRegister src);
  void convert_d2l(Register dst, XMMRegister src);
  void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
  void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);

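  // Cache-line writeback support (e.g. for jdk.internal.misc.Unsafe::writebackMemory):
  // cache_wb presumably emits CLWB/CLFLUSHOPT/CLFLUSH depending on CPU support, and
  // cache_wbsync emits the ordering fence needed around a writeback sequence.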
  void cache_wb(Address line);
  void cache_wbsync(bool is_pre);

#ifdef COMPILER2_OR_JVMCI
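  // Presumably emits an AVX-512 ("AVX3") loop for array/memory fill of the given
  // BasicType, writing 64-byte vectors with masked handling of the tail.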
  void generate_fill_avx3(BasicType type, Register to, Register value,
                          Register count, Register rtmp, XMMRegister xtmp);
#endif // COMPILER2_OR_JVMCI

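  // Sets every bit of the destination vector to 1 (all-ones pattern), presumably via a
  // compare-equal of a register with itself or a vpternlogd-style sequence.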
  void vallones(XMMRegister dst, int vector_len);

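  // Verification helper: presumably checks that 'sp' (optionally offset by 'bias') is
  // aligned to the expected stack alignment and stops with 'msg' otherwise.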
  void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);

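  // Lightweight locking fast path (informal sketch): presumably CASes the object's mark
  // word into the locked state and records 'obj' on the current thread's lock-stack,
  // branching to 'slow' on contention or a full lock-stack; unlock reverses this.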
  void lightweight_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow);
  void lightweight_unlock(Register obj, Register reg_rax, Register tmp, Label& slow);

  void save_legacy_gprs();
  void restore_legacy_gprs();
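  // Presumably materializes the given condition flags as 0/1 in 'dst' (a setcc-style
  // byte set followed by zero extension).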
  void setcc(Assembler::Condition comparison, Register dst);
};

#endif // CPU_X86_MACROASSEMBLER_X86_HPP