1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef CPU_X86_MACROASSEMBLER_X86_HPP
  26 #define CPU_X86_MACROASSEMBLER_X86_HPP
  27 
  28 #include "asm/assembler.hpp"
  29 #include "asm/register.hpp"
  30 #include "code/vmreg.inline.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "utilities/macros.hpp"
  33 #include "runtime/rtmLocking.hpp"
  34 #include "runtime/signature.hpp"
  35 #include "runtime/vm_version.hpp"
  36 #include "utilities/checkedCast.hpp"
  37 
  38 class ciInlineKlass;
  39 
  40 // MacroAssembler extends Assembler by frequently used macros.
  41 //
  42 // Instructions for which a 'better' code sequence exists depending
  43 // on arguments should also go in here.
  44 
  45 class MacroAssembler: public Assembler {
  46   friend class LIR_Assembler;
  47   friend class Runtime1;      // as_Address()
  48 
  49  public:
  50   // Support for VM calls
  51   //
  52   // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  53   // may customize this version by overriding it for its purposes (e.g., to save/restore
  54   // additional registers when doing a VM call).
  55 
  56   virtual void call_VM_leaf_base(
  57     address entry_point,               // the entry point
  58     int     number_of_arguments        // the number of arguments to pop after the call
  59   );
  60 
  61  protected:
  62   // This is the base routine called by the different versions of call_VM. The interpreter
  63   // may customize this version by overriding it for its purposes (e.g., to save/restore
  64   // additional registers when doing a VM call).
  65   //
  66   // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  67   // returns the register which contains the thread upon return. If a thread register has been
  68   // specified, the return value will correspond to that register. If no last_java_sp is specified
  69   // (noreg) then rsp will be used instead.
  70   virtual void call_VM_base(           // returns the register containing the thread upon return
  71     Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
  72     Register java_thread,              // the thread if computed before     ; use noreg otherwise
  73     Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
  74     address  entry_point,              // the entry point
  75     int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
  76     bool     check_exceptions          // whether to check for pending exceptions after return
  77   );
  78 
  79   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
  80 
  81   // helpers for FPU flag access
  82   // tmp is a temporary register, if none is available use noreg
  83   void save_rax   (Register tmp);
  84   void restore_rax(Register tmp);
  85 
  86  public:
  87   MacroAssembler(CodeBuffer* code) : Assembler(code) {}
  88 
  89   // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  90   // The implementation is only non-empty for the InterpreterMacroAssembler,
  91   // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  92   virtual void check_and_handle_popframe(Register java_thread);
  93   virtual void check_and_handle_earlyret(Register java_thread);
  94 
  95   Address as_Address(AddressLiteral adr);
  96   Address as_Address(ArrayAddress adr, Register rscratch);
  97 
  98   // Support for null-checks
  99   //
 100   // Generates code that causes an OS exception if the content of reg is null.
 101   // If the accessed location is M[reg + offset] and the offset is known, provide the
 102   // offset. No explicit code generation is needed if the offset is within a certain
 103   // range (0 <= offset <= page_size).
 104 
 105   void null_check(Register reg, int offset = -1);
 106   static bool needs_explicit_null_check(intptr_t offset);
 107   static bool uses_implicit_null_check(void* address);
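  // A minimal usage sketch (illustrative only; the register and offset choices below are
  // assumptions, not taken from a real caller in this file):
  //
  //   null_check(rbx, oopDesc::klass_offset_in_bytes()); // small known offset: no extra code,
  //                                                       // the subsequent access itself faults
  //   null_check(rbx);                                    // offset unknown: an explicit check is emitted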
 108 
 109   // markWord tests, kills markWord reg
 110   void test_markword_is_inline_type(Register markword, Label& is_inline_type);
 111 
 112   // inlineKlass queries, kills temp_reg
 113   void test_klass_is_inline_type(Register klass, Register temp_reg, Label& is_inline_type);
 114   void test_klass_is_empty_inline_type(Register klass, Register temp_reg, Label& is_empty_inline_type);
 115   void test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type);
 116 
 117   // Get the default value oop for the given InlineKlass
 118   void get_default_value_oop(Register inline_klass, Register temp_reg, Register obj);
 119   // The empty value oop for the given InlineKlass ("empty" as in no instance fields);
 120   // same as get_default_value_oop, but with an extra assertion for an empty inline klass
 121   void get_empty_inline_type_oop(Register inline_klass, Register temp_reg, Register obj);
 122 
 123   void test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free);
 124   void test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free);
 125   void test_field_is_flat(Register flags, Register temp_reg, Label& is_flat);
 126   void test_field_has_null_marker(Register flags, Register temp_reg, Label& has_null_marker);
 127 
 128   // Check oops for special arrays, i.e. flat arrays and/or null-free arrays
 129   void test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label);
 130   void test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array);
 131   void test_non_flat_array_oop(Register oop, Register temp_reg, Label& is_non_flat_array);
 132   void test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array);
 133   void test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array);
 134 
 135   // Check array klass layout helper for flat or null-free arrays...
 136   void test_flat_array_layout(Register lh, Label& is_flat_array);
 137   void test_non_flat_array_layout(Register lh, Label& is_non_flat_array);
 138 
 139   // Required platform-specific helpers for Label::patch_instructions.
 140   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
 141   void pd_patch_instruction(address branch, address target, const char* file, int line) {
 142     unsigned char op = branch[0];
 143     assert(op == 0xE8 /* call */ ||
 144         op == 0xE9 /* jmp */ ||
 145         op == 0xEB /* short jmp */ ||
 146         (op & 0xF0) == 0x70 /* short jcc */ ||
 147         (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
 148         (op == 0xC7 && branch[1] == 0xF8) /* xbegin */,
 149         "Invalid opcode at patch point");
 150 
 151     if (op == 0xEB || (op & 0xF0) == 0x70) {
 152       // short offset operators (jmp and jcc)
 153       char* disp = (char*) &branch[1];
 154       int imm8 = checked_cast<int>(target - (address) &disp[1]);
 155       guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
 156                 file == nullptr ? "<null>" : file, line);
 157       *disp = (char)imm8;
 158     } else {
 159       int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
 160       int imm32 = checked_cast<int>(target - (address) &disp[1]);
 161       *disp = imm32;
 162     }
 163   }
 164 
 165   // The following 4 methods return the offset of the appropriate move instruction
 166 
 167   // Support for fast byte/short loading with zero extension (depending on particular CPU)
 168   int load_unsigned_byte(Register dst, Address src);
 169   int load_unsigned_short(Register dst, Address src);
 170 
 171   // Support for fast byte/short loading with sign extension (depending on particular CPU)
 172   int load_signed_byte(Register dst, Address src);
 173   int load_signed_short(Register dst, Address src);
 174 
 175   // Support for sign-extension (hi:lo = extend_sign(lo))
 176   void extend_sign(Register hi, Register lo);
 177 
 178   // Load and store values by size and signed-ness
 179   void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
 180   void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);
 181 
 182   // Support for inc/dec with optimal instruction selection depending on value
 183 
 184   void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
 185   void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
 186   void increment(Address dst, int value = 1)  { LP64_ONLY(incrementq(dst, value)) NOT_LP64(incrementl(dst, value)) ; }
 187   void decrement(Address dst, int value = 1)  { LP64_ONLY(decrementq(dst, value)) NOT_LP64(decrementl(dst, value)) ; }
 188 
 189   void decrementl(Address dst, int value = 1);
 190   void decrementl(Register reg, int value = 1);
 191 
 192   void decrementq(Register reg, int value = 1);
 193   void decrementq(Address dst, int value = 1);
 194 
 195   void incrementl(Address dst, int value = 1);
 196   void incrementl(Register reg, int value = 1);
 197 
 198   void incrementq(Register reg, int value = 1);
 199   void incrementq(Address dst, int value = 1);
 200 
 201   void incrementl(AddressLiteral dst, Register rscratch = noreg);
 202   void incrementl(ArrayAddress   dst, Register rscratch);
 203 
 204   void incrementq(AddressLiteral dst, Register rscratch = noreg);
 205 
 206   // Support optimal SSE move instructions.
 207   void movflt(XMMRegister dst, XMMRegister src) {
 208     if (dst->encoding() == src->encoding()) return;
 209     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
 210     else                       { movss (dst, src); return; }
 211   }
 212   void movflt(XMMRegister dst, Address src) { movss(dst, src); }
 213   void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 214   void movflt(Address dst, XMMRegister src) { movss(dst, src); }
 215 
 216   // Move with zero extension
 217   void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }
 218 
 219   void movdbl(XMMRegister dst, XMMRegister src) {
 220     if (dst->encoding() == src->encoding()) return;
 221     if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
 222     else                       { movsd (dst, src); return; }
 223   }
 224 
 225   void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 226 
 227   void movdbl(XMMRegister dst, Address src) {
 228     if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
 229     else                         { movlpd(dst, src); return; }
 230   }
 231   void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }
 232 
 233   void flt_to_flt16(Register dst, XMMRegister src, XMMRegister tmp) {
 234     // Use a separate tmp XMM register because the caller may
 235     // require the src XMM register to be unchanged (as in x86.ad).
 236     vcvtps2ph(tmp, src, 0x04, Assembler::AVX_128bit);
 237     movdl(dst, tmp);
 238     movswl(dst, dst);
 239   }
 240 
 241   void flt16_to_flt(XMMRegister dst, Register src) {
 242     movdl(dst, src);
 243     vcvtph2ps(dst, dst, Assembler::AVX_128bit);
 244   }
 245 
 246   // Alignment
 247   void align32();
 248   void align64();
 249   void align(int modulus);
 250   void align(int modulus, int target);
 251 
 252   void post_call_nop();
 253   // A 5 byte nop that is safe for patching (see patch_verified_entry)
 254   void fat_nop();
 255 
 256   // Stack frame creation/removal
 257   void enter();
 258   void leave();
 259 
 260   // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
 261   // The pointer will be loaded into the thread register.
 262   void get_thread(Register thread);
 263 
 264 #ifdef _LP64
 265   // Support for argument shuffling
 266 
 267   // bias in bytes
 268   void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 269   void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 270   void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 271   void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
 272   void move_ptr(VMRegPair src, VMRegPair dst);
 273   void object_move(OopMap* map,
 274                    int oop_handle_offset,
 275                    int framesize_in_slots,
 276                    VMRegPair src,
 277                    VMRegPair dst,
 278                    bool is_receiver,
 279                    int* receiver_offset);
 280 #endif // _LP64
 281 
 282   // Support for VM calls
 283   //
 284   // It is imperative that all calls into the VM are handled via the call_VM macros.
 285   // They make sure that the stack linkage is set up correctly. call_VM's correspond
 286   // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
 287 
 288 
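  // A hedged usage sketch (the entry point and register choices below are illustrative
  // assumptions; real call sites live in the interpreter and stub generators):
  //
  //   // result oop ends up in rax, one register argument, pending exceptions checked afterwards
  //   call_VM(rax,
  //           CAST_FROM_FN_PTR(address, InterpreterRuntime::some_entry),   // hypothetical entry point
  //           rbx);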
 289   void call_VM(Register oop_result,
 290                address entry_point,
 291                bool check_exceptions = true);
 292   void call_VM(Register oop_result,
 293                address entry_point,
 294                Register arg_1,
 295                bool check_exceptions = true);
 296   void call_VM(Register oop_result,
 297                address entry_point,
 298                Register arg_1, Register arg_2,
 299                bool check_exceptions = true);
 300   void call_VM(Register oop_result,
 301                address entry_point,
 302                Register arg_1, Register arg_2, Register arg_3,
 303                bool check_exceptions = true);
 304 
 305   // Overloadings with last_Java_sp
 306   void call_VM(Register oop_result,
 307                Register last_java_sp,
 308                address entry_point,
 309                int number_of_arguments = 0,
 310                bool check_exceptions = true);
 311   void call_VM(Register oop_result,
 312                Register last_java_sp,
 313                address entry_point,
 314                Register arg_1,
 315                bool check_exceptions = true);
 316   void call_VM(Register oop_result,
 317                Register last_java_sp,
 318                address entry_point,
 319                Register arg_1, Register arg_2,
 320                bool check_exceptions = true);
 321   void call_VM(Register oop_result,
 322                Register last_java_sp,
 323                address entry_point,
 324                Register arg_1, Register arg_2, Register arg_3,
 325                bool check_exceptions = true);
 326 
 327   void get_vm_result  (Register oop_result, Register thread);
 328   void get_vm_result_2(Register metadata_result, Register thread);
 329 
 330   // These always tightly bind to MacroAssembler::call_VM_base
 331   // bypassing the virtual implementation
 332   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
 333   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
 334   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
 335   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
 336   void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);
 337 
 338   void call_VM_leaf0(address entry_point);
 339   void call_VM_leaf(address entry_point,
 340                     int number_of_arguments = 0);
 341   void call_VM_leaf(address entry_point,
 342                     Register arg_1);
 343   void call_VM_leaf(address entry_point,
 344                     Register arg_1, Register arg_2);
 345   void call_VM_leaf(address entry_point,
 346                     Register arg_1, Register arg_2, Register arg_3);
 347 
 348   void call_VM_leaf(address entry_point,
 349                     Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 350 
 351   // These always tightly bind to MacroAssembler::call_VM_leaf_base
 352   // bypassing the virtual implementation
 353   void super_call_VM_leaf(address entry_point);
 354   void super_call_VM_leaf(address entry_point, Register arg_1);
 355   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
 356   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
 357   void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
 358 
 359   // last Java Frame (fills frame anchor)
 360   void set_last_Java_frame(Register thread,
 361                            Register last_java_sp,
 362                            Register last_java_fp,
 363                            address  last_java_pc,
 364                            Register rscratch);
 365 
 366   // thread in the default location (r15_thread on 64bit)
 367   void set_last_Java_frame(Register last_java_sp,
 368                            Register last_java_fp,
 369                            address  last_java_pc,
 370                            Register rscratch);
 371 
 372   void reset_last_Java_frame(Register thread, bool clear_fp);
 373 
 374   // thread in the default location (r15_thread on 64bit)
 375   void reset_last_Java_frame(bool clear_fp);
 376 
 377   // jobjects
 378   void clear_jobject_tag(Register possibly_non_local);
 379   void resolve_jobject(Register value, Register thread, Register tmp);
 380   void resolve_global_jobject(Register value, Register thread, Register tmp);
 381 
 382   // C 'boolean' to Java boolean: x == 0 ? 0 : 1
 383   void c2bool(Register x);
 384 
 385   // C++ bool manipulation
 386 
 387   void movbool(Register dst, Address src);
 388   void movbool(Address dst, bool boolconst);
 389   void movbool(Address dst, Register src);
 390   void testbool(Register dst);
 391 
 392   void resolve_oop_handle(Register result, Register tmp);
 393   void resolve_weak_handle(Register result, Register tmp);
 394   void load_mirror(Register mirror, Register method, Register tmp);
 395   void load_method_holder_cld(Register rresult, Register rmethod);
 396 
 397   void load_method_holder(Register holder, Register method);
 398 
 399   // oop manipulations
 400   void load_metadata(Register dst, Register src);
 401   void load_klass(Register dst, Register src, Register tmp);
 402   void store_klass(Register dst, Register src, Register tmp);
 403 
 404   void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
 405                       Register tmp1, Register thread_tmp);
 406   void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
 407                        Register tmp1, Register tmp2, Register tmp3);
 408 
 409   void access_value_copy(DecoratorSet decorators, Register src, Register dst, Register inline_klass);
 410 
 411   // inline type data payload offsets...
 412   void first_field_offset(Register inline_klass, Register offset);
 413   void data_for_oop(Register oop, Register data, Register inline_klass);
 414   // get the data payload ptr of a flat value array at index, kills rcx and index
 415   void data_for_value_array_index(Register array, Register array_klass,
 416                                   Register index, Register data);
 417 
 418   void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
 419                      Register thread_tmp = noreg, DecoratorSet decorators = 0);
 420   void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
 421                               Register thread_tmp = noreg, DecoratorSet decorators = 0);
 422   void store_heap_oop(Address dst, Register val, Register tmp1 = noreg,
 423                       Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);
 424 
 425   // Used for storing null. All other oop constants should be
 426   // stored using routines that take a jobject.
 427   void store_heap_oop_null(Address dst);
 428 
 429   void load_prototype_header(Register dst, Register src, Register tmp);
 430 
 431 #ifdef _LP64
 432   void store_klass_gap(Register dst, Register src);
 433 
 434   // This dummy is to prevent a call to store_heap_oop from
 435   // converting a zero (like null) into a Register by giving
 436   // the compiler two choices it can't resolve
 437 
 438   void store_heap_oop(Address dst, void* dummy);
 439 
 440   void encode_heap_oop(Register r);
 441   void decode_heap_oop(Register r);
 442   void encode_heap_oop_not_null(Register r);
 443   void decode_heap_oop_not_null(Register r);
 444   void encode_heap_oop_not_null(Register dst, Register src);
 445   void decode_heap_oop_not_null(Register dst, Register src);
 446 
 447   void set_narrow_oop(Register dst, jobject obj);
 448   void set_narrow_oop(Address dst, jobject obj);
 449   void cmp_narrow_oop(Register dst, jobject obj);
 450   void cmp_narrow_oop(Address dst, jobject obj);
 451 
 452   void encode_klass_not_null(Register r, Register tmp);
 453   void decode_klass_not_null(Register r, Register tmp);
 454   void encode_and_move_klass_not_null(Register dst, Register src);
 455   void decode_and_move_klass_not_null(Register dst, Register src);
 456   void set_narrow_klass(Register dst, Klass* k);
 457   void set_narrow_klass(Address dst, Klass* k);
 458   void cmp_narrow_klass(Register dst, Klass* k);
 459   void cmp_narrow_klass(Address dst, Klass* k);
 460 
 461   // if heap base register is used - reinit it with the correct value
 462   void reinit_heapbase();
 463 
 464   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 465 
 466 #endif // _LP64
 467 
 468   // Int division/remainder for Java
 469   // (as idivl, but checks for the special case described in the JVM spec.)
 470   // returns idivl instruction offset for implicit exception handling
 471   int corrected_idivl(Register reg);
 472 
 473   // Long division/remainder for Java
 474   // (as idivq, but checks for the special case described in the JVM spec.)
 475   // returns idivq instruction offset for implicit exception handling
 476   int corrected_idivq(Register reg);
 477 
 478   void int3();
 479 
 480   // Long operation macros for a 32bit cpu
 481   // Long negation for Java
 482   void lneg(Register hi, Register lo);
 483 
 484   // Long multiplication for Java
 485   // (destroys contents of eax, ebx, ecx and edx)
 486   void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y
 487 
 488   // Long shifts for Java
 489   // (semantics as described in JVM spec.)
 490   void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
 491   void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)
 492 
 493   // Long compare for Java
 494   // (semantics as described in JVM spec.)
 495   void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)
 496 
 497 
 498   // misc
 499 
 500   // Sign extension
 501   void sign_extend_short(Register reg);
 502   void sign_extend_byte(Register reg);
 503 
 504   // Division by power of 2, rounding towards 0
 505   void division_with_shift(Register reg, int shift_value);
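  // For example (illustrative): division_with_shift(reg, 2) computes reg / 4, with the
  // usual fix-up so that negative values still round towards 0 rather than towards -infinity.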
 506 
 507 #ifndef _LP64
 508   // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
 509   //
 510   // CF (corresponds to C0) if x < y
 511   // PF (corresponds to C2) if unordered
 512   // ZF (corresponds to C3) if x = y
 513   //
 514   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 515   // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
 516   void fcmp(Register tmp);
 517   // Variant of the above which allows y to be further down the stack
 518   // and which only pops x and y if specified. If pop_right is
 519   // specified then pop_left must also be specified.
 520   void fcmp(Register tmp, int index, bool pop_left, bool pop_right);
 521 
 522   // Floating-point comparison for Java
 523   // Compares the top-most stack entries on the FPU stack and stores the result in dst.
 524   // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
 525   // (semantics as described in JVM spec.)
 526   void fcmp2int(Register dst, bool unordered_is_less);
 527   // Variant of the above which allows y to be further down the stack
 528   // and which only pops x and y if specified. If pop_right is
 529   // specified then pop_left must also be specified.
 530   void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);
 531 
 532   // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
 533   // tmp is a temporary register, if none is available use noreg
 534   void fremr(Register tmp);
 535 
 536   // only if +VerifyFPU
 537   void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
 538 #endif // !_LP64
 539 
 540   // dst = c = a * b + c
 541   void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 542   void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 543 
 544   void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 545   void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
 546   void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 547   void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
 548 
 549 
 550   // same as fcmp2int, but using SSE2
 551   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 552   void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
 553 
 554   // branch to L if FPU flag C2 is set/not set
 555   // tmp is a temporary register, if none is available use noreg
 556   void jC2 (Register tmp, Label& L);
 557   void jnC2(Register tmp, Label& L);
 558 
 559   // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
 560   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 561   void load_float(Address src);
 562 
 563   // Store float value to 'address'. If UseSSE >= 1, the value is stored
 564   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 565   void store_float(Address dst);
 566 
 567   // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
 568   // register xmm0. Otherwise, the value is loaded onto the FPU stack.
 569   void load_double(Address src);
 570 
 571   // Store double value to 'address'. If UseSSE >= 2, the value is stored
 572   // from register xmm0. Otherwise, the value is stored from the FPU stack.
 573   void store_double(Address dst);
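  // As a hedged sketch (the actual selection lives in macroAssembler_x86.cpp), load_float
  // amounts to something like:
  //
  //   if (UseSSE >= 1) { movflt(xmm0, src); }   // SSE path: value ends up in xmm0
  //   else             { fld_s(src);        }   // x87 path (32-bit only): pushed onto the FPU stack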
 574 
 575 #ifndef _LP64
 576   // Pop ST (ffree & fincstp combined)
 577   void fpop();
 578 
 579   void empty_FPU_stack();
 580 #endif // !_LP64
 581 
 582   void push_IU_state();
 583   void pop_IU_state();
 584 
 585   void push_FPU_state();
 586   void pop_FPU_state();
 587 
 588   void push_CPU_state();
 589   void pop_CPU_state();
 590 
 591   void push_cont_fastpath();
 592   void pop_cont_fastpath();
 593 
 594   void inc_held_monitor_count();
 595   void dec_held_monitor_count();
 596 
 597   DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)
 598 
 599   // Round up reg to a multiple of modulus (which must be a power of two)
 600   void round_to(Register reg, int modulus);
 601 
 602 private:
 603   // General purpose and XMM registers potentially clobbered by native code; there
 604   // is no need for FPU or AVX opmask related methods because C1/interpreter
 605   // - always save/restore the FPU state as a whole
 606   // - do not care about the AVX-512 opmask registers
 607   static RegSet call_clobbered_gp_registers();
 608   static XMMRegSet call_clobbered_xmm_registers();
 609 
 610   void push_set(XMMRegSet set, int offset);
 611   void pop_set(XMMRegSet set, int offset);
 612 
 613 public:
 614   void push_set(RegSet set, int offset = -1);
 615   void pop_set(RegSet set, int offset = -1);
 616 
 617   // Push and pop everything that might be clobbered by a native
 618   // runtime call.
 619   // Only save the lower 64 bits of each vector register.
 620   // Additional registers can be excluded in a passed RegSet.
 621   void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
 622   void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);
 623 
 624   void push_call_clobbered_registers(bool save_fpu = true) {
 625     push_call_clobbered_registers_except(RegSet(), save_fpu);
 626   }
 627   void pop_call_clobbered_registers(bool restore_fpu = true) {
 628     pop_call_clobbered_registers_except(RegSet(), restore_fpu);
 629   }
 630 
 631   // allocation
 632 
 633   // Object / value buffer allocation...
 634   // Allocate instance of klass, assumes klass initialized by caller
 635   // new_obj prefers to be rax
 636   // Kills t1 and t2, preserves klass, returns allocation in new_obj (rsi on LP64)
 637   void allocate_instance(Register klass, Register new_obj,
 638                          Register t1, Register t2,
 639                          bool clear_fields, Label& alloc_failed);
 640 
 641   void tlab_allocate(
 642     Register thread,                   // Current thread
 643     Register obj,                      // result: pointer to object after successful allocation
 644     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
 645     int      con_size_in_bytes,        // object size in bytes if   known at compile time
 646     Register t1,                       // temp register
 647     Register t2,                       // temp register
 648     Label&   slow_case                 // continuation point if fast allocation fails
 649   );
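  // A hedged example of a fixed-size fast path (the LP64 register choices, the size constant
  // and the slow-path label are illustrative assumptions):
  //
  //   Label slow;
  //   tlab_allocate(r15_thread, rax, noreg, instance_size_in_bytes, rbx, rcx, slow);
  //   // rax now points to the newly allocated, still uninitialized object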
 650   void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
 651 
 652   // For field "index" within "klass", return inline_klass ...
 653   void get_inline_type_field_klass(Register klass, Register index, Register inline_klass);
 654 
 655   // interface method calling
 656   void lookup_interface_method(Register recv_klass,
 657                                Register intf_klass,
 658                                RegisterOrConstant itable_index,
 659                                Register method_result,
 660                                Register scan_temp,
 661                                Label& no_such_interface,
 662                                bool return_method = true);
 663 
 664   void lookup_interface_method_stub(Register recv_klass,
 665                                     Register holder_klass,
 666                                     Register resolved_klass,
 667                                     Register method_result,
 668                                     Register scan_temp,
 669                                     Register temp_reg2,
 670                                     Register receiver,
 671                                     int itable_index,
 672                                     Label& L_no_such_interface);
 673 
 674   // virtual method calling
 675   void lookup_virtual_method(Register recv_klass,
 676                              RegisterOrConstant vtable_index,
 677                              Register method_result);
 678 
 679   // Test sub_klass against super_klass, with fast and slow paths.
 680 
 681   // The fast path produces a tri-state answer: yes / no / maybe-slow.
 682   // One of the three labels can be null, meaning take the fall-through.
 683   // If super_check_offset is -1, the value is loaded up from super_klass.
 684   // No registers are killed, except temp_reg.
 685   void check_klass_subtype_fast_path(Register sub_klass,
 686                                      Register super_klass,
 687                                      Register temp_reg,
 688                                      Label* L_success,
 689                                      Label* L_failure,
 690                                      Label* L_slow_path,
 691                 RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
 692 
 693   // The rest of the type check; must be wired to a corresponding fast path.
 694   // It does not repeat the fast path logic, so don't use it standalone.
 695   // The temp_reg and temp2_reg can be noreg, if no temps are available.
 696   // Updates the sub's secondary super cache as necessary.
 697   // If set_cond_codes, condition codes will be Z on success, NZ on failure.
 698   void check_klass_subtype_slow_path(Register sub_klass,
 699                                      Register super_klass,
 700                                      Register temp_reg,
 701                                      Register temp2_reg,
 702                                      Label* L_success,
 703                                      Label* L_failure,
 704                                      bool set_cond_codes = false);
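  // The fast and slow variants are meant to be wired together; a hedged sketch of the
  // typical pairing (labels and register names are illustrative assumptions):
  //
  //   Label L_success, L_failure, L_slow;
  //   check_klass_subtype_fast_path(rsub, rsuper, rtmp, &L_success, &L_failure, &L_slow);
  //   bind(L_slow);
  //   check_klass_subtype_slow_path(rsub, rsuper, rtmp, noreg, &L_success, &L_failure);
  //   bind(L_failure);   // handle failure here; L_success continues elsewhere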
 705 
 706   // Simplified, combined version, good for typical uses.
 707   // Falls through on failure.
 708   void check_klass_subtype(Register sub_klass,
 709                            Register super_klass,
 710                            Register temp_reg,
 711                            Label& L_success);
 712 
 713   void clinit_barrier(Register klass,
 714                       Register thread,
 715                       Label* L_fast_path = nullptr,
 716                       Label* L_slow_path = nullptr);
 717 
 718   // method handles (JSR 292)
 719   Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
 720 
 721   // Debugging
 722 
 723   // only if +VerifyOops
 724   void _verify_oop(Register reg, const char* s, const char* file, int line);
 725   void _verify_oop_addr(Address addr, const char* s, const char* file, int line);
 726 
 727   void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
 728     if (VerifyOops) {
 729       _verify_oop(reg, s, file, line);
 730     }
 731   }
 732   void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
 733     if (VerifyOops) {
 734       _verify_oop_addr(reg, s, file, line);
 735     }
 736   }
 737 
 738   // TODO: verify method and klass metadata (compare against vptr?)
 739   void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
 740   void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}
 741 
 742 #define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
 743 #define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
 744 #define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
 745 #define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
 746 #define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
 747 
 748   // Verify or restore cpu control state after JNI call
 749   void restore_cpu_control_state_after_jni(Register rscratch);
 750 
 751   // prints msg, dumps registers and stops execution
 752   void stop(const char* msg);
 753 
 754   // prints msg and continues
 755   void warn(const char* msg);
 756 
 757   // dumps registers and other state
 758   void print_state();
 759 
 760   static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
 761   static void debug64(char* msg, int64_t pc, int64_t regs[]);
 762   static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
 763   static void print_state64(int64_t pc, int64_t regs[]);
 764 
 765   void os_breakpoint();
 766 
 767   void untested()                                { stop("untested"); }
 768 
 769   void unimplemented(const char* what = "");
 770 
 771   void should_not_reach_here()                   { stop("should not reach here"); }
 772 
 773   void print_CPU_state();
 774 
 775   // Stack overflow checking
 776   void bang_stack_with_offset(int offset) {
 777     // stack grows down, caller passes positive offset
 778     assert(offset > 0, "must bang with positive offset");
 779     movl(Address(rsp, (-offset)), rax);
 780   }
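  // Illustrative use only (the real shadow-zone banging is driven by the runtime/compilers;
  // n_shadow_pages below is an assumed value):
  //
  //   for (int pages = 1; pages <= n_shadow_pages; pages++) {
  //     bang_stack_with_offset(pages * (int)os::vm_page_size());
  //   }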
 781 
 782   // Writes to successive stack pages until the given offset is reached, to check for
 783   // stack overflow + shadow pages. Also clobbers tmp.
 784   void bang_stack_size(Register size, Register tmp);
 785 
 786   // Check for reserved stack access in method being exited (for JIT)
 787   void reserved_stack_check();
 788 
 789   void safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod);
 790 
 791   void verify_tlab();
 792 
 793   static Condition negate_condition(Condition cond);
 794 
 795   // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
 796   // operands. In general the names are modified to avoid hiding the instruction in Assembler,
 797   // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
 798   // here in MacroAssembler. The major exception to this rule is call.
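  // A small hedged example of the convention (the external symbols are hypothetical):
  //
  //   cmp32(rax, ExternalAddress((address)&some_global_counter));        // reg vs. &mem
  //   cmpptr(rbx, ExternalAddress(some_far_address), rscratch1);         // may need a scratch reg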
 799 
 800   // Arithmetics
 801 
 802 
 803   void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
 804   void addptr(Address dst, Register src);
 805 
 806   void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
 807   void addptr(Register dst, int32_t src);
 808   void addptr(Register dst, Register src);
 809   void addptr(Register dst, RegisterOrConstant src) {
 810     if (src.is_constant()) addptr(dst, checked_cast<int>(src.as_constant()));
 811     else                   addptr(dst, src.as_register());
 812   }
 813 
 814   void andptr(Register dst, int32_t src);
 815   void andptr(Register dst, Register src) { LP64_ONLY(andq(dst, src)) NOT_LP64(andl(dst, src)) ; }
 816   void andptr(Register dst, Address src) { LP64_ONLY(andq(dst, src)) NOT_LP64(andl(dst, src)) ; }
 817 
 818 #ifdef _LP64
 819   using Assembler::andq;
 820   void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
 821 #endif
 822 
 823   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 824 
 825   // renamed to drag out the casting of address to int32_t/intptr_t
 826   void cmp32(Register src1, int32_t imm);
 827 
 828   void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
 829   // compare reg - mem, or reg - &mem
 830   void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);
 831 
 832   void cmp32(Register src1, Address src2);
 833 
 834 #ifndef _LP64
 835   void cmpklass(Address dst, Metadata* obj);
 836   void cmpklass(Register dst, Metadata* obj);
 837   void cmpoop(Address dst, jobject obj);
 838 #endif // !_LP64
 839 
 840   void cmpoop(Register src1, Register src2);
 841   void cmpoop(Register src1, Address src2);
 842   void cmpoop(Register dst, jobject obj, Register rscratch);
 843 
 844   // NOTE: src2 must be the lval. This is NOT a mem-mem compare
 845   void cmpptr(Address src1, AddressLiteral src2, Register rscratch);
 846 
 847   void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);
 848 
 849   void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 850   void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 851   // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 852 
 853   void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 854   void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 855 
 856   // cmp64 to avoid hiding cmpq
 857   void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);
 858 
 859   void cmpxchgptr(Register reg, Address adr);
 860 
 861   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 862 
 863   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
 864   void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }
 865 
 866 
 867   void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }
 868 
 869   void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }
 870 
 871   void shlptr(Register dst, int32_t shift);
 872   void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }
 873 
 874   void shrptr(Register dst, int32_t shift);
 875   void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }
 876 
 877   void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
 878   void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }
 879 
 880   void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 881 
 882   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 883   void subptr(Register dst, int32_t src);
 884   // Force generation of a 4 byte immediate value even if it fits into 8 bits
 885   void subptr_imm32(Register dst, int32_t src);
 886   void subptr(Register dst, Register src);
 887   void subptr(Register dst, RegisterOrConstant src) {
 888     if (src.is_constant()) subptr(dst, (int) src.as_constant());
 889     else                   subptr(dst,       src.as_register());
 890   }
 891 
 892   void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 893   void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 894 
 895   void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 896   void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 897 
 898   void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }
 899 
 900 
 901 
 902   // Helper functions for statistics gathering.
 903   // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
 904   void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
 905   // Unconditional atomic increment.
 906   void atomic_incl(Address counter_addr);
 907   void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
 908 #ifdef _LP64
 909   void atomic_incq(Address counter_addr);
 910   void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
 911 #endif
 912   void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { LP64_ONLY(atomic_incq(counter_addr, rscratch)) NOT_LP64(atomic_incl(counter_addr, rscratch)) ; }
 913   void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
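  // Illustrative use for a statistics counter (the counter symbols are hypothetical):
  //
  //   atomic_incl(ExternalAddress((address)&_hypothetical_counter), rscratch1);
  //   cond_inc32(Assembler::equal, ExternalAddress((address)&_hypothetical_hits));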
 914 
 915   void lea(Register dst, Address        adr) { Assembler::lea(dst, adr); }
 916   void lea(Register dst, AddressLiteral adr);
 917   void lea(Address  dst, AddressLiteral adr, Register rscratch);
 918 
 919   void leal32(Register dst, Address src) { leal(dst, src); }
 920 
 921   // Import other testl() methods from the parent class or else
 922   // they will be hidden by the following overriding declaration.
 923   using Assembler::testl;
 924   void testl(Address dst, int32_t imm32);
 925   void testl(Register dst, int32_t imm32);
 926   void testl(Register dst, AddressLiteral src); // requires reachable address
 927   using Assembler::testq;
 928   void testq(Address dst, int32_t imm32);
 929   void testq(Register dst, int32_t imm32);
 930 
 931   void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 932   void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 933   void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 934   void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }
 935 
 936   void testptr(Register src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 937   void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
 938   void testptr(Address src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 939   void testptr(Register src1, Register src2);
 940 
 941   void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 942   void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 943 
 944   // Calls
 945 
 946   void call(Label& L, relocInfo::relocType rtype);
 947   void call(Register entry);
 948   void call(Address addr) { Assembler::call(addr); }
 949 
 950   // NOTE: this call transfers to the effective address of entry, NOT
 951   // the address contained by entry, because that is more natural
 952   // for jumps/calls.
 953   void call(AddressLiteral entry, Register rscratch = rax);
 954 
 955   // Emit the CompiledIC call idiom
 956   void ic_call(address entry, jint method_index = 0);
 957   static int ic_check_size();
 958   int ic_check(int end_alignment);
 959 
 960   void emit_static_call_stub();
 961 
 962   // Jumps
 963 
 964   // NOTE: these jumps transfer to the effective address of dst, NOT
 965   // the address contained by dst, because that is more natural
 966   // for jumps/calls.
 967   void jump(AddressLiteral dst, Register rscratch = noreg);
 968 
 969   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
 970 
 971   // 32bit can do a case table jump in one instruction but we no longer allow the base
 972   // to be installed in the Address class. This jump will transfer to the address
 973   // contained in the location described by entry (not the address of entry)
 974   void jump(ArrayAddress entry, Register rscratch);
 975 
 976   // Floating
 977 
 978   void push_f(XMMRegister r);
 979   void pop_f(XMMRegister r);
 980   void push_d(XMMRegister r);
 981   void pop_d(XMMRegister r);
 982 
 983   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
 984   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
 985   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 986 
 987   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
 988   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
 989   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 990 
 991   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
 992   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
 993   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 994 
 995   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
 996   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
 997   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
 998 
 999 #ifndef _LP64
1000   void fadd_s(Address        src) { Assembler::fadd_s(src); }
1001   void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }
1002 
1003   void fldcw(Address        src) { Assembler::fldcw(src); }
1004   void fldcw(AddressLiteral src);
1005 
1006   void fld_s(int index)          { Assembler::fld_s(index); }
1007   void fld_s(Address        src) { Assembler::fld_s(src); }
1008   void fld_s(AddressLiteral src);
1009 
1010   void fld_d(Address        src) { Assembler::fld_d(src); }
1011   void fld_d(AddressLiteral src);
1012 
1013   void fld_x(Address        src) { Assembler::fld_x(src); }
1014   void fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); }
1015 
1016   void fmul_s(Address        src) { Assembler::fmul_s(src); }
1017   void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
1018 #endif // !_LP64
1019 
1020   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
1021   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
1022 
1023 #ifdef _LP64
1024  private:
1025   void sha256_AVX2_one_round_compute(
1026     Register  reg_old_h,
1027     Register  reg_a,
1028     Register  reg_b,
1029     Register  reg_c,
1030     Register  reg_d,
1031     Register  reg_e,
1032     Register  reg_f,
1033     Register  reg_g,
1034     Register  reg_h,
1035     int iter);
1036   void sha256_AVX2_four_rounds_compute_first(int start);
1037   void sha256_AVX2_four_rounds_compute_last(int start);
1038   void sha256_AVX2_one_round_and_sched(
1039         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
1040         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
1041         XMMRegister xmm_2,     /* ymm6 */
1042         XMMRegister xmm_3,     /* ymm7 */
1043         Register    reg_a,      /* == eax on iteration 0, then rotate 8 registers right on each subsequent iteration */
1044         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
1045         Register    reg_c,      /* edi */
1046         Register    reg_d,      /* esi */
1047         Register    reg_e,      /* r8d */
1048         Register    reg_f,      /* r9d */
1049         Register    reg_g,      /* r10d */
1050         Register    reg_h,      /* r11d */
1051         int iter);
1052 
1053   void addm(int disp, Register r1, Register r2);
1054 
1055   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1056                                      Register e, Register f, Register g, Register h, int iteration);
1057 
1058   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1059                                           Register a, Register b, Register c, Register d, Register e, Register f,
1060                                           Register g, Register h, int iteration);
1061 
1062   void addmq(int disp, Register r1, Register r2);
1063  public:
1064   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1065                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1066                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1067                    bool multi_block, XMMRegister shuf_mask);
1068   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1069                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1070                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1071                    XMMRegister shuf_mask);
1072 #endif // _LP64
1073 
1074   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1075                 bool multi_block);
1076 
1077   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1078                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1079                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1080                  bool multi_block);
1081 
1082 #ifdef _LP64
1083   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1084                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1085                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1086                    bool multi_block, XMMRegister shuf_mask);
1087 #else
1088   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1089                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1090                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1091                    bool multi_block);
1092 #endif
1093 
1094   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1095                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1096                 Register rax, Register rcx, Register rdx, Register tmp);
1097 
1098 #ifndef _LP64
1099  private:
1100   // Initialized in macroAssembler_x86_constants.cpp
1101   static address ONES;
1102   static address L_2IL0FLOATPACKET_0;
1103   static address PI4_INV;
1104   static address PI4X3;
1105   static address PI4X4;
1106 
1107  public:
1108   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1109                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1110                 Register rax, Register rcx, Register rdx, Register tmp1);
1111 
1112   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1113                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1114                 Register rax, Register rcx, Register rdx, Register tmp);
1115 
1116   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1117                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1118                 Register rdx, Register tmp);
1119 
1120   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1121                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1122                 Register rax, Register rbx, Register rdx);
1123 
1124   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1125                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1126                 Register rax, Register rcx, Register rdx, Register tmp);
1127 
1128   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1129                         Register edx, Register ebx, Register esi, Register edi,
1130                         Register ebp, Register esp);
1131 
1132   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1133                          Register esi, Register edi, Register ebp, Register esp);
1134 
1135   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1136                         Register edx, Register ebx, Register esi, Register edi,
1137                         Register ebp, Register esp);
1138 
1139   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1140                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1141                 Register rax, Register rcx, Register rdx, Register tmp);
1142 #endif // !_LP64
1143 
1144 private:
1145 
1146   // These are private because users should be using movflt/movdbl instead.
1147 
1148   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1149   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1150   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1151   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1152 
1153   void movlpd(XMMRegister dst, Address        src) {Assembler::movlpd(dst, src); }
1154   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1155 
1156 public:
1157 
1158   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1159   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1160   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1161 
1162   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1163   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1164   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1165 
1166   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1167   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1168   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1169 
1170   using Assembler::vbroadcastsd;
1171   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1172 
1173   using Assembler::vbroadcastss;
1174   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1175 
1176   // Vector float blend
1177   void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1178   void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
1179 
1180   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1181   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1182   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1183 
1184   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1185   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1186   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1187 
1188   // Move Unaligned Double Quadword
1189   void movdqu(Address     dst, XMMRegister    src);
1190   void movdqu(XMMRegister dst, XMMRegister    src);
1191   void movdqu(XMMRegister dst, Address        src);
1192   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1193 
1194   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1195   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1196   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1197   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1198   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1199   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1200 
1201   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1202   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1203   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1204   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1205   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1206   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1207 
1208   // Safe mask move: lowers to 16-bit moves on targets that support only the
1209   // AVX512F feature, and to 64-bit moves on targets that also support AVX512BW.
1210   void kmov(Address  dst, KRegister src);
1211   void kmov(KRegister dst, Address src);
1212   void kmov(KRegister dst, KRegister src);
1213   void kmov(Register dst, KRegister src);
1214   void kmov(KRegister dst, Register src);
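  // A minimal sketch of how this dispatch could look (an assumption for
  // illustration only; the actual definitions live in macroAssembler_x86.cpp):
  //
  //   void MacroAssembler::kmov(KRegister dst, Register src) {
  //     if (VM_Version::supports_avx512bw()) {
  //       kmovql(dst, src);   // 64-bit mask move
  //     } else {
  //       kmovwl(dst, src);   // 16-bit mask move (AVX512F baseline)
  //     }
  //   }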
1215 
1216   using Assembler::movddup;
1217   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1218 
1219   using Assembler::vmovddup;
1220   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1221 
1222   // AVX Unaligned forms
1223   void vmovdqu(Address     dst, XMMRegister    src);
1224   void vmovdqu(XMMRegister dst, Address        src);
1225   void vmovdqu(XMMRegister dst, XMMRegister    src);
1226   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1227   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1228 
1229   // AVX512 Unaligned
1230   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1231   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1232 
1233   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1234   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1235 
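  // The masked register-to-register forms below elide the move when it would be
  // a no-op, i.e. when dst and src are the same register and no mask (k0) is used.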
1236   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1237     if (dst->encoding() != src->encoding() || mask != k0)  {
1238       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1239     }
1240   }
1241   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1242   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1243   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1244 
1245   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1246   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1247 
1248   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1249     if (dst->encoding() != src->encoding() || mask != k0) {
1250       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1251     }
1252   }
1253   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1254   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1255   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1256 
1257   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1258      if (dst->encoding() != src->encoding()) {
1259        Assembler::evmovdqul(dst, src, vector_len);
1260      }
1261   }
1262   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1263   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1264 
1265   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1266     if (dst->encoding() != src->encoding() || mask != k0)  {
1267       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1268     }
1269   }
1270   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1271   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1272   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1273 
1274   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1275     if (dst->encoding() != src->encoding()) {
1276       Assembler::evmovdquq(dst, src, vector_len);
1277     }
1278   }
1279   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1280   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1281   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1282 
1283   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1284     if (dst->encoding() != src->encoding() || mask != k0) {
1285       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1286     }
1287   }
1288   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1289   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1290   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1291 
1292   // Move Aligned Double Quadword
1293   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1294   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1295   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1296 
1297   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1298   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1299   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1300   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1301 
1302   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1303   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1304   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1305 
1306   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1307   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1308   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1309 
1310   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1311   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1312   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1313 
1314   // Carry-Less Multiplication Quadword
1315   void pclmulldq(XMMRegister dst, XMMRegister src) {
1316     // 0x00 - multiply lower 64 bits [0:63]
1317     Assembler::pclmulqdq(dst, src, 0x00);
1318   }
1319   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1320     // 0x11 - multiply upper 64 bits [64:127]
1321     Assembler::pclmulqdq(dst, src, 0x11);
1322   }
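  // In the pclmulqdq imm8 selector, bit 0 picks the qword of the first operand
  // (0 = bits [63:0], 1 = bits [127:64]) and bit 4 picks the qword of the second
  // operand; 0x00 and 0x11 thus multiply the two low halves and the two high
  // halves, respectively.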
1323 
1324   void pcmpeqb(XMMRegister dst, XMMRegister src);
1325   void pcmpeqw(XMMRegister dst, XMMRegister src);
1326 
1327   void pcmpestri(XMMRegister dst, Address src, int imm8);
1328   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1329 
1330   void pmovzxbw(XMMRegister dst, XMMRegister src);
1331   void pmovzxbw(XMMRegister dst, Address src);
1332 
1333   void pmovmskb(Register dst, XMMRegister src);
1334 
1335   void ptest(XMMRegister dst, XMMRegister src);
1336 
1337   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1338   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1339   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1340 
1341   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1342   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1343   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1344 
1345   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1346   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1347   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1348 
1349   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1350   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1351   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1352 
1353   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1354   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1355   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1356 
1357   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1358   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1359   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1360 
1361   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1362   void xorpd(XMMRegister dst, XMMRegister    src);
1363   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1364   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1365 
1366   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1367   void xorps(XMMRegister dst, XMMRegister    src);
1368   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1369   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1370 
1371   // Shuffle Bytes
1372   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1373   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
1374   void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1375   // AVX 3-operand instructions
1376 
1377   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1378   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1379   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1380 
1381   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1382   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1383   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1384 
1385   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1386   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1387 
1388   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1389   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1390   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1391 
1392   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1393   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1394 
1395   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1396   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1397   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1398 
1399   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1400   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1401   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1402 
1403   using Assembler::vpbroadcastd;
1404   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1405 
1406   using Assembler::vpbroadcastq;
1407   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1408 
1409   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1410 
1411   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1412   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1413 
1414   // Vector compares
1415   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1416     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1417   }
1418   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1419 
1420   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1421     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1422   }
1423   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1424 
1425   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1426     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1427   }
1428   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1429 
1430   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1431     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1432   }
1433   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1434 
1435   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1436 
1437   // Emit comparison instruction for the specified comparison predicate.
1438   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1439   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1440 
1441   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1442   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1443 
1444   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1445 
1446   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1447   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1448 
1449   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1450   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1451   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1452 
1453   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1454   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1455 
1456   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1457   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1458 
1459   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1460   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1461 
1462   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1463   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1464 
1465   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1466     if (!is_varshift) {
1467       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1468     } else {
1469       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1470     }
1471   }
1472   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1473     if (!is_varshift) {
1474       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1475     } else {
1476       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1477     }
1478   }
1479   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1480     if (!is_varshift) {
1481       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1482     } else {
1483       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1484     }
1485   }
1486   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1487     if (!is_varshift) {
1488       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1489     } else {
1490       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1491     }
1492   }
1493   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1494     if (!is_varshift) {
1495       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1496     } else {
1497       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1498     }
1499   }
1500   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1501     if (!is_varshift) {
1502       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1503     } else {
1504       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1505     }
1506   }
1507   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1508     if (!is_varshift) {
1509       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1510     } else {
1511       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1512     }
1513   }
1514   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1515     if (!is_varshift) {
1516       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1517     } else {
1518       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1519     }
1520   }
1521   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1522     if (!is_varshift) {
1523       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1524     } else {
1525       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1526     }
1527   }
1528 
1529   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1530   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1531   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1532   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1533 
1534   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1535   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1536 
1537   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1538   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1539 
1540   void vptest(XMMRegister dst, XMMRegister src);
1541   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1542 
1543   void punpcklbw(XMMRegister dst, XMMRegister src);
1544   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1545 
1546   void pshufd(XMMRegister dst, Address src, int mode);
1547   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1548 
1549   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1550   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1551 
1552   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1553   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1554   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1555 
1556   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1557   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1558   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1559 
1560   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1561 
1562   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1563   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1564   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1565 
1566   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1567   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1568   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1569 
1570   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1571   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1572   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1573 
1574   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1575   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1576   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1577 
1578   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1579   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1580   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1581 
1582   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1583   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1584   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1585 
1586   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1587   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1588 
1589   // AVX Vector instructions
1590 
1591   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1592   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1593   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1594 
1595   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1596   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1597   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1598 
1599   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1600     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1601       Assembler::vpxor(dst, nds, src, vector_len);
1602     else
1603       Assembler::vxorpd(dst, nds, src, vector_len);
1604   }
1605   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1606     if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
1607       Assembler::vpxor(dst, nds, src, vector_len);
1608     else
1609       Assembler::vxorpd(dst, nds, src, vector_len);
1610   }
1611   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1612 
1613   // Simple version for AVX2 256bit vectors
1614   void vpxor(XMMRegister dst, XMMRegister src) {
1615     assert(UseAVX >= 2, "Should be at least AVX2");
1616     Assembler::vpxor(dst, dst, src, AVX_256bit);
1617   }
1618   void vpxor(XMMRegister dst, Address src) {
1619     assert(UseAVX >= 2, "Should be at least AVX2");
1620     Assembler::vpxor(dst, dst, src, AVX_256bit);
1621   }
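  // A common idiom (illustrative): vpxor(xmm0, xmm0) emits vpxor ymm0, ymm0, ymm0,
  // zeroing the full 256-bit register, since any value xor'ed with itself is zero
  // (only valid when UseAVX >= 2, as the assert above enforces).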
1622 
1623   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1624   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1625 
1626   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1627     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1628       Assembler::vinserti32x4(dst, nds, src, imm8);
1629     } else if (UseAVX > 1) {
1630       // vinserti128 is available only in AVX2
1631       Assembler::vinserti128(dst, nds, src, imm8);
1632     } else {
1633       Assembler::vinsertf128(dst, nds, src, imm8);
1634     }
1635   }
1636 
1637   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1638     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1639       Assembler::vinserti32x4(dst, nds, src, imm8);
1640     } else if (UseAVX > 1) {
1641       // vinserti128 is available only in AVX2
1642       Assembler::vinserti128(dst, nds, src, imm8);
1643     } else {
1644       Assembler::vinsertf128(dst, nds, src, imm8);
1645     }
1646   }
1647 
1648   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1649     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1650       Assembler::vextracti32x4(dst, src, imm8);
1651     } else if (UseAVX > 1) {
1652       // vextracti128 is available only in AVX2
1653       Assembler::vextracti128(dst, src, imm8);
1654     } else {
1655       Assembler::vextractf128(dst, src, imm8);
1656     }
1657   }
1658 
1659   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1660     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1661       Assembler::vextracti32x4(dst, src, imm8);
1662     } else if (UseAVX > 1) {
1663       // vextracti128 is available only in AVX2
1664       Assembler::vextracti128(dst, src, imm8);
1665     } else {
1666       Assembler::vextractf128(dst, src, imm8);
1667     }
1668   }
1669 
1670   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1671   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1672     vinserti128(dst, dst, src, 1);
1673   }
1674   void vinserti128_high(XMMRegister dst, Address src) {
1675     vinserti128(dst, dst, src, 1);
1676   }
1677   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1678     vextracti128(dst, src, 1);
1679   }
1680   void vextracti128_high(Address dst, XMMRegister src) {
1681     vextracti128(dst, src, 1);
1682   }
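  // Illustrative use: a horizontal reduction over a 256-bit vector can first fold
  // the upper lane onto the lower one, e.g. vextracti128_high(xmm1, xmm0) followed
  // by a 128-bit combine of xmm1 and xmm0 (register choices are hypothetical).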
1683 
1684   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1685     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1686       Assembler::vinsertf32x4(dst, dst, src, 1);
1687     } else {
1688       Assembler::vinsertf128(dst, dst, src, 1);
1689     }
1690   }
1691 
1692   void vinsertf128_high(XMMRegister dst, Address src) {
1693     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1694       Assembler::vinsertf32x4(dst, dst, src, 1);
1695     } else {
1696       Assembler::vinsertf128(dst, dst, src, 1);
1697     }
1698   }
1699 
1700   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1701     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1702       Assembler::vextractf32x4(dst, src, 1);
1703     } else {
1704       Assembler::vextractf128(dst, src, 1);
1705     }
1706   }
1707 
1708   void vextractf128_high(Address dst, XMMRegister src) {
1709     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1710       Assembler::vextractf32x4(dst, src, 1);
1711     } else {
1712       Assembler::vextractf128(dst, src, 1);
1713     }
1714   }
1715 
1716   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1717   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1718     Assembler::vinserti64x4(dst, dst, src, 1);
1719   }
1720   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1721     Assembler::vinsertf64x4(dst, dst, src, 1);
1722   }
1723   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1724     Assembler::vextracti64x4(dst, src, 1);
1725   }
1726   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1727     Assembler::vextractf64x4(dst, src, 1);
1728   }
1729   void vextractf64x4_high(Address dst, XMMRegister src) {
1730     Assembler::vextractf64x4(dst, src, 1);
1731   }
1732   void vinsertf64x4_high(XMMRegister dst, Address src) {
1733     Assembler::vinsertf64x4(dst, dst, src, 1);
1734   }
1735 
1736   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1737   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1738     vinserti128(dst, dst, src, 0);
1739   }
1740   void vinserti128_low(XMMRegister dst, Address src) {
1741     vinserti128(dst, dst, src, 0);
1742   }
1743   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1744     vextracti128(dst, src, 0);
1745   }
1746   void vextracti128_low(Address dst, XMMRegister src) {
1747     vextracti128(dst, src, 0);
1748   }
1749 
1750   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1751     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1752       Assembler::vinsertf32x4(dst, dst, src, 0);
1753     } else {
1754       Assembler::vinsertf128(dst, dst, src, 0);
1755     }
1756   }
1757 
1758   void vinsertf128_low(XMMRegister dst, Address src) {
1759     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1760       Assembler::vinsertf32x4(dst, dst, src, 0);
1761     } else {
1762       Assembler::vinsertf128(dst, dst, src, 0);
1763     }
1764   }
1765 
1766   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1767     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1768       Assembler::vextractf32x4(dst, src, 0);
1769     } else {
1770       Assembler::vextractf128(dst, src, 0);
1771     }
1772   }
1773 
1774   void vextractf128_low(Address dst, XMMRegister src) {
1775     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1776       Assembler::vextractf32x4(dst, src, 0);
1777     } else {
1778       Assembler::vextractf128(dst, src, 0);
1779     }
1780   }
1781 
1782   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1783   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1784     Assembler::vinserti64x4(dst, dst, src, 0);
1785   }
1786   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1787     Assembler::vinsertf64x4(dst, dst, src, 0);
1788   }
1789   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1790     Assembler::vextracti64x4(dst, src, 0);
1791   }
1792   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1793     Assembler::vextractf64x4(dst, src, 0);
1794   }
1795   void vextractf64x4_low(Address dst, XMMRegister src) {
1796     Assembler::vextractf64x4(dst, src, 0);
1797   }
1798   void vinsertf64x4_low(XMMRegister dst, Address src) {
1799     Assembler::vinsertf64x4(dst, dst, src, 0);
1800   }
1801 
1802   // Carry-Less Multiplication Quadword
1803   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1804     // 0x00 - multiply lower 64 bits [0:63]
1805     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1806   }
1807   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1808     // 0x11 - multiply upper 64 bits [64:127]
1809     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1810   }
1811   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1812     // 0x10 - multiply nds[0:63] and src[64:127]
1813     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1814   }
1815   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1816     //0x01 - multiply nds[64:127] and src[0:63]
1817     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1818   }
1819 
1820   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1821     // 0x00 - multiply lower 64 bits [0:63]
1822     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1823   }
1824   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1825     // 0x11 - multiply upper 64 bits [64:127]
1826     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1827   }
1828 
1829   // AVX-512 mask operations.
1830   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1831   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1832   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1833   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1834   void kortest(uint masklen, KRegister src1, KRegister src2);
1835   void ktest(uint masklen, KRegister src1, KRegister src2);
1836 
1837   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1838   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1839 
1840   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1841   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1842 
1843   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1844   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1845 
1846   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1847   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1848 
1849   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1850   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1851   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1852   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1853 
1854   using Assembler::evpandq;
1855   void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1856 
1857   using Assembler::evpaddq;
1858   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1859 
1860   using Assembler::evporq;
1861   void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1862 
1863   using Assembler::vpshufb;
1864   void vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1865 
1866   using Assembler::vpternlogq;
1867   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
1868 
1869   void cmov32( Condition cc, Register dst, Address  src);
1870   void cmov32( Condition cc, Register dst, Register src);
1871 
1872   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1873 
1874   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1875   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1876 
1877   void movoop(Register dst, jobject obj);
1878   void movoop(Address  dst, jobject obj, Register rscratch);
1879 
1880   void mov_metadata(Register dst, Metadata* obj);
1881   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
1882 
1883   void movptr(Register     dst, Register       src);
1884   void movptr(Register     dst, Address        src);
1885   void movptr(Register     dst, AddressLiteral src);
1886   void movptr(Register     dst, ArrayAddress   src);
1887   void movptr(Register     dst, intptr_t       src);
1888   void movptr(Address      dst, Register       src);
1889   void movptr(Address      dst, int32_t        imm);
1890   void movptr(Address      dst, intptr_t       src, Register rscratch);
1891   void movptr(ArrayAddress dst, Register       src, Register rscratch);
1892 
1893   void movptr(Register dst, RegisterOrConstant src) {
1894     if (src.is_constant()) movptr(dst, src.as_constant());
1895     else                   movptr(dst, src.as_register());
1896   }
1897 
1898 
1899   // to avoid hiding movl
1900   void mov32(Register       dst, AddressLiteral src);
1901   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
1902 
1903   // Import other mov() methods from the parent class or else
1904   // they will be hidden by the following overriding declarations.
1905   using Assembler::movdl;
1906   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1907 
1908   using Assembler::movq;
1909   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1910 
1911   // Can push value or effective address
1912   void pushptr(AddressLiteral src, Register rscratch);
1913 
1914   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
1915   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
1916 
1917   void pushoop(jobject obj, Register rscratch);
1918   void pushklass(Metadata* obj, Register rscratch);
1919 
1920   // sign-extend a 32-bit ('l') value to a pointer-sized element as needed
1921   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
1922   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
1923 
1924 
1925  public:
1926   // Inline type specific methods
1927   #include "asm/macroAssembler_common.hpp"
1928 
1929   int store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter = true);
1930   bool move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]);
1931   bool unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
1932                             VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
1933                             RegState reg_state[]);
1934   bool pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
1935                           VMRegPair* from, int from_count, int& from_index, VMReg to,
1936                           RegState reg_state[], Register val_array);
1937   int extend_stack_for_inline_args(int args_on_stack);
1938   void remove_frame(int initial_framesize, bool needs_stack_repair);
1939   VMReg spill_reg_for(VMReg reg);
1940 
1941   // clear memory of size 'cnt' qwords, starting at 'base';
1942   // if 'is_large' is set, do not try to produce a short loop
1943   void clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only, KRegister mask=knoreg);
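  // Illustrative call (register choices are hypothetical):
  //   clear_mem(rdi /*base*/, rcx /*cnt, in qwords*/, rax /*val*/, xmm0 /*xtmp*/,
  //             /*is_large=*/false, /*word_copy_only=*/false);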
1944 
1945   // clear memory initialization sequence for a constant size
1946   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1947 
1948   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1949   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1950 
1951   // Fill primitive arrays
1952   void generate_fill(BasicType t, bool aligned,
1953                      Register to, Register value, Register count,
1954                      Register rtmp, XMMRegister xtmp);
1955 
1956   void encode_iso_array(Register src, Register dst, Register len,
1957                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1958                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1959 
1960 #ifdef _LP64
1961   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1962   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1963                              Register y, Register y_idx, Register z,
1964                              Register carry, Register product,
1965                              Register idx, Register kdx);
1966   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1967                               Register yz_idx, Register idx,
1968                               Register carry, Register product, int offset);
1969   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1970                                     Register carry, Register carry2,
1971                                     Register idx, Register jdx,
1972                                     Register yz_idx1, Register yz_idx2,
1973                                     Register tmp, Register tmp3, Register tmp4);
1974   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
1975                                Register yz_idx, Register idx, Register jdx,
1976                                Register carry, Register product,
1977                                Register carry2);
1978   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
1979                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
1980   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
1981                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1982   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
1983                             Register tmp2);
1984   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
1985                        Register rdxReg, Register raxReg);
1986   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
1987   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1988                        Register tmp3, Register tmp4);
1989   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1990                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1991 
1992   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
1993                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1994                Register raxReg);
1995   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
1996                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1997                Register raxReg);
1998   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
1999                            Register result, Register tmp1, Register tmp2,
2000                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
2001 #endif
2002 
2003   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
2004   void update_byte_crc32(Register crc, Register val, Register table);
2005   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
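  // Conceptually, update_byte_crc32 performs the classic table-driven update
  //   crc = table[(crc ^ val) & 0xFF] ^ (crc >> 8)
  // and kernel_crc32 applies that computation across a buffer of 'len' bytes.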
2006 
2007 
2008 #ifdef _LP64
2009   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
2010   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
2011                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
2012                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
2013 #endif // _LP64
2014 
2015   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
2016   // Note on a naming convention:
2017   // Prefix w = register only used on a Westmere+ architecture
2018   // Prefix n = register only used on a Nehalem architecture
2019 #ifdef _LP64
2020   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2021                        Register tmp1, Register tmp2, Register tmp3);
2022 #else
2023   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2024                        Register tmp1, Register tmp2, Register tmp3,
2025                        XMMRegister xtmp1, XMMRegister xtmp2);
2026 #endif
2027   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
2028                         Register in_out,
2029                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
2030                         XMMRegister w_xtmp2,
2031                         Register tmp1,
2032                         Register n_tmp2, Register n_tmp3);
2033   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
2034                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2035                        Register tmp1, Register tmp2,
2036                        Register n_tmp3);
2037   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
2038                          Register in_out1, Register in_out2, Register in_out3,
2039                          Register tmp1, Register tmp2, Register tmp3,
2040                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2041                          Register tmp4, Register tmp5,
2042                          Register n_tmp6);
2043   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
2044                             Register tmp1, Register tmp2, Register tmp3,
2045                             Register tmp4, Register tmp5, Register tmp6,
2046                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2047                             bool is_pclmulqdq_supported);
2048   // Fold 128-bit data chunk
2049   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
2050   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
2051 #ifdef _LP64
2052   // Fold 512-bit data chunk
2053   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
2054 #endif // _LP64
2055   // Fold 8-bit data
2056   void fold_8bit_crc32(Register crc, Register table, Register tmp);
2057   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
2058 
2059   // Compress char[] array to byte[].
2060   void char_array_compress(Register src, Register dst, Register len,
2061                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
2062                            XMMRegister tmp4, Register tmp5, Register result,
2063                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
2064 
2065   // Inflate byte[] array to char[].
2066   void byte_array_inflate(Register src, Register dst, Register len,
2067                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
2068 
2069   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
2070                    Register length, Register temp, int vec_enc);
2071 
2072   void fill64_masked(uint shift, Register dst, int disp,
2073                          XMMRegister xmm, KRegister mask, Register length,
2074                          Register temp, bool use64byteVector = false);
2075 
2076   void fill32_masked(uint shift, Register dst, int disp,
2077                          XMMRegister xmm, KRegister mask, Register length,
2078                          Register temp);
2079 
2080   void fill32(Address dst, XMMRegister xmm);
2081 
2082   void fill32(Register dst, int disp, XMMRegister xmm);
2083 
2084   void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
2085 
2086   void fill64(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
2087 
2088 #ifdef _LP64
2089   void convert_f2i(Register dst, XMMRegister src);
2090   void convert_d2i(Register dst, XMMRegister src);
2091   void convert_f2l(Register dst, XMMRegister src);
2092   void convert_d2l(Register dst, XMMRegister src);
2093   void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
2094   void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);
2095 
2096   void cache_wb(Address line);
2097   void cache_wbsync(bool is_pre);
2098 
2099 #ifdef COMPILER2_OR_JVMCI
2100   void generate_fill_avx3(BasicType type, Register to, Register value,
2101                           Register count, Register rtmp, XMMRegister xtmp);
2102 #endif // COMPILER2_OR_JVMCI
2103 #endif // _LP64
2104 
2105   void vallones(XMMRegister dst, int vector_len);
2106 
2107   void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);
2108 
2109   void lightweight_lock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow);
2110   void lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow);
2111 };
2112 
2113 /**
2114  * class SkipIfEqual:
2115  *
2116  * Instantiating this class emits assembly code that jumps over any code
2117  * generated between the creation of the instance and its automatic destruction
2118  * at the end of the scope block, depending on the value of the flag passed to
2119  * the constructor, which is checked at run-time.
2120  */
2121 class SkipIfEqual {
2122  private:
2123   MacroAssembler* _masm;
2124   Label _label;
2125 
2126  public:
2127    SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value, Register rscratch);
2128    ~SkipIfEqual();
2129 };
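// A minimal usage sketch (illustrative; the flag name and scratch register are
// hypothetical):
//
//   {
//     SkipIfEqual skip(masm, &SomeBoolFlag, /*value=*/false, rscratch1);
//     // ... code emitted here is jumped over at run-time when SomeBoolFlag == false ...
//   } // the destructor binds the label that the conditional jump targets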
2130 
2131 #endif // CPU_X86_MACROASSEMBLER_X86_HPP