/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_MACROASSEMBLER_X86_HPP
#define CPU_X86_MACROASSEMBLER_X86_HPP

#include "asm/assembler.hpp"
#include "asm/register.hpp"
#include "code/vmreg.inline.hpp"
#include "compiler/oopMap.hpp"
#include "utilities/macros.hpp"
#include "runtime/rtmLocking.hpp"
#include "runtime/signature.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/checkedCast.hpp"

class ciInlineKlass;

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;
  friend class Runtime1;      // as_Address()

 public:
  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments        // the number of arguments to pop after the call
  );

 protected:
  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register java_thread,              // the thread if computed before     ; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );
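
  // Illustrative sketch (not part of the interface): a typical use of call_VM_base,
  // assuming a hypothetical runtime entry point `entry` that takes the thread plus one
  // Java argument already pushed by the caller, with the oop result expected in rax:
  //
  //   call_VM_base(rax,    // oop_result
  //                noreg,  // java_thread: let call_VM_base compute/load it
  //                noreg,  // last_java_sp: default to rsp
  //                entry,
  //                1,      // arguments (w/o thread) to pop after the call
  //                true);  // check for pending exceptions on return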

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

  // helpers for FPU flag access
  // tmp is a temporary register, if none is available use noreg
  void save_rax   (Register tmp);
  void restore_rax(Register tmp);

 public:
  MacroAssembler(CodeBuffer* code) : Assembler(code) {}

  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe(Register java_thread);
  virtual void check_and_handle_earlyret(Register java_thread);

  Address as_Address(AddressLiteral adr);
  Address as_Address(ArrayAddress adr, Register rscratch);

  // Support for null-checks
  //
  // Generates code that causes a null OS exception if the content of reg is null.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
  static bool uses_implicit_null_check(void* address);
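
  // Illustrative sketch (assumption, not from this file): a load that relies on the
  // implicit-null-check machinery above. If the field offset is small enough that
  // needs_explicit_null_check(offset) is false, null_check emits no code and the
  // SIGSEGV raised by the subsequent access is turned into a NullPointerException:
  //
  //   null_check(rdx, offset);            // no-op for small, known offsets
  //   movl(rax, Address(rdx, offset));    // faulting access covered by the check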

  // markWord tests, kills markWord reg
  void test_markword_is_inline_type(Register markword, Label& is_inline_type);

  // inlineKlass queries, kills temp_reg
  void test_klass_is_inline_type(Register klass, Register temp_reg, Label& is_inline_type);
  void test_klass_is_empty_inline_type(Register klass, Register temp_reg, Label& is_empty_inline_type);
  void test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type);

  // Get the default value oop for the given InlineKlass
  void get_default_value_oop(Register inline_klass, Register temp_reg, Register obj);
  // The empty value oop, for the given InlineKlass ("empty" as in no instance fields)
  // get_default_value_oop with extra assertion for empty inline klass
  void get_empty_inline_type_oop(Register inline_klass, Register temp_reg, Register obj);

  void test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free);
  void test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free);
  void test_field_is_flat(Register flags, Register temp_reg, Label& is_flat);

  // Check oops for special arrays, i.e. flat arrays and/or null-free arrays
  void test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label);
  void test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array);
  void test_non_flat_array_oop(Register oop, Register temp_reg, Label& is_non_flat_array);
  void test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array);
  void test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array);

  // Check array klass layout helper for flat or null-free arrays...
  void test_flat_array_layout(Register lh, Label& is_flat_array);
  void test_non_flat_array_layout(Register lh, Label& is_non_flat_array);
  void test_null_free_array_layout(Register lh, Label& is_null_free_array);
  void test_non_null_free_array_layout(Register lh, Label& is_non_null_free_array);

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  void pd_patch_instruction(address branch, address target, const char* file, int line) {
    unsigned char op = branch[0];
    assert(op == 0xE8 /* call */ ||
        op == 0xE9 /* jmp */ ||
        op == 0xEB /* short jmp */ ||
        (op & 0xF0) == 0x70 /* short jcc */ ||
        op == 0x0F && (branch[1] & 0xF0) == 0x80 /* jcc */ ||
        op == 0xC7 && branch[1] == 0xF8 /* xbegin */,
        "Invalid opcode at patch point");

    if (op == 0xEB || (op & 0xF0) == 0x70) {
      // short offset operators (jmp and jcc)
      char* disp = (char*) &branch[1];
      int imm8 = checked_cast<int>(target - (address) &disp[1]);
      guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
                file == nullptr ? "<null>" : file, line);
      *disp = (char)imm8;
    } else {
      int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
      int imm32 = checked_cast<int>(target - (address) &disp[1]);
      *disp = imm32;
    }
  }
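
  // Worked example (illustrative only): for a rel32 `jmp` (opcode 0xE9) the displacement
  // is relative to the end of the 5-byte instruction, so the patched value is
  //   disp32 = target - (branch + 5)
  // which is exactly what the `target - (address) &disp[1]` computation above produces.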

  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value

  void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
  void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
  void increment(Address dst, int value = 1)  { LP64_ONLY(incrementq(dst, value)) NOT_LP64(incrementl(dst, value)) ; }
  void decrement(Address dst, int value = 1)  { LP64_ONLY(decrementq(dst, value)) NOT_LP64(decrementl(dst, value)) ; }
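
  // Usage sketch (assumption): these pick the pointer-sized flavor for the platform, so a
  // typical counter bump in generated code is simply
  //
  //   increment(Address(rbase, counter_offset));   // incrementq on LP64, incrementl otherwise
  //
  // where `rbase` and `counter_offset` are placeholders for a real base register and offset.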

  void decrementl(Address dst, int value = 1);
  void decrementl(Register reg, int value = 1);

  void decrementq(Register reg, int value = 1);
  void decrementq(Address dst, int value = 1);

  void incrementl(Address dst, int value = 1);
  void incrementl(Register reg, int value = 1);

  void incrementq(Register reg, int value = 1);
  void incrementq(Address dst, int value = 1);

  void incrementl(AddressLiteral dst, Register rscratch = noreg);
  void incrementl(ArrayAddress   dst, Register rscratch);

  void incrementq(AddressLiteral dst, Register rscratch = noreg);

  // Support optimal SSE move instructions.
  void movflt(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
    else                       { movss (dst, src); return; }
  }
  void movflt(XMMRegister dst, Address src) { movss(dst, src); }
  void movflt(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
  void movflt(Address dst, XMMRegister src) { movss(dst, src); }

  // Move with zero extension
  void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }

  void movdbl(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
    else                       { movsd (dst, src); return; }
  }

  void movdbl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);

  void movdbl(XMMRegister dst, Address src) {
    if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
    else                         { movlpd(dst, src); return; }
  }
  void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }

  void flt_to_flt16(Register dst, XMMRegister src, XMMRegister tmp) {
    // Use a separate tmp XMM register because the caller may
    // require the src XMM register to be unchanged (as in x86.ad).
    vcvtps2ph(tmp, src, 0x04, Assembler::AVX_128bit);
    movdl(dst, tmp);
    movswl(dst, dst);
  }

  void flt16_to_flt(XMMRegister dst, Register src) {
    movdl(dst, src);
    vcvtph2ps(dst, dst, Assembler::AVX_128bit);
  }

  // Alignment
  void align32();
  void align64();
  void align(int modulus);
  void align(int modulus, int target);

  void post_call_nop();
  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  void fat_nop();

  // Stack frame creation/removal
  void enter();
  void leave();

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
  // The pointer will be loaded into the thread register.
  void get_thread(Register thread);

#ifdef _LP64
  // Support for argument shuffling

  // bias in bytes
  void move32_64(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void long_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void float_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void double_move(VMRegPair src, VMRegPair dst, Register tmp = rax, int in_stk_bias = 0, int out_stk_bias = 0);
  void move_ptr(VMRegPair src, VMRegPair dst);
  void object_move(OopMap* map,
                   int oop_handle_offset,
                   int framesize_in_slots,
                   VMRegPair src,
                   VMRegPair dst,
                   bool is_receiver,
                   int* receiver_offset);
#endif // _LP64

  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is set up correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);
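
  // Illustrative sketch (not from this file): a typical one-argument VM call from generated
  // code, assuming a hypothetical runtime entry `SomeRuntime::resolve` and the argument in
  // a caller-chosen register `rarg`:
  //
  //   call_VM(rax,                                             // oop result, if any
  //           CAST_FROM_FN_PTR(address, SomeRuntime::resolve),
  //           rarg);                                           // arg_1; exceptions checked by default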

  void get_vm_result  (Register oop_result, Register thread);
  void get_vm_result_2(Register metadata_result, Register thread);

  // These always tightly bind to MacroAssembler::call_VM_base
  // bypassing the virtual implementation
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf0(address entry_point);
  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);

  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // These always tightly bind to MacroAssembler::call_VM_leaf_base
  // bypassing the virtual implementation
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // last Java Frame (fills frame anchor)
  void set_last_Java_frame(Register thread,
                           Register last_java_sp,
                           Register last_java_fp,
                           address  last_java_pc,
                           Register rscratch);

  // thread in the default location (r15_thread on 64bit)
  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address  last_java_pc,
                           Register rscratch);

  void reset_last_Java_frame(Register thread, bool clear_fp);

  // thread in the default location (r15_thread on 64bit)
  void reset_last_Java_frame(bool clear_fp);
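
  // Usage sketch (assumption): stubs bracket a VM transition by publishing and then clearing
  // the frame anchor, e.g. with the thread in its default location:
  //
  //   set_last_Java_frame(rsp, noreg, the_pc, rscratch1);   // publish sp/pc before the call
  //   ... call into the VM ...
  //   reset_last_Java_frame(true /* clear_fp */);           // tear the anchor back down
  //
  // `the_pc` and `rscratch1` stand in for a real return pc and a caller-chosen scratch register.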

  // jobjects
  void clear_jobject_tag(Register possibly_non_local);
  void resolve_jobject(Register value, Register thread, Register tmp);
  void resolve_global_jobject(Register value, Register thread, Register tmp);

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  // C++ bool manipulation

  void movbool(Register dst, Address src);
  void movbool(Address dst, bool boolconst);
  void movbool(Address dst, Register src);
  void testbool(Register dst);

  void resolve_oop_handle(Register result, Register tmp);
  void resolve_weak_handle(Register result, Register tmp);
  void load_mirror(Register mirror, Register method, Register tmp);
  void load_method_holder_cld(Register rresult, Register rmethod);

  void load_method_holder(Register holder, Register method);

  // oop manipulations
  void load_metadata(Register dst, Register src);
  void load_klass(Register dst, Register src, Register tmp);
  void store_klass(Register dst, Register src, Register tmp);

  void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                      Register tmp1, Register thread_tmp);
  void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
                       Register tmp1, Register tmp2, Register tmp3);

  void access_value_copy(DecoratorSet decorators, Register src, Register dst, Register inline_klass);

  // inline type data payload offsets...
  void first_field_offset(Register inline_klass, Register offset);
  void data_for_oop(Register oop, Register data, Register inline_klass);
  // get data payload ptr of a flat value array at index, kills rcx and index
  void data_for_value_array_index(Register array, Register array_klass,
                                  Register index, Register data);


  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
                     Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
                              Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void store_heap_oop(Address dst, Register val, Register tmp1 = noreg,
                      Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);

  // Used for storing null. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst);
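
  // Illustrative sketch (assumption): a GC-aware oop field store and reload through the
  // barrier-set-backed accessors above, with `robj`, `rval`, `field_offset` and the temps
  // standing in for concrete registers and an offset chosen by the caller:
  //
  //   store_heap_oop(Address(robj, field_offset), rval, tmp1, tmp2, tmp3);
  //   load_heap_oop (rax, Address(robj, field_offset));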

  void load_prototype_header(Register dst, Register src, Register tmp);

#ifdef _LP64
  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like null) into a Register by giving
  // the compiler two choices it can't resolve

  void store_heap_oop(Address dst, void* dummy);

  void encode_heap_oop(Register r);
  void decode_heap_oop(Register r);
  void encode_heap_oop_not_null(Register r);
  void decode_heap_oop_not_null(Register r);
  void encode_heap_oop_not_null(Register dst, Register src);
  void decode_heap_oop_not_null(Register dst, Register src);

  void set_narrow_oop(Register dst, jobject obj);
  void set_narrow_oop(Address dst, jobject obj);
  void cmp_narrow_oop(Register dst, jobject obj);
  void cmp_narrow_oop(Address dst, jobject obj);

  void encode_klass_not_null(Register r, Register tmp);
  void decode_klass_not_null(Register r, Register tmp);
  void encode_and_move_klass_not_null(Register dst, Register src);
  void decode_and_move_klass_not_null(Register dst, Register src);
  void set_narrow_klass(Register dst, Klass* k);
  void set_narrow_klass(Address dst, Klass* k);
  void cmp_narrow_klass(Register dst, Klass* k);
  void cmp_narrow_klass(Address dst, Klass* k);

  // if heap base register is used - reinit it with the correct value
  void reinit_heapbase();

  DEBUG_ONLY(void verify_heapbase(const char* msg);)

#endif // _LP64

  // Int division/remainder for Java
  // (as idivl, but checks for special case as described in JVM spec.)
  // returns idivl instruction offset for implicit exception handling
  int corrected_idivl(Register reg);

  // Long division/remainder for Java
  // (as idivq, but checks for special case as described in JVM spec.)
  // returns idivq instruction offset for implicit exception handling
  int corrected_idivq(Register reg);
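
  // Why the correction is needed (illustrative note): plain idiv raises #DE for
  // min_jint / -1 (and min_jlong / -1), whereas the JVM spec requires the result to be
  // min_jint (resp. min_jlong) with a remainder of 0. The corrected_* variants test for
  // that one operand pattern and bypass the hardware divide in that case.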

  void int3();

  // Long operation macros for a 32bit cpu
  // Long negation for Java
  void lneg(Register hi, Register lo);

  // Long multiplication for Java
  // (destroys contents of eax, ebx, ecx and edx)
  void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y

  // Long shifts for Java
  // (semantics as described in JVM spec.)
  void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
  void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)

  // Long compare for Java
  // (semantics as described in JVM spec.)
  void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)


  // misc

  // Sign extension
  void sign_extend_short(Register reg);
  void sign_extend_byte(Register reg);

  // Division by power of 2, rounding towards 0
  void division_with_shift(Register reg, int shift_value);

#ifndef _LP64
  // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
  //
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
  void fcmp(Register tmp);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp(Register tmp, int index, bool pop_left, bool pop_right);

  // Floating-point comparison for Java
  // Compares the top-most stack entries on the FPU stack and stores the result in dst.
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // (semantics as described in JVM spec.)
  void fcmp2int(Register dst, bool unordered_is_less);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);

  // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
  // tmp is a temporary register, if none is available use noreg
  void fremr(Register tmp);

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
#endif // !_LP64

  // dst = c = a * b + c
  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);

  void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);


  // same as fcmp2int, but using SSE2
  void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
  void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);

  // branch to L if FPU flag C2 is set/not set
  // tmp is a temporary register, if none is available use noreg
  void jC2 (Register tmp, Label& L);
  void jnC2(Register tmp, Label& L);

  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_float(Address src);

  // Store float value to 'address'. If UseSSE >= 1, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_float(Address dst);

  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_double(Address src);

  // Store double value to 'address'. If UseSSE >= 2, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_double(Address dst);

#ifndef _LP64
  // Pop ST (ffree & fincstp combined)
  void fpop();

  void empty_FPU_stack();
#endif // !_LP64

  void push_IU_state();
  void pop_IU_state();

  void push_FPU_state();
  void pop_FPU_state();

  void push_CPU_state();
  void pop_CPU_state();

  void push_cont_fastpath();
  void pop_cont_fastpath();

  void inc_held_monitor_count();
  void dec_held_monitor_count();

  DEBUG_ONLY(void stop_if_in_cont(Register cont_reg, const char* name);)

  // Round reg up to a multiple of modulus (modulus must be a power of two)
  void round_to(Register reg, int modulus);

private:
  // General purpose and XMM registers potentially clobbered by native code; there
  // is no need for FPU or AVX opmask related methods because C1/interpreter
  // - always saves/restores the FPU state as a whole
  // - does not care about the AVX-512 opmask registers
  static RegSet call_clobbered_gp_registers();
  static XMMRegSet call_clobbered_xmm_registers();

  void push_set(XMMRegSet set, int offset);
  void pop_set(XMMRegSet set, int offset);

public:
  void push_set(RegSet set, int offset = -1);
  void pop_set(RegSet set, int offset = -1);

  // Push and pop everything that might be clobbered by a native
  // runtime call.
  // Only save the lower 64 bits of each vector register.
  // Additional registers can be excluded in a passed RegSet.
  void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
  void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);

  void push_call_clobbered_registers(bool save_fpu = true) {
    push_call_clobbered_registers_except(RegSet(), save_fpu);
  }
  void pop_call_clobbered_registers(bool restore_fpu = true) {
    pop_call_clobbered_registers_except(RegSet(), restore_fpu);
  }
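
  // Usage sketch (assumption): bracket a call to a native runtime helper that may clobber
  // the standard set of caller-saved registers, keeping rax live across it:
  //
  //   push_call_clobbered_registers_except(RegSet::of(rax));
  //   ... call the native helper ...
  //   pop_call_clobbered_registers_except(RegSet::of(rax));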

  // allocation

  // Object / value buffer allocation...
  // Allocate instance of klass, assumes klass initialized by caller
  // new_obj prefers to be rax
  // Kills t1 and t2, preserves klass, return allocation in new_obj (rsi on LP64)
  void allocate_instance(Register klass, Register new_obj,
                         Register t1, Register t2,
                         bool clear_fields, Label& alloc_failed);

  void tlab_allocate(
    Register thread,                   // Current thread
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
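
  // Illustrative sketch (assumption, LP64 where the current thread lives in r15_thread): a
  // fixed-size fast-path allocation that falls back to a caller-provided slow path; `obj`,
  // `t1`, `t2`, `instance_size_in_bytes` and `slow` stand in for real registers, a constant
  // size and a label:
  //
  //   tlab_allocate(r15_thread, obj, noreg /* size known at compile time */,
  //                 instance_size_in_bytes, t1, t2, slow);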

  // For field "index" within "klass", return inline_klass ...
  void get_inline_type_field_klass(Register klass, Register index, Register inline_klass);

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface,
                               bool return_method = true);

  void lookup_interface_method_stub(Register recv_klass,
                                    Register holder_klass,
                                    Register resolved_klass,
                                    Register method_result,
                                    Register scan_temp,
                                    Register temp_reg2,
                                    Register receiver,
                                    int itable_index,
                                    Label& L_no_such_interface);

  // virtual method calling
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be null, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);
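
  // Typical pairing (illustrative sketch; `rsub`, `rsuper`, `rtmp` and the labels stand in
  // for caller-chosen registers and labels): the fast path either decides directly or
  // branches to the slow path, which finishes the job.
  //
  //   Label L_slow;
  //   check_klass_subtype_fast_path(rsub, rsuper, rtmp, &L_ok, &L_fail, &L_slow);
  //   bind(L_slow);
  //   check_klass_subtype_slow_path(rsub, rsuper, rtmp, noreg, &L_ok, &L_fail);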

  void clinit_barrier(Register klass,
                      Register thread,
                      Label* L_fast_path = nullptr,
                      Label* L_slow_path = nullptr);

  // method handles (JSR 292)
  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

  // Debugging

  // only if +VerifyOops
  void _verify_oop(Register reg, const char* s, const char* file, int line);
  void _verify_oop_addr(Address addr, const char* s, const char* file, int line);

  void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
    if (VerifyOops) {
      _verify_oop(reg, s, file, line);
    }
  }
  void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
    if (VerifyOops) {
      _verify_oop_addr(reg, s, file, line);
    }
  }

  // TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
  void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}

#define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
#define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
#define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

  // Verify or restore cpu control state after JNI call
  void restore_cpu_control_state_after_jni(Register rscratch);

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  // prints msg and continues
  void warn(const char* msg);

  // dumps registers and other state
  void print_state();

  static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
  static void debug64(char* msg, int64_t pc, int64_t regs[]);
  static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
  static void print_state64(int64_t pc, int64_t regs[]);

  void os_breakpoint();

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "");

  void should_not_reach_here()                   { stop("should not reach here"); }

  void print_CPU_state();

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with negative offset");
    movl(Address(rsp, (-offset)), rax);
  }

  // Writes to successive stack pages until the given offset is reached, to check for
  // stack overflow + shadow pages.  Also clobbers tmp.
  void bang_stack_size(Register size, Register tmp);
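
  // Illustrative sketch (assumption): banging a fixed number of shadow pages one page at a
  // time with the helper above; `n_shadow_pages` and `page_size` stand in for the real
  // values derived from the shadow-zone size and os::vm_page_size():
  //
  //   for (int i = 1; i <= n_shadow_pages; i++) {
  //     bang_stack_with_offset(i * page_size);
  //   }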

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  void safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod);

  void verify_tlab();

  static Condition negate_condition(Condition cond);

  // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
  // operands. In general the names are modified to avoid hiding the instruction in Assembler
  // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
  // here in MacroAssembler. The major exception to this rule is call.
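  //
  // Illustrative sketch (assumption): comparing a register against a VM global through an
  // AddressLiteral wrapper, with `flag_addr` standing in for a real address and r10 used as
  // the scratch register when the literal is not reachable with a 32-bit displacement:
  //
  //   cmp32(rax, ExternalAddress(flag_addr), r10 /* rscratch */);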
 801 
 802   // Arithmetics
 803 
 804 
 805   void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
 806   void addptr(Address dst, Register src);
 807 
 808   void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
 809   void addptr(Register dst, int32_t src);
 810   void addptr(Register dst, Register src);
 811   void addptr(Register dst, RegisterOrConstant src) {
 812     if (src.is_constant()) addptr(dst, checked_cast<int>(src.as_constant()));
 813     else                   addptr(dst, src.as_register());
 814   }
 815 
 816   void andptr(Register dst, int32_t src);
 817   void andptr(Register dst, Register src) { LP64_ONLY(andq(dst, src)) NOT_LP64(andl(dst, src)) ; }
 818   void andptr(Register dst, Address src) { LP64_ONLY(andq(dst, src)) NOT_LP64(andl(dst, src)) ; }
 819 
 820 #ifdef _LP64
 821   using Assembler::andq;
 822   void andq(Register dst, AddressLiteral src, Register rscratch = noreg);
 823 #endif
 824 
 825   void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg);
 826 
 827   // renamed to drag out the casting of address to int32_t/intptr_t
 828   void cmp32(Register src1, int32_t imm);
 829 
 830   void cmp32(AddressLiteral src1, int32_t imm, Register rscratch = noreg);
 831   // compare reg - mem, or reg - &mem
 832   void cmp32(Register src1, AddressLiteral src2, Register rscratch = noreg);
 833 
 834   void cmp32(Register src1, Address src2);
 835 
 836 #ifndef _LP64
 837   void cmpklass(Address dst, Metadata* obj);
 838   void cmpklass(Register dst, Metadata* obj);
 839   void cmpoop(Address dst, jobject obj);
 840 #endif // _LP64
 841 
 842   void cmpoop(Register src1, Register src2);
 843   void cmpoop(Register src1, Address src2);
 844   void cmpoop(Register dst, jobject obj, Register rscratch);
 845 
 846   // NOTE src2 must be the lval. This is NOT an mem-mem compare
 847   void cmpptr(Address src1, AddressLiteral src2, Register rscratch);
 848 
 849   void cmpptr(Register src1, AddressLiteral src2, Register rscratch = noreg);
 850 
 851   void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 852   void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 853   // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 854 
 855   void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 856   void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
 857 
 858   // cmp64 to avoild hiding cmpq
 859   void cmp64(Register src1, AddressLiteral src, Register rscratch = noreg);
 860 
 861   void cmpxchgptr(Register reg, Address adr);
 862 
 863 
 864   // cvt instructions
 865   void cvtss2sd(XMMRegister dst, XMMRegister src);
 866   void cvtss2sd(XMMRegister dst, Address src);
 867   void cvtsd2ss(XMMRegister dst, XMMRegister src);
 868   void cvtsd2ss(XMMRegister dst, Address src);
 869   void cvtsi2sdl(XMMRegister dst, Register src);
 870   void cvtsi2sdl(XMMRegister dst, Address src);
 871   void cvtsi2ssl(XMMRegister dst, Register src);
 872   void cvtsi2ssl(XMMRegister dst, Address src);
 873 #ifdef _LP64
 874   void cvtsi2sdq(XMMRegister dst, Register src);
 875   void cvtsi2sdq(XMMRegister dst, Address src);
 876   void cvtsi2ssq(XMMRegister dst, Register src);
 877   void cvtsi2ssq(XMMRegister dst, Address src);
 878 #endif
 879 
 880   void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);
 881 
 882   void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
 883   void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }
 884 
 885 
 886   void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }
 887 
 888   void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }
 889 
 890   void shlptr(Register dst, int32_t shift);
 891   void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }
 892 
 893   void shrptr(Register dst, int32_t shift);
 894   void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }
 895 
 896   void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
 897   void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }
 898 
 899   void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 900 
 901   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
 902   void subptr(Register dst, int32_t src);
 903   // Force generation of a 4 byte immediate value even if it fits into 8bit
 904   void subptr_imm32(Register dst, int32_t src);
 905   void subptr(Register dst, Register src);
 906   void subptr(Register dst, RegisterOrConstant src) {
 907     if (src.is_constant()) subptr(dst, (int) src.as_constant());
 908     else                   subptr(dst,       src.as_register());
 909   }
 910 
 911   void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 912   void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
 913 
 914   void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 915   void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
 916 
 917   void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }
 918 
 919 
 920 
 921   // Helper functions for statistics gathering.
 922   // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
 923   void cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch = noreg);
 924   // Unconditional atomic increment.
 925   void atomic_incl(Address counter_addr);
 926   void atomic_incl(AddressLiteral counter_addr, Register rscratch = noreg);
 927 #ifdef _LP64
 928   void atomic_incq(Address counter_addr);
 929   void atomic_incq(AddressLiteral counter_addr, Register rscratch = noreg);
 930 #endif
 931   void atomic_incptr(AddressLiteral counter_addr, Register rscratch = noreg) { LP64_ONLY(atomic_incq(counter_addr, rscratch)) NOT_LP64(atomic_incl(counter_addr, rscratch)) ; }
 932   void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
 933 
 934   void lea(Register dst, Address        adr) { Assembler::lea(dst, adr); }
 935   void lea(Register dst, AddressLiteral adr);
 936   void lea(Address  dst, AddressLiteral adr, Register rscratch);
 937 
 938   void leal32(Register dst, Address src) { leal(dst, src); }
 939 
 940   // Import other testl() methods from the parent class or else
 941   // they will be hidden by the following overriding declaration.
 942   using Assembler::testl;
 943   void testl(Address dst, int32_t imm32);
 944   void testl(Register dst, int32_t imm32);
 945   void testl(Register dst, AddressLiteral src); // requires reachable address
 946   using Assembler::testq;
 947   void testq(Address dst, int32_t imm32);
 948   void testq(Register dst, int32_t imm32);
 949 
 950   void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 951   void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 952   void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
 953   void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }
 954 
 955   void testptr(Register src, int32_t imm32) {  LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
 956   void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
 957   void testptr(Register src1, Register src2);
 958 
 959   void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 960   void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
 961 
 962   // Calls
 963 
 964   void call(Label& L, relocInfo::relocType rtype);
 965   void call(Register entry);
 966   void call(Address addr) { Assembler::call(addr); }
 967 
 968   // NOTE: this call transfers to the effective address of entry NOT
 969   // the address contained by entry. This is because this is more natural
 970   // for jumps/calls.
 971   void call(AddressLiteral entry, Register rscratch = rax);
 972 
 973   // Emit the CompiledIC call idiom
 974   void ic_call(address entry, jint method_index = 0);
 975 
 976   void emit_static_call_stub();
 977 
 978   // Jumps
 979 
 980   // NOTE: these jumps transfer to the effective address of dst NOT
 981   // the address contained by dst. This is because this is more natural
 982   // for jumps/calls.
 983   void jump(AddressLiteral dst, Register rscratch = noreg);
 984 
 985   void jump_cc(Condition cc, AddressLiteral dst, Register rscratch = noreg);
 986 
 987   // 32bit can do a case table jump in one instruction but we no longer allow the base
 988   // to be installed in the Address class. This jump will transfer to the address
 989   // contained in the location described by entry (not the address of entry)
 990   void jump(ArrayAddress entry, Register rscratch);
 991 
 992   // Floating
 993 
 994   void push_f(XMMRegister r);
 995   void pop_f(XMMRegister r);
 996   void push_d(XMMRegister r);
 997   void pop_d(XMMRegister r);
 998 
 999   void andpd(XMMRegister dst, XMMRegister    src) { Assembler::andpd(dst, src); }
1000   void andpd(XMMRegister dst, Address        src) { Assembler::andpd(dst, src); }
1001   void andpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1002 
1003   void andps(XMMRegister dst, XMMRegister    src) { Assembler::andps(dst, src); }
1004   void andps(XMMRegister dst, Address        src) { Assembler::andps(dst, src); }
1005   void andps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1006 
1007   void comiss(XMMRegister dst, XMMRegister    src) { Assembler::comiss(dst, src); }
1008   void comiss(XMMRegister dst, Address        src) { Assembler::comiss(dst, src); }
1009   void comiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1010 
1011   void comisd(XMMRegister dst, XMMRegister    src) { Assembler::comisd(dst, src); }
1012   void comisd(XMMRegister dst, Address        src) { Assembler::comisd(dst, src); }
1013   void comisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1014 
1015 #ifndef _LP64
1016   void fadd_s(Address        src) { Assembler::fadd_s(src); }
1017   void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }
1018 
1019   void fldcw(Address        src) { Assembler::fldcw(src); }
1020   void fldcw(AddressLiteral src);
1021 
1022   void fld_s(int index)          { Assembler::fld_s(index); }
1023   void fld_s(Address        src) { Assembler::fld_s(src); }
1024   void fld_s(AddressLiteral src);
1025 
1026   void fld_d(Address        src) { Assembler::fld_d(src); }
1027   void fld_d(AddressLiteral src);
1028 
1029   void fld_x(Address        src) { Assembler::fld_x(src); }
1030   void fld_x(AddressLiteral src) { Assembler::fld_x(as_Address(src)); }
1031 
1032   void fmul_s(Address        src) { Assembler::fmul_s(src); }
1033   void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
1034 #endif // !_LP64
1035 
1036   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
1037   void ldmxcsr(AddressLiteral src, Register rscratch = noreg);
1038 
1039 #ifdef _LP64
1040  private:
1041   void sha256_AVX2_one_round_compute(
1042     Register  reg_old_h,
1043     Register  reg_a,
1044     Register  reg_b,
1045     Register  reg_c,
1046     Register  reg_d,
1047     Register  reg_e,
1048     Register  reg_f,
1049     Register  reg_g,
1050     Register  reg_h,
1051     int iter);
1052   void sha256_AVX2_four_rounds_compute_first(int start);
1053   void sha256_AVX2_four_rounds_compute_last(int start);
1054   void sha256_AVX2_one_round_and_sched(
1055         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
1056         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
1057         XMMRegister xmm_2,     /* ymm6 */
1058         XMMRegister xmm_3,     /* ymm7 */
1059         Register    reg_a,      /* == eax on 0 iteration, then rotate 8 register right on each next iteration */
1060         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
1061         Register    reg_c,      /* edi */
1062         Register    reg_d,      /* esi */
1063         Register    reg_e,      /* r8d */
1064         Register    reg_f,      /* r9d */
1065         Register    reg_g,      /* r10d */
1066         Register    reg_h,      /* r11d */
1067         int iter);
1068 
1069   void addm(int disp, Register r1, Register r2);
1070 
1071   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1072                                      Register e, Register f, Register g, Register h, int iteration);
1073 
1074   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1075                                           Register a, Register b, Register c, Register d, Register e, Register f,
1076                                           Register g, Register h, int iteration);
1077 
1078   void addmq(int disp, Register r1, Register r2);
1079  public:
1080   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1081                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1082                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1083                    bool multi_block, XMMRegister shuf_mask);
1084   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1085                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1086                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1087                    XMMRegister shuf_mask);
1088 #endif // _LP64
1089 
1090   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1091                 bool multi_block);
1092 
1093   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1094                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1095                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1096                  bool multi_block);
1097 
1098 #ifdef _LP64
1099   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1100                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1101                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1102                    bool multi_block, XMMRegister shuf_mask);
1103 #else
1104   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1105                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1106                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1107                    bool multi_block);
1108 #endif
1109 
1110   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1111                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1112                 Register rax, Register rcx, Register rdx, Register tmp);
1113 
1114 #ifndef _LP64
1115  private:
1116   // Initialized in macroAssembler_x86_constants.cpp
1117   static address ONES;
1118   static address L_2IL0FLOATPACKET_0;
1119   static address PI4_INV;
1120   static address PI4X3;
1121   static address PI4X4;
1122 
1123  public:
1124   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1125                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1126                 Register rax, Register rcx, Register rdx, Register tmp1);
1127 
1128   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1129                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1130                 Register rax, Register rcx, Register rdx, Register tmp);
1131 
1132   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1133                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1134                 Register rdx, Register tmp);
1135 
1136   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1137                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1138                 Register rax, Register rbx, Register rdx);
1139 
1140   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1141                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1142                 Register rax, Register rcx, Register rdx, Register tmp);
1143 
1144   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1145                         Register edx, Register ebx, Register esi, Register edi,
1146                         Register ebp, Register esp);
1147 
1148   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1149                          Register esi, Register edi, Register ebp, Register esp);
1150 
1151   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1152                         Register edx, Register ebx, Register esi, Register edi,
1153                         Register ebp, Register esp);
1154 
1155   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1156                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1157                 Register rax, Register rcx, Register rdx, Register tmp);
1158 #endif // !_LP64
1159 
1160 private:
1161 
1162   // These are private because users should be using movflt/movdbl instead.
1163 
1164   void movss(Address     dst, XMMRegister    src) { Assembler::movss(dst, src); }
1165   void movss(XMMRegister dst, XMMRegister    src) { Assembler::movss(dst, src); }
1166   void movss(XMMRegister dst, Address        src) { Assembler::movss(dst, src); }
1167   void movss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1168 
1169   void movlpd(XMMRegister dst, Address        src) {Assembler::movlpd(dst, src); }
1170   void movlpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
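  // Illustrative note, not part of the interface: callers are expected to go through the
  // public movflt/movdbl wrappers (assumed to be declared elsewhere in this class)
  // rather than the raw SSE moves hidden above, e.g. in a hedged sketch:
  //
  //   movflt(xmm0, Address(rsp, 0));   // float load via the public wrapper
  //   movdbl(Address(rsp, 8), xmm1);   // double store via the public wrapper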
1171 
1172 public:
1173 
1174   void addsd(XMMRegister dst, XMMRegister    src) { Assembler::addsd(dst, src); }
1175   void addsd(XMMRegister dst, Address        src) { Assembler::addsd(dst, src); }
1176   void addsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1177 
1178   void addss(XMMRegister dst, XMMRegister    src) { Assembler::addss(dst, src); }
1179   void addss(XMMRegister dst, Address        src) { Assembler::addss(dst, src); }
1180   void addss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1181 
1182   void addpd(XMMRegister dst, XMMRegister    src) { Assembler::addpd(dst, src); }
1183   void addpd(XMMRegister dst, Address        src) { Assembler::addpd(dst, src); }
1184   void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1185 
1186   using Assembler::vbroadcastsd;
1187   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1188 
1189   using Assembler::vbroadcastss;
1190   void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1191 
1192   void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
1193   void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
1194   void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1195 
1196   void divss(XMMRegister dst, XMMRegister    src) { Assembler::divss(dst, src); }
1197   void divss(XMMRegister dst, Address        src) { Assembler::divss(dst, src); }
1198   void divss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1199 
1200   // Move Unaligned Double Quadword
1201   void movdqu(Address     dst, XMMRegister    src);
1202   void movdqu(XMMRegister dst, XMMRegister    src);
1203   void movdqu(XMMRegister dst, Address        src);
1204   void movdqu(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1205 
1206   void kmovwl(Register  dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1207   void kmovwl(Address   dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1208   void kmovwl(KRegister dst, KRegister      src) { Assembler::kmovwl(dst, src); }
1209   void kmovwl(KRegister dst, Register       src) { Assembler::kmovwl(dst, src); }
1210   void kmovwl(KRegister dst, Address        src) { Assembler::kmovwl(dst, src); }
1211   void kmovwl(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1212 
1213   void kmovql(KRegister dst, KRegister      src) { Assembler::kmovql(dst, src); }
1214   void kmovql(KRegister dst, Register       src) { Assembler::kmovql(dst, src); }
1215   void kmovql(Register  dst, KRegister      src) { Assembler::kmovql(dst, src); }
1216   void kmovql(KRegister dst, Address        src) { Assembler::kmovql(dst, src); }
1217   void kmovql(Address   dst, KRegister      src) { Assembler::kmovql(dst, src); }
1218   void kmovql(KRegister dst, AddressLiteral src, Register rscratch = noreg);
1219 
1220   // Safe move operation: lowers to 16-bit moves on targets that support only the
1221   // AVX512F feature, and to 64-bit moves on targets that also support AVX512BW.
1222   void kmov(Address  dst, KRegister src);
1223   void kmov(KRegister dst, Address src);
1224   void kmov(KRegister dst, KRegister src);
1225   void kmov(Register dst, KRegister src);
1226   void kmov(KRegister dst, Register src);
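  // A minimal sketch of the dispatch described above (the real definitions live in the
  // .cpp file; the feature-test calls are assumptions based on VM_Version):
  //
  //   void MacroAssembler::kmov(KRegister dst, KRegister src) {
  //     if (VM_Version::supports_avx512bw()) {
  //       kmovql(dst, src);   // full 64-bit mask move
  //     } else {
  //       assert(VM_Version::supports_avx512f(), "requires AVX512F");
  //       kmovwl(dst, src);   // 16-bit mask move, all that AVX512F guarantees
  //     }
  //   }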
1227 
1228   using Assembler::movddup;
1229   void movddup(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1230 
1231   using Assembler::vmovddup;
1232   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1233 
1234   // AVX Unaligned forms
1235   void vmovdqu(Address     dst, XMMRegister    src);
1236   void vmovdqu(XMMRegister dst, Address        src);
1237   void vmovdqu(XMMRegister dst, XMMRegister    src);
1238   void vmovdqu(XMMRegister dst, AddressLiteral src,                 Register rscratch = noreg);
1239   void vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1240 
1241   // AVX512 Unaligned
1242   void evmovdqu(BasicType type, KRegister kmask, Address     dst, XMMRegister src, bool merge, int vector_len);
1243   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address     src, bool merge, int vector_len);
1244 
1245   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1246   void evmovdqub(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqub(dst, src, vector_len); }
1247 
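  // Several of the ev* move wrappers below elide the move when it would be a no-op
  // (destination and source are the same register and the mask, if any, is k0), saving
  // an instruction; otherwise they defer to the corresponding Assembler form.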
1248   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1249     if (dst->encoding() != src->encoding() || mask != k0)  {
1250       Assembler::evmovdqub(dst, mask, src, merge, vector_len);
1251     }
1252   }
1253   void evmovdqub(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1254   void evmovdqub(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1255   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1256 
1257   void evmovdquw(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1258   void evmovdquw(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
1259 
1260   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1261     if (dst->encoding() != src->encoding() || mask != k0) {
1262       Assembler::evmovdquw(dst, mask, src, merge, vector_len);
1263     }
1264   }
1265   void evmovdquw(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1266   void evmovdquw(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1267   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1268 
1269   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1270      if (dst->encoding() != src->encoding()) {
1271        Assembler::evmovdqul(dst, src, vector_len);
1272      }
1273   }
1274   void evmovdqul(Address     dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1275   void evmovdqul(XMMRegister dst, Address     src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1276 
1277   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1278     if (dst->encoding() != src->encoding() || mask != k0)  {
1279       Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1280     }
1281   }
1282   void evmovdqul(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1283   void evmovdqul(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1284   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1285 
1286   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1287     if (dst->encoding() != src->encoding()) {
1288       Assembler::evmovdquq(dst, src, vector_len);
1289     }
1290   }
1291   void evmovdquq(XMMRegister dst, Address        src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1292   void evmovdquq(Address     dst, XMMRegister    src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1293   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1294 
1295   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1296     if (dst->encoding() != src->encoding() || mask != k0) {
1297       Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1298     }
1299   }
1300   void evmovdquq(Address     dst, KRegister mask, XMMRegister    src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1301   void evmovdquq(XMMRegister dst, KRegister mask, Address        src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1302   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1303 
1304   // Move Aligned Double Quadword
1305   void movdqa(XMMRegister dst, XMMRegister    src) { Assembler::movdqa(dst, src); }
1306   void movdqa(XMMRegister dst, Address        src) { Assembler::movdqa(dst, src); }
1307   void movdqa(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1308 
1309   void movsd(Address     dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1310   void movsd(XMMRegister dst, XMMRegister    src) { Assembler::movsd(dst, src); }
1311   void movsd(XMMRegister dst, Address        src) { Assembler::movsd(dst, src); }
1312   void movsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1313 
1314   void mulpd(XMMRegister dst, XMMRegister    src) { Assembler::mulpd(dst, src); }
1315   void mulpd(XMMRegister dst, Address        src) { Assembler::mulpd(dst, src); }
1316   void mulpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1317 
1318   void mulsd(XMMRegister dst, XMMRegister    src) { Assembler::mulsd(dst, src); }
1319   void mulsd(XMMRegister dst, Address        src) { Assembler::mulsd(dst, src); }
1320   void mulsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1321 
1322   void mulss(XMMRegister dst, XMMRegister    src) { Assembler::mulss(dst, src); }
1323   void mulss(XMMRegister dst, Address        src) { Assembler::mulss(dst, src); }
1324   void mulss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1325 
1326   // Carry-Less Multiplication Quadword
1327   void pclmulldq(XMMRegister dst, XMMRegister src) {
1328     // 0x00 - multiply lower 64 bits [0:63]
1329     Assembler::pclmulqdq(dst, src, 0x00);
1330   }
1331   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1332     // 0x11 - multiply upper 64 bits [64:127]
1333     Assembler::pclmulqdq(dst, src, 0x11);
1334   }
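  // In pclmulqdq's immediate, bit 0 selects the 64-bit half of the first operand and
  // bit 4 the half of the second operand. Hedged usage sketch (placeholder registers):
  //
  //   pclmulldq(xmm0, xmm1);   // xmm0 = xmm0[63:0]   * xmm1[63:0]    (imm8 = 0x00)
  //   pclmulhdq(xmm2, xmm1);   // xmm2 = xmm2[127:64] * xmm1[127:64]  (imm8 = 0x11)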
1335 
1336   void pcmpeqb(XMMRegister dst, XMMRegister src);
1337   void pcmpeqw(XMMRegister dst, XMMRegister src);
1338 
1339   void pcmpestri(XMMRegister dst, Address src, int imm8);
1340   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1341 
1342   void pmovzxbw(XMMRegister dst, XMMRegister src);
1343   void pmovzxbw(XMMRegister dst, Address src);
1344 
1345   void pmovmskb(Register dst, XMMRegister src);
1346 
1347   void ptest(XMMRegister dst, XMMRegister src);
1348 
1349   void roundsd(XMMRegister dst, XMMRegister    src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1350   void roundsd(XMMRegister dst, Address        src, int32_t rmode) { Assembler::roundsd(dst, src, rmode); }
1351   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch = noreg);
1352 
1353   void sqrtss(XMMRegister dst, XMMRegister     src) { Assembler::sqrtss(dst, src); }
1354   void sqrtss(XMMRegister dst, Address         src) { Assembler::sqrtss(dst, src); }
1355   void sqrtss(XMMRegister dst, AddressLiteral  src, Register rscratch = noreg);
1356 
1357   void subsd(XMMRegister dst, XMMRegister    src) { Assembler::subsd(dst, src); }
1358   void subsd(XMMRegister dst, Address        src) { Assembler::subsd(dst, src); }
1359   void subsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1360 
1361   void subss(XMMRegister dst, XMMRegister    src) { Assembler::subss(dst, src); }
1362   void subss(XMMRegister dst, Address        src) { Assembler::subss(dst, src); }
1363   void subss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1364 
1365   void ucomiss(XMMRegister dst, XMMRegister    src) { Assembler::ucomiss(dst, src); }
1366   void ucomiss(XMMRegister dst, Address        src) { Assembler::ucomiss(dst, src); }
1367   void ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1368 
1369   void ucomisd(XMMRegister dst, XMMRegister    src) { Assembler::ucomisd(dst, src); }
1370   void ucomisd(XMMRegister dst, Address        src) { Assembler::ucomisd(dst, src); }
1371   void ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1372 
1373   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1374   void xorpd(XMMRegister dst, XMMRegister    src);
1375   void xorpd(XMMRegister dst, Address        src) { Assembler::xorpd(dst, src); }
1376   void xorpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1377 
1378   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1379   void xorps(XMMRegister dst, XMMRegister    src);
1380   void xorps(XMMRegister dst, Address        src) { Assembler::xorps(dst, src); }
1381   void xorps(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1382 
1383   // Shuffle Bytes
1384   void pshufb(XMMRegister dst, XMMRegister    src) { Assembler::pshufb(dst, src); }
1385   void pshufb(XMMRegister dst, Address        src) { Assembler::pshufb(dst, src); }
1386   void pshufb(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1387   // AVX 3-operand instructions
1388 
1389   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddsd(dst, nds, src); }
1390   void vaddsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddsd(dst, nds, src); }
1391   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1392 
1393   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vaddss(dst, nds, src); }
1394   void vaddss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vaddss(dst, nds, src); }
1395   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1396 
1397   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1398   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch = noreg);
1399 
1400   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len);
1401   void vpaddb(XMMRegister dst, XMMRegister nds, Address        src, int vector_len);
1402   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1403 
1404   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1405   void vpaddw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1406 
1407   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1408   void vpaddd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1409   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1410 
1411   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1412   void vpand(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1413   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1414 
1415   using Assembler::vpbroadcastd;
1416   void vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1417 
1418   using Assembler::vpbroadcastq;
1419   void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
1420 
1421   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1422 
1423   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1424   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1425 
1426   // Vector compares
1427   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1428     Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len);
1429   }
1430   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1431 
1432   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1433     Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len);
1434   }
1435   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1436 
1437   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1438     Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len);
1439   }
1440   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
1441 
1442   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister    src, int comparison, bool is_signed, int vector_len) {
1443     Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len);
1444   }
1445   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int comparison, bool is_signed, int vector_len, Register rscratch = noreg);
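  // Hedged usage sketch: 'comparison' is an EVEX comparison predicate (see
  // Assembler::ComparisonPredicate); the result is a mask of the lanes that satisfy it.
  // Register names are placeholders:
  //
  //   evpcmpd(k1, k0, xmm0, xmm1, Assembler::lt, /*is_signed*/ true, Assembler::AVX_512bit);
  //   // k1 now has a bit set for every dword lane where xmm0 < xmm1 (signed compare)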
1446 
1447   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1448 
1449   // Emit comparison instruction for the specified comparison predicate.
1450   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1451   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1452 
1453   void vpmovzxbw(XMMRegister dst, Address     src, int vector_len);
1454   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1455 
1456   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1457 
1458   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1459   void vpmullw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1460 
1461   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1462   void vpmulld(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
1463   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1464 
1465   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1466   void vpsubb(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1467 
1468   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1469   void vpsubw(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
1470 
1471   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1472   void vpsraw(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1473 
1474   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1475   void evpsraq(XMMRegister dst, XMMRegister nds, int         shift, int vector_len);
1476 
1477   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1478     if (!is_varshift) {
1479       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1480     } else {
1481       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1482     }
1483   }
1484   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1485     if (!is_varshift) {
1486       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1487     } else {
1488       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1489     }
1490   }
1491   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1492     if (!is_varshift) {
1493       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1494     } else {
1495       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1496     }
1497   }
1498   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1499     if (!is_varshift) {
1500       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1501     } else {
1502       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1503     }
1504   }
1505   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1506     if (!is_varshift) {
1507       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1508     } else {
1509       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1510     }
1511   }
1512   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1513     if (!is_varshift) {
1514       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1515     } else {
1516       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1517     }
1518   }
1519   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1520     if (!is_varshift) {
1521       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1522     } else {
1523       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1524     }
1525   }
1526   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1527     if (!is_varshift) {
1528       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1529     } else {
1530       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1531     }
1532   }
1533   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1534     if (!is_varshift) {
1535       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1536     } else {
1537       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1538     }
1539   }
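  // In the shift helpers above, 'is_varshift' selects between the uniform-shift forms
  // (every lane shifted by the count in the low quadword of 'src') and the
  // variable-shift forms (each lane of 'src' supplies its own count). Hedged sketch
  // with placeholder registers:
  //
  //   evpslld(xmm0, k1, xmm1, xmm2, /*merge*/ true, Assembler::AVX_512bit, /*is_varshift*/ false); // emits evpslld
  //   evpslld(xmm0, k1, xmm1, xmm2, /*merge*/ true, Assembler::AVX_512bit, /*is_varshift*/ true);  // emits evpsllvd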
1540 
1541   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1542   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1543   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1544   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1545 
1546   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1547   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1548 
1549   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1550   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1551 
1552   void vptest(XMMRegister dst, XMMRegister src);
1553   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1554 
1555   void punpcklbw(XMMRegister dst, XMMRegister src);
1556   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1557 
1558   void pshufd(XMMRegister dst, Address src, int mode);
1559   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1560 
1561   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1562   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1563 
1564   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1565   void vandpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1566   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1567 
1568   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1569   void vandps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1570   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1571 
1572   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1573 
1574   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivsd(dst, nds, src); }
1575   void vdivsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivsd(dst, nds, src); }
1576   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1577 
1578   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vdivss(dst, nds, src); }
1579   void vdivss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vdivss(dst, nds, src); }
1580   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1581 
1582   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulsd(dst, nds, src); }
1583   void vmulsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulsd(dst, nds, src); }
1584   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1585 
1586   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vmulss(dst, nds, src); }
1587   void vmulss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vmulss(dst, nds, src); }
1588   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1589 
1590   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubsd(dst, nds, src); }
1591   void vsubsd(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubsd(dst, nds, src); }
1592   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1593 
1594   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister    src) { Assembler::vsubss(dst, nds, src); }
1595   void vsubss(XMMRegister dst, XMMRegister nds, Address        src) { Assembler::vsubss(dst, nds, src); }
1596   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1597 
1598   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1599   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch = noreg);
1600 
1601   // AVX Vector instructions
1602 
1603   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1604   void vxorpd(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1605   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1606 
1607   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1608   void vxorps(XMMRegister dst, XMMRegister nds, Address        src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1609   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1610 
1611   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1612     if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor is available only with AVX2
1613       Assembler::vpxor(dst, nds, src, vector_len);
1614     else
1615       Assembler::vxorpd(dst, nds, src, vector_len);
1616   }
1617   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1618     if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor is available only with AVX2
1619       Assembler::vpxor(dst, nds, src, vector_len);
1620     else
1621       Assembler::vxorpd(dst, nds, src, vector_len);
1622   }
1623   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1624 
1625   // Simple version for AVX2 256bit vectors
1626   void vpxor(XMMRegister dst, XMMRegister src) {
1627     assert(UseAVX >= 2, "Should be at least AVX2");
1628     Assembler::vpxor(dst, dst, src, AVX_256bit);
1629   }
1630   void vpxor(XMMRegister dst, Address src) {
1631     assert(UseAVX >= 2, "Should be at least AVX2");
1632     Assembler::vpxor(dst, dst, src, AVX_256bit);
1633   }
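  // A common idiom with the two-operand form above is zeroing a full YMM register by
  // xoring it with itself, e.g. (hedged sketch):
  //
  //   vpxor(xmm0, xmm0);   // xmm0 = 0 across all 256 bits (requires UseAVX >= 2)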
1634 
1635   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister    src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1636   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1637 
1638   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1639     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1640       Assembler::vinserti32x4(dst, nds, src, imm8);
1641     } else if (UseAVX > 1) {
1642       // vinserti128 is available only in AVX2
1643       Assembler::vinserti128(dst, nds, src, imm8);
1644     } else {
1645       Assembler::vinsertf128(dst, nds, src, imm8);
1646     }
1647   }
1648 
1649   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1650     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1651       Assembler::vinserti32x4(dst, nds, src, imm8);
1652     } else if (UseAVX > 1) {
1653       // vinserti128 is available only in AVX2
1654       Assembler::vinserti128(dst, nds, src, imm8);
1655     } else {
1656       Assembler::vinsertf128(dst, nds, src, imm8);
1657     }
1658   }
1659 
1660   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1661     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1662       Assembler::vextracti32x4(dst, src, imm8);
1663     } else if (UseAVX > 1) {
1664       // vextracti128 is available only in AVX2
1665       Assembler::vextracti128(dst, src, imm8);
1666     } else {
1667       Assembler::vextractf128(dst, src, imm8);
1668     }
1669   }
1670 
1671   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1672     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1673       Assembler::vextracti32x4(dst, src, imm8);
1674     } else if (UseAVX > 1) {
1675       // vextracti128 is available only in AVX2
1676       Assembler::vextracti128(dst, src, imm8);
1677     } else {
1678       Assembler::vextractf128(dst, src, imm8);
1679     }
1680   }
1681 
1682   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1683   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1684     vinserti128(dst, dst, src, 1);
1685   }
1686   void vinserti128_high(XMMRegister dst, Address src) {
1687     vinserti128(dst, dst, src, 1);
1688   }
1689   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1690     vextracti128(dst, src, 1);
1691   }
1692   void vextracti128_high(Address dst, XMMRegister src) {
1693     vextracti128(dst, src, 1);
1694   }
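  // One illustrative (non-normative) use of these helpers is a 256-bit horizontal
  // reduction that folds the high 128-bit lane onto the low one before finishing with
  // 128-bit instructions (placeholder registers):
  //
  //   vextracti128_high(xmm1, xmm0);                    // xmm1 = upper lane of ymm0
  //   vpaddd(xmm0, xmm0, xmm1, Assembler::AVX_128bit);  // low lane += high lane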
1695 
1696   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1697     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1698       Assembler::vinsertf32x4(dst, dst, src, 1);
1699     } else {
1700       Assembler::vinsertf128(dst, dst, src, 1);
1701     }
1702   }
1703 
1704   void vinsertf128_high(XMMRegister dst, Address src) {
1705     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1706       Assembler::vinsertf32x4(dst, dst, src, 1);
1707     } else {
1708       Assembler::vinsertf128(dst, dst, src, 1);
1709     }
1710   }
1711 
1712   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1713     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1714       Assembler::vextractf32x4(dst, src, 1);
1715     } else {
1716       Assembler::vextractf128(dst, src, 1);
1717     }
1718   }
1719 
1720   void vextractf128_high(Address dst, XMMRegister src) {
1721     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1722       Assembler::vextractf32x4(dst, src, 1);
1723     } else {
1724       Assembler::vextractf128(dst, src, 1);
1725     }
1726   }
1727 
1728   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1729   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1730     Assembler::vinserti64x4(dst, dst, src, 1);
1731   }
1732   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1733     Assembler::vinsertf64x4(dst, dst, src, 1);
1734   }
1735   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1736     Assembler::vextracti64x4(dst, src, 1);
1737   }
1738   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1739     Assembler::vextractf64x4(dst, src, 1);
1740   }
1741   void vextractf64x4_high(Address dst, XMMRegister src) {
1742     Assembler::vextractf64x4(dst, src, 1);
1743   }
1744   void vinsertf64x4_high(XMMRegister dst, Address src) {
1745     Assembler::vinsertf64x4(dst, dst, src, 1);
1746   }
1747 
1748   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1749   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1750     vinserti128(dst, dst, src, 0);
1751   }
1752   void vinserti128_low(XMMRegister dst, Address src) {
1753     vinserti128(dst, dst, src, 0);
1754   }
1755   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1756     vextracti128(dst, src, 0);
1757   }
1758   void vextracti128_low(Address dst, XMMRegister src) {
1759     vextracti128(dst, src, 0);
1760   }
1761 
1762   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1763     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1764       Assembler::vinsertf32x4(dst, dst, src, 0);
1765     } else {
1766       Assembler::vinsertf128(dst, dst, src, 0);
1767     }
1768   }
1769 
1770   void vinsertf128_low(XMMRegister dst, Address src) {
1771     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1772       Assembler::vinsertf32x4(dst, dst, src, 0);
1773     } else {
1774       Assembler::vinsertf128(dst, dst, src, 0);
1775     }
1776   }
1777 
1778   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1779     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1780       Assembler::vextractf32x4(dst, src, 0);
1781     } else {
1782       Assembler::vextractf128(dst, src, 0);
1783     }
1784   }
1785 
1786   void vextractf128_low(Address dst, XMMRegister src) {
1787     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1788       Assembler::vextractf32x4(dst, src, 0);
1789     } else {
1790       Assembler::vextractf128(dst, src, 0);
1791     }
1792   }
1793 
1794   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1795   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1796     Assembler::vinserti64x4(dst, dst, src, 0);
1797   }
1798   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1799     Assembler::vinsertf64x4(dst, dst, src, 0);
1800   }
1801   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1802     Assembler::vextracti64x4(dst, src, 0);
1803   }
1804   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1805     Assembler::vextractf64x4(dst, src, 0);
1806   }
1807   void vextractf64x4_low(Address dst, XMMRegister src) {
1808     Assembler::vextractf64x4(dst, src, 0);
1809   }
1810   void vinsertf64x4_low(XMMRegister dst, Address src) {
1811     Assembler::vinsertf64x4(dst, dst, src, 0);
1812   }
1813 
1814   // Carry-Less Multiplication Quadword
1815   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1816     // 0x00 - multiply lower 64 bits [0:63]
1817     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1818   }
1819   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1820     // 0x11 - multiply upper 64 bits [64:127]
1821     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1822   }
1823   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1824     // 0x10 - multiply nds[0:63] and src[64:127]
1825     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1826   }
1827   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1828     //0x01 - multiply nds[64:127] and src[0:63]
1829     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1830   }
1831 
1832   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1833     // 0x00 - multiply lower 64 bits [0:63]
1834     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1835   }
1836   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1837     // 0x11 - multiply upper 64 bits [64:127]
1838     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1839   }
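  // The vpclmul*/evpclmul* helpers above select which 64-bit halves are multiplied;
  // together the four immediates cover every 64x64 partial product of a 128-bit
  // carry-less multiply. A hedged sketch of a single fold step, similar in spirit to
  // the CRC folding routines declared later in this file (placeholder registers):
  //
  //   vpclmulldq(xmm1, xmm2, xmm3);                     // low  x low  partial product
  //   vpclmulhdq(xmm2, xmm2, xmm3);                     // high x high partial product
  //   vpxor(xmm1, xmm1, xmm2, Assembler::AVX_128bit);   // combine the two halves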
1840 
1841   // AVX-512 mask operations.
1842   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1843   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1844   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1845   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1846   void kortest(uint masklen, KRegister src1, KRegister src2);
1847   void ktest(uint masklen, KRegister src1, KRegister src2);
1848 
1849   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1850   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1851 
1852   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1853   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1854 
1855   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1856   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1857 
1858   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1859   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1860 
1861   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1862   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1863   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1864   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1865 
1866   using Assembler::evpandq;
1867   void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1868 
1869   using Assembler::evpaddq;
1870   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1871 
1872   using Assembler::evporq;
1873   void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1874 
1875   using Assembler::vpshufb;
1876   void vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
1877 
1878   using Assembler::vpternlogq;
1879   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg);
1880 
1881   void cmov32( Condition cc, Register dst, Address  src);
1882   void cmov32( Condition cc, Register dst, Register src);
1883 
1884   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1885 
1886   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1887   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1888 
1889   void movoop(Register dst, jobject obj);
1890   void movoop(Address  dst, jobject obj, Register rscratch);
1891 
1892   void mov_metadata(Register dst, Metadata* obj);
1893   void mov_metadata(Address  dst, Metadata* obj, Register rscratch);
1894 
1895   void movptr(Register     dst, Register       src);
1896   void movptr(Register     dst, Address        src);
1897   void movptr(Register     dst, AddressLiteral src);
1898   void movptr(Register     dst, ArrayAddress   src);
1899   void movptr(Register     dst, intptr_t       src);
1900   void movptr(Address      dst, Register       src);
1901   void movptr(Address      dst, int32_t        imm);
1902   void movptr(Address      dst, intptr_t       src, Register rscratch);
1903   void movptr(ArrayAddress dst, Register       src, Register rscratch);
1904 
1905   void movptr(Register dst, RegisterOrConstant src) {
1906     if (src.is_constant()) movptr(dst, src.as_constant());
1907     else                   movptr(dst, src.as_register());
1908   }
1909 
1910 
1911   // to avoid hiding movl
1912   void mov32(Register       dst, AddressLiteral src);
1913   void mov32(AddressLiteral dst, Register        src, Register rscratch = noreg);
1914 
1915   // Import other mov() methods from the parent class or else
1916   // they will be hidden by the following overriding declaration.
1917   using Assembler::movdl;
1918   void movdl(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1919 
1920   using Assembler::movq;
1921   void movq(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
1922 
1923   // Can push value or effective address
1924   void pushptr(AddressLiteral src, Register rscratch);
1925 
1926   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
1927   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
1928 
1929   void pushoop(jobject obj, Register rscratch);
1930   void pushklass(Metadata* obj, Register rscratch);
1931 
1932   // sign-extend a 32-bit ('l') value to a pointer-sized element as needed
1933   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
1934   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
1935 
1936 
1937  public:
1938   // Inline type specific methods
1939   #include "asm/macroAssembler_common.hpp"
1940 
1941   int store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter = true);
1942   bool move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]);
1943   bool unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
1944                             VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
1945                             RegState reg_state[]);
1946   bool pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
1947                           VMRegPair* from, int from_count, int& from_index, VMReg to,
1948                           RegState reg_state[], Register val_array);
1949   int extend_stack_for_inline_args(int args_on_stack);
1950   void remove_frame(int initial_framesize, bool needs_stack_repair);
1951   VMReg spill_reg_for(VMReg reg);
1952 
1953   // clear memory of size 'cnt' qwords, starting at 'base';
1954   // if 'is_large' is set, do not try to produce a short loop
1955   void clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only, KRegister mask=knoreg);
1956 
1957   // clear memory of constant size 'cnt' qwords, starting at 'base';
1958   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1959 
1960   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1961   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
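  // Hedged usage sketch (registers are placeholders; 'val' is assumed to be pre-loaded
  // with the fill pattern, zero for an ordinary clear):
  //
  //   clear_mem(base, cnt, val, xtmp, /*is_large*/ false, /*word_copy_only*/ false);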
1962 
1963   // Fill primitive arrays
1964   void generate_fill(BasicType t, bool aligned,
1965                      Register to, Register value, Register count,
1966                      Register rtmp, XMMRegister xtmp);
1967 
1968   void encode_iso_array(Register src, Register dst, Register len,
1969                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1970                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1971 
1972 #ifdef _LP64
1973   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1974   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1975                              Register y, Register y_idx, Register z,
1976                              Register carry, Register product,
1977                              Register idx, Register kdx);
1978   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1979                               Register yz_idx, Register idx,
1980                               Register carry, Register product, int offset);
1981   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1982                                     Register carry, Register carry2,
1983                                     Register idx, Register jdx,
1984                                     Register yz_idx1, Register yz_idx2,
1985                                     Register tmp, Register tmp3, Register tmp4);
1986   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
1987                                Register yz_idx, Register idx, Register jdx,
1988                                Register carry, Register product,
1989                                Register carry2);
1990   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
1991                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
1992   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
1993                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1994   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
1995                             Register tmp2);
1996   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
1997                        Register rdxReg, Register raxReg);
1998   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
1999   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
2000                        Register tmp3, Register tmp4);
2001   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
2002                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
2003 
2004   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
2005                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
2006                Register raxReg);
2007   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
2008                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
2009                Register raxReg);
2010   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
2011                            Register result, Register tmp1, Register tmp2,
2012                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
2013 #endif
2014 
2015   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
2016   void update_byte_crc32(Register crc, Register val, Register table);
2017   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
2018 
2019 
2020 #ifdef _LP64
2021   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
2022   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
2023                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
2024                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
2025 #endif // _LP64
2026 
2027   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
2028   // Note on a naming convention:
2029   // Prefix w = register only used on a Westmere+ architecture
2030   // Prefix n = register only used on a Nehalem architecture
2031 #ifdef _LP64
2032   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2033                        Register tmp1, Register tmp2, Register tmp3);
2034 #else
2035   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2036                        Register tmp1, Register tmp2, Register tmp3,
2037                        XMMRegister xtmp1, XMMRegister xtmp2);
2038 #endif
2039   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
2040                         Register in_out,
2041                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
2042                         XMMRegister w_xtmp2,
2043                         Register tmp1,
2044                         Register n_tmp2, Register n_tmp3);
2045   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
2046                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2047                        Register tmp1, Register tmp2,
2048                        Register n_tmp3);
2049   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
2050                          Register in_out1, Register in_out2, Register in_out3,
2051                          Register tmp1, Register tmp2, Register tmp3,
2052                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2053                          Register tmp4, Register tmp5,
2054                          Register n_tmp6);
2055   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
2056                             Register tmp1, Register tmp2, Register tmp3,
2057                             Register tmp4, Register tmp5, Register tmp6,
2058                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2059                             bool is_pclmulqdq_supported);
2060   // Fold 128-bit data chunk
2061   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
2062   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
2063 #ifdef _LP64
2064   // Fold 512-bit data chunk
2065   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
2066 #endif // _LP64
2067   // Fold 8-bit data
2068   void fold_8bit_crc32(Register crc, Register table, Register tmp);
2069   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
2070 
2071   // Compress char[] array to byte[].
2072   void char_array_compress(Register src, Register dst, Register len,
2073                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
2074                            XMMRegister tmp4, Register tmp5, Register result,
2075                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
2076 
2077   // Inflate byte[] array to char[].
2078   void byte_array_inflate(Register src, Register dst, Register len,
2079                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
2080 
2081   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
2082                    Register length, Register temp, int vec_enc);
2083 
2084   void fill64_masked(uint shift, Register dst, int disp,
2085                          XMMRegister xmm, KRegister mask, Register length,
2086                          Register temp, bool use64byteVector = false);
2087 
2088   void fill32_masked(uint shift, Register dst, int disp,
2089                          XMMRegister xmm, KRegister mask, Register length,
2090                          Register temp);
2091 
2092   void fill32(Address dst, XMMRegister xmm);
2093 
2094   void fill32(Register dst, int disp, XMMRegister xmm);
2095 
2096   void fill64(Address dst, XMMRegister xmm, bool use64byteVector = false);
2097 
2098   void fill64(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
2099 
2100 #ifdef _LP64
2101   void convert_f2i(Register dst, XMMRegister src);
2102   void convert_d2i(Register dst, XMMRegister src);
2103   void convert_f2l(Register dst, XMMRegister src);
2104   void convert_d2l(Register dst, XMMRegister src);
2105   void round_double(Register dst, XMMRegister src, Register rtmp, Register rcx);
2106   void round_float(Register dst, XMMRegister src, Register rtmp, Register rcx);
2107 
2108   void cache_wb(Address line);
2109   void cache_wbsync(bool is_pre);
2110 
2111 #ifdef COMPILER2_OR_JVMCI
2112   void generate_fill_avx3(BasicType type, Register to, Register value,
2113                           Register count, Register rtmp, XMMRegister xtmp);
2114 #endif // COMPILER2_OR_JVMCI
2115 #endif // _LP64
2116 
2117   void vallones(XMMRegister dst, int vector_len);
2118 
2119   void check_stack_alignment(Register sp, const char* msg, unsigned bias = 0, Register tmp = noreg);
2120 
2121   void lightweight_lock(Register obj, Register hdr, Register thread, Register tmp, Label& slow);
2122   void lightweight_unlock(Register obj, Register hdr, Register tmp, Label& slow);
2123 };
2124 
2125 /**
2126  * class SkipIfEqual:
2127  *
2128  * Instantiating this class emits assembly code that jumps over any code emitted
2129  * between the creation of the instance and its automatic destruction at the end
2130  * of the scope block, depending on the value of the flag passed to the
2131  * constructor, which is checked at run time.
2132  */
2133 class SkipIfEqual {
2134  private:
2135   MacroAssembler* _masm;
2136   Label _label;
2137 
2138  public:
2139    SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value, Register rscratch);
2140    ~SkipIfEqual();
2141 };
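
// A minimal usage sketch (flag and register names are placeholders): the guarded code
// is skipped at run time whenever *flag_addr == value.
//
//   {
//     SkipIfEqual skip(masm, &SomeDiagnosticFlag, false, rscratch1);
//     // ... code emitted here executes only when SomeDiagnosticFlag is true ...
//   }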
2142 
2143 #endif // CPU_X86_MACROASSEMBLER_X86_HPP