/*
 * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_MACROASSEMBLER_X86_HPP
#define CPU_X86_MACROASSEMBLER_X86_HPP

#include "asm/assembler.hpp"
#include "asm/register.hpp"
#include "code/vmreg.inline.hpp"
#include "compiler/oopMap.hpp"
#include "utilities/macros.hpp"
#include "runtime/rtmLocking.hpp"
#include "runtime/signature.hpp"
#include "runtime/vm_version.hpp"

class ciInlineKlass;

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;
  friend class Runtime1;      // as_Address()

 public:
  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments        // the number of arguments to pop after the call
  );

 protected:
  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register java_thread,              // the thread if computed before     ; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

  // helpers for FPU flag access
  // tmp is a temporary register, if none is available use noreg
  void save_rax   (Register tmp);
  void restore_rax(Register tmp);

 public:
  MacroAssembler(CodeBuffer* code) : Assembler(code) {}

 // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
 // The implementation is only non-empty for the InterpreterMacroAssembler,
 // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
 virtual void check_and_handle_popframe(Register java_thread);
 virtual void check_and_handle_earlyret(Register java_thread);

  Address as_Address(AddressLiteral adr);
  Address as_Address(ArrayAddress adr);

  // Support for NULL-checks
  //
  // Generates code that causes a NULL OS exception if the content of reg is NULL.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
  static bool uses_implicit_null_check(void* address);
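
  // Illustrative sketch (not part of this interface; 'obj' and 'byte_offset' are
  // assumed names): a caller reading a field at a small known offset can rely on
  // the implicit check, letting the faulting load itself act as the NULL check:
  //
  //   __ null_check(obj, byte_offset);            // emits nothing if 0 <= byte_offset <= page_size
  //   __ movptr(rax, Address(obj, byte_offset));  // the access traps if obj is NULL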

  // markWord tests, kills markWord reg
  void test_markword_is_inline_type(Register markword, Label& is_inline_type);

  // inlineKlass queries, kills temp_reg
  void test_klass_is_inline_type(Register klass, Register temp_reg, Label& is_inline_type);
  void test_klass_is_empty_inline_type(Register klass, Register temp_reg, Label& is_empty_inline_type);
  void test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type);

  // Get the default value oop for the given InlineKlass
  void get_default_value_oop(Register inline_klass, Register temp_reg, Register obj);
  // The empty value oop, for the given InlineKlass ("empty" as in no instance fields)
  // get_default_value_oop with extra assertion for empty inline klass
  void get_empty_inline_type_oop(Register inline_klass, Register temp_reg, Register obj);

  void test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free);
  void test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free);
  void test_field_is_inlined(Register flags, Register temp_reg, Label& is_inlined);

  // Check oops for special arrays, i.e. flattened and/or null-free
  void test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label);
  void test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array);
  void test_non_flattened_array_oop(Register oop, Register temp_reg, Label& is_non_flattened_array);
  void test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array);
  void test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array);

  // Check array klass layout helper for flattened or null-free arrays...
  void test_flattened_array_layout(Register lh, Label& is_flattened_array);
  void test_non_flattened_array_layout(Register lh, Label& is_non_flattened_array);
  void test_null_free_array_layout(Register lh, Label& is_null_free_array);
  void test_non_null_free_array_layout(Register lh, Label& is_non_null_free_array);

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  void pd_patch_instruction(address branch, address target, const char* file, int line) {
    unsigned char op = branch[0];
    assert(op == 0xE8 /* call */ ||
        op == 0xE9 /* jmp */ ||
        op == 0xEB /* short jmp */ ||
        (op & 0xF0) == 0x70 /* short jcc */ ||
        op == 0x0F && (branch[1] & 0xF0) == 0x80 /* jcc */ ||
        op == 0xC7 && branch[1] == 0xF8 /* xbegin */,
        "Invalid opcode at patch point");

    if (op == 0xEB || (op & 0xF0) == 0x70) {
      // short offset operators (jmp and jcc)
      char* disp = (char*) &branch[1];
      int imm8 = target - (address) &disp[1];
      guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
                file == NULL ? "<NULL>" : file, line);
      *disp = imm8;
    } else {
      int* disp = (int*) &branch[(op == 0x0F || op == 0xC7)? 2: 1];
      int imm32 = target - (address) &disp[1];
      *disp = imm32;
    }
  }
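
  // Worked example (illustrative): for a short jmp (0xEB) at 'branch', the 8-bit
  // displacement is relative to the end of the 2-byte instruction, so
  // imm8 = target - (branch + 2); for the 5-byte 0xE8/0xE9 forms the 32-bit
  // displacement is target - (branch + 5). Both are what the code above computes
  // via &disp[1].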

  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value

  void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
  void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }

  void decrementl(Address dst, int value = 1);
  void decrementl(Register reg, int value = 1);

  void decrementq(Register reg, int value = 1);
  void decrementq(Address dst, int value = 1);

  void incrementl(Address dst, int value = 1);
  void incrementl(Register reg, int value = 1);

  void incrementq(Register reg, int value = 1);
  void incrementq(Address dst, int value = 1);

  // Support optimal SSE move instructions.
  void movflt(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
    else                       { movss (dst, src); return; }
  }
  void movflt(XMMRegister dst, Address src) { movss(dst, src); }
  void movflt(XMMRegister dst, AddressLiteral src);
  void movflt(Address dst, XMMRegister src) { movss(dst, src); }

  // Move with zero extension
  void movfltz(XMMRegister dst, XMMRegister src) { movss(dst, src); }

  void movdbl(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
    else                       { movsd (dst, src); return; }
  }

  void movdbl(XMMRegister dst, AddressLiteral src);

  void movdbl(XMMRegister dst, Address src) {
    if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
    else                         { movlpd(dst, src); return; }
  }
  void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }

  void incrementl(AddressLiteral dst);
  void incrementl(ArrayAddress dst);

  void incrementq(AddressLiteral dst);

  // Alignment
  void align32();
  void align64();
  void align(int modulus);
  void align(int modulus, int target);

  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  void fat_nop();

  // Stack frame creation/removal
  void enter();
  void leave();

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
  // The pointer will be loaded into the thread register.
  void get_thread(Register thread);

#ifdef _LP64
  // Support for argument shuffling

  void move32_64(VMRegPair src, VMRegPair dst);
  void long_move(VMRegPair src, VMRegPair dst);
  void float_move(VMRegPair src, VMRegPair dst);
  void double_move(VMRegPair src, VMRegPair dst);
  void move_ptr(VMRegPair src, VMRegPair dst);
  void object_move(OopMap* map,
                   int oop_handle_offset,
                   int framesize_in_slots,
                   VMRegPair src,
                   VMRegPair dst,
                   bool is_receiver,
                   int* receiver_offset);
#endif // _LP64

  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is set up correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);
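
  // Illustrative sketch (entry point elided, registers assumed): a typical upcall
  // passes its arguments in registers and lets call_VM establish the stack linkage:
  //
  //   __ call_VM(rax,              // oop result, or noreg if none
  //              entry_point,      // address of the VM entry point (assumed)
  //              rbx);             // arg_1
  //
  // With check_exceptions left at its default of true, a pending exception is
  // forwarded immediately after the call returns.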

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  void get_vm_result  (Register oop_result, Register thread);
  void get_vm_result_2(Register metadata_result, Register thread);

  // These always tightly bind to MacroAssembler::call_VM_base
  // bypassing the virtual implementation
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf0(address entry_point);
  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);
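
  // Illustrative sketch ('some_leaf_fn' is a hypothetical entry point): leaf calls
  // skip the frame-anchor bookkeeping, so they are only appropriate for runtime
  // helpers that need neither a safepoint check nor exception forwarding:
  //
  //   __ call_VM_leaf(CAST_FROM_FN_PTR(address, some_leaf_fn), rbx, rcx);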

  // These always tightly bind to MacroAssembler::call_VM_leaf_base
  // bypassing the virtual implementation
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // last Java Frame (fills frame anchor)
  void set_last_Java_frame(Register thread,
                           Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc);

  // thread in the default location (r15_thread on 64bit)
  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc);

  void reset_last_Java_frame(Register thread, bool clear_fp);

  // thread in the default location (r15_thread on 64bit)
  void reset_last_Java_frame(bool clear_fp);
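
  // Illustrative pairing (sketch; 'the_pc' is an assumed code address): stubs
  // typically bracket a call out of Java with the frame anchor so that stack
  // walkers can find the last Java frame while the call is in flight:
  //
  //   __ set_last_Java_frame(rsp, noreg, the_pc);
  //   // ... call into the runtime ...
  //   __ reset_last_Java_frame(true /* clear_fp */);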

  // jobjects
  void clear_jweak_tag(Register possibly_jweak);
  void resolve_jobject(Register value, Register thread, Register tmp);

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  // C++ bool manipulation

  void movbool(Register dst, Address src);
  void movbool(Address dst, bool boolconst);
  void movbool(Address dst, Register src);
  void testbool(Register dst);

  void resolve_oop_handle(Register result, Register tmp = rscratch2);
  void resolve_weak_handle(Register result, Register tmp);
  void load_mirror(Register mirror, Register method, Register tmp = rscratch2);
  void load_method_holder_cld(Register rresult, Register rmethod);

  void load_method_holder(Register holder, Register method);

  // oop manipulations
  void load_metadata(Register dst, Register src);
  void load_klass(Register dst, Register src, Register tmp);
  void store_klass(Register dst, Register src, Register tmp);

  void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                      Register tmp1, Register thread_tmp);
  void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
                       Register tmp1, Register tmp2, Register tmp3);

  void access_value_copy(DecoratorSet decorators, Register src, Register dst, Register inline_klass);

  // inline type data payload offsets...
  void first_field_offset(Register inline_klass, Register offset);
  void data_for_oop(Register oop, Register data, Register inline_klass);
  // get data payload ptr of a flat value array at index, kills rcx and index
  void data_for_value_array_index(Register array, Register array_klass,
                                  Register index, Register data);


  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
                     Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
                              Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void store_heap_oop(Address dst, Register src, Register tmp1 = noreg,
                      Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);

  // Used for storing NULL. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst);
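
  // Illustrative sketch ('field_offset' and the registers are assumptions): oop
  // loads and stores go through these helpers rather than raw moves so the active
  // barrier set can interpose read/write barriers:
  //
  //   __ load_heap_oop(rax, Address(rsi, field_offset));
  //   __ store_heap_oop(Address(rdi, field_offset), rax);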

  void load_prototype_header(Register dst, Register src, Register tmp);

#ifdef _LP64
  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like NULL) into a Register by giving
  // the compiler two choices it can't resolve

  void store_heap_oop(Address dst, void* dummy);

  void encode_heap_oop(Register r);
  void decode_heap_oop(Register r);
  void encode_heap_oop_not_null(Register r);
  void decode_heap_oop_not_null(Register r);
  void encode_heap_oop_not_null(Register dst, Register src);
  void decode_heap_oop_not_null(Register dst, Register src);

  void set_narrow_oop(Register dst, jobject obj);
  void set_narrow_oop(Address dst, jobject obj);
  void cmp_narrow_oop(Register dst, jobject obj);
  void cmp_narrow_oop(Address dst, jobject obj);

  void encode_klass_not_null(Register r, Register tmp);
  void decode_klass_not_null(Register r, Register tmp);
  void encode_and_move_klass_not_null(Register dst, Register src);
  void decode_and_move_klass_not_null(Register dst, Register src);
  void set_narrow_klass(Register dst, Klass* k);
  void set_narrow_klass(Address dst, Klass* k);
  void cmp_narrow_klass(Register dst, Klass* k);
  void cmp_narrow_klass(Address dst, Klass* k);

  // if heap base register is used - reinit it with the correct value
  void reinit_heapbase();

  DEBUG_ONLY(void verify_heapbase(const char* msg);)

#endif // _LP64
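
  // Illustrative sketch (64-bit with compressed oops; register choice assumed):
  // a narrow oop loaded from the heap is widened before use and re-narrowed on
  // the way back:
  //
  //   __ decode_heap_oop(rax);   // narrow oop -> full oop (NULL-safe)
  //   // ... use rax as an ordinary oop ...
  //   __ encode_heap_oop(rax);   // full oop -> narrow oop (NULL-safe)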

  // Int division/remainder for Java
  // (as idivl, but checks for special case as described in JVM spec.)
  // returns idivl instruction offset for implicit exception handling
  int corrected_idivl(Register reg);

  // Long division/remainder for Java
  // (as idivq, but checks for special case as described in JVM spec.)
  // returns idivq instruction offset for implicit exception handling
  int corrected_idivq(Register reg);
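
  // The "special case" referred to above is division of the most negative value
  // by -1: the JVM spec requires min_int / -1 == min_int with remainder 0, e.g.
  // 0x80000000 / -1 == 0x80000000, whereas a raw idiv would raise #DE. These
  // helpers test for that operand pattern before dividing.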

  void int3();

  // Long operation macros for a 32bit cpu
  // Long negation for Java
  void lneg(Register hi, Register lo);

  // Long multiplication for Java
  // (destroys contents of eax, ebx, ecx and edx)
  void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y

  // Long shifts for Java
  // (semantics as described in JVM spec.)
  void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
  void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)

  // Long compare for Java
  // (semantics as described in JVM spec.)
  void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)


  // misc

  // Sign extension
  void sign_extend_short(Register reg);
  void sign_extend_byte(Register reg);

  // Division by power of 2, rounding towards 0
  void division_with_shift(Register reg, int shift_value);

#ifndef _LP64
  // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
  //
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
  void fcmp(Register tmp);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp(Register tmp, int index, bool pop_left, bool pop_right);

  // Floating-point comparison for Java
  // Compares the top-most stack entries on the FPU stack and stores the result in dst.
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // (semantics as described in JVM spec.)
  void fcmp2int(Register dst, bool unordered_is_less);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);

  // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
  // tmp is a temporary register, if none is available use noreg
  void fremr(Register tmp);

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
#endif // !LP64

  // dst = c = a * b + c
  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);

  void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);


  // same as fcmp2int, but using SSE2
  void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
  void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);

  // branch to L if FPU flag C2 is set/not set
  // tmp is a temporary register, if none is available use noreg
  void jC2 (Register tmp, Label& L);
  void jnC2(Register tmp, Label& L);

  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_float(Address src);

  // Store float value to 'address'. If UseSSE >= 1, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_float(Address dst);

  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_double(Address src);

  // Store double value to 'address'. If UseSSE >= 2, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_double(Address dst);

#ifndef _LP64
  // Pop ST (ffree & fincstp combined)
  void fpop();

  void empty_FPU_stack();
#endif // !_LP64

  void push_IU_state();
  void pop_IU_state();

  void push_FPU_state();
  void pop_FPU_state();

  void push_CPU_state();
  void pop_CPU_state();

  // Round up to a power of two
  void round_to(Register reg, int modulus);

private:
  // General purpose and XMM registers potentially clobbered by native code; there
  // is no need for FPU or AVX opmask related methods because C1/interpreter
  // - always saves/restores the FPU state as a whole
  // - does not care about the AVX-512 opmask registers
  static RegSet call_clobbered_gp_registers();
  static XMMRegSet call_clobbered_xmm_registers();

  void push_set(XMMRegSet set, int offset);
  void pop_set(XMMRegSet set, int offset);

public:
  void push_set(RegSet set, int offset = -1);
  void pop_set(RegSet set, int offset = -1);

  // Push and pop everything that might be clobbered by a native
  // runtime call.
  // Only save the lower 64 bits of each vector register.
  // Additional registers can be excluded in a passed RegSet.
  void push_call_clobbered_registers_except(RegSet exclude, bool save_fpu = true);
  void pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu = true);

  void push_call_clobbered_registers(bool save_fpu = true) {
    push_call_clobbered_registers_except(RegSet(), save_fpu);
  }
  void pop_call_clobbered_registers(bool restore_fpu = true) {
    pop_call_clobbered_registers_except(RegSet(), restore_fpu);
  }
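
  // Illustrative sketch ('some_runtime_fn' is a hypothetical leaf entry): wrapping
  // a native runtime call so that live caller-saved registers survive it:
  //
  //   __ push_call_clobbered_registers();
  //   __ call_VM_leaf(CAST_FROM_FN_PTR(address, some_runtime_fn));
  //   __ pop_call_clobbered_registers();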

  // allocation

  // Object / value buffer allocation...
  // Allocate instance of klass, assumes klass initialized by caller
  // new_obj prefers to be rax
  // Kills t1 and t2, preserves klass, returns the allocation in new_obj (rsi on LP64)
  void allocate_instance(Register klass, Register new_obj,
                         Register t1, Register t2,
                         bool clear_fields, Label& alloc_failed);

  void eden_allocate(
    Register thread,                   // Current thread
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void tlab_allocate(
    Register thread,                   // Current thread
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
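
  // Illustrative sketch (registers, instance size and thread register are
  // assumptions of the example): a fixed-size fast-path allocation that falls
  // back to a slow path when the TLAB cannot satisfy the request:
  //
  //   Label slow_case, done;
  //   __ tlab_allocate(r15_thread, rax, noreg, instance_size_in_bytes, rbx, rcx, slow_case);
  //   // ... initialize header and fields of the new object in rax ...
  //   __ jmp(done);
  //   __ bind(slow_case);   // call into the runtime allocator here
  //   __ bind(done);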

  // For field "index" within "klass", return inline_klass ...
  void get_inline_type_field_klass(Register klass, Register index, Register inline_klass);

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface,
                               bool return_method = true);

  // virtual method calling
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be NULL, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                                     RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);
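
  // Illustrative sketch (labels and registers assumed): using the combined
  // version, which branches to L_success and falls through when the check fails:
  //
  //   Label ok;
  //   __ check_klass_subtype(rsi, rax, rcx, ok);
  //   // fall through: rsi is not a subtype of rax (e.g. jump to a throw path here)
  //   __ bind(ok);   // rsi is a subtype of rax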

  void clinit_barrier(Register klass,
                      Register thread,
                      Label* L_fast_path = NULL,
                      Label* L_slow_path = NULL);

  // method handles (JSR 292)
  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

  // Debugging

  // only if +VerifyOops
  void _verify_oop(Register reg, const char* s, const char* file, int line);
  void _verify_oop_addr(Address addr, const char* s, const char* file, int line);

  void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
    if (VerifyOops) {
      _verify_oop(reg, s, file, line);
    }
  }
  void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
    if (VerifyOops) {
      _verify_oop_addr(reg, s, file, line);
    }
  }

  // TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
  void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line) {}

#define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
#define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
#define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
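
  // Illustrative use: these macros emit no code unless +VerifyOops is set, e.g.
  //
  //   verify_oop(rax);                   // checks that rax holds a plausible oop
  //   verify_oop_msg(rax, "receiver");   // same, with an extra tag in the failure message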

  // Verify or restore cpu control state after JNI call
  void restore_cpu_control_state_after_jni();

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  // prints msg and continues
  void warn(const char* msg);

  // dumps registers and other state
  void print_state();

  static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
  static void debug64(char* msg, int64_t pc, int64_t regs[]);
  static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
  static void print_state64(int64_t pc, int64_t regs[]);

  void os_breakpoint();

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "");

  void should_not_reach_here()                   { stop("should not reach here"); }

  void print_CPU_state();

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with negative offset");
    movl(Address(rsp, (-offset)), rax);
  }
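
  // Illustrative sketch (page size and page count are assumptions): stack-banging
  // a shadow zone is done one page at a time with increasing offsets:
  //
  //   for (int pages = 1; pages <= shadow_zone_pages; pages++) {
  //     __ bang_stack_with_offset(pages * page_size);
  //   }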

  // Writes to stack successive pages until offset reached to check for
  // stack overflow + shadow pages.  Also, clobbers tmp
  void bang_stack_size(Register size, Register tmp);

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  void safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod);

  void verify_tlab();

  Condition negate_condition(Condition cond);

  // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
  // operands. In general the names are modified to avoid hiding the instruction in Assembler
  // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
  // here in MacroAssembler. The major exception to this rule is call.

  // Arithmetics


  void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
  void addptr(Address dst, Register src);

  void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
  void addptr(Register dst, int32_t src);
  void addptr(Register dst, Register src);
  void addptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) addptr(dst, (int) src.as_constant());
    else                   addptr(dst,       src.as_register());
  }

  void andptr(Register dst, int32_t src);
  void andptr(Register dst, Register src) { LP64_ONLY(andq(dst, src)) NOT_LP64(andl(dst, src)) ; }
  void andptr(Register dst, Address src) { LP64_ONLY(andq(dst, src)) NOT_LP64(andl(dst, src)) ; }

  void cmp8(AddressLiteral src1, int imm);

  // renamed to drag out the casting of address to int32_t/intptr_t
  void cmp32(Register src1, int32_t imm);

  void cmp32(AddressLiteral src1, int32_t imm);
  // compare reg - mem, or reg - &mem
  void cmp32(Register src1, AddressLiteral src2);

  void cmp32(Register src1, Address src2);

#ifndef _LP64
  void cmpklass(Address dst, Metadata* obj);
  void cmpklass(Register dst, Metadata* obj);
  void cmpoop(Address dst, jobject obj);
#endif // !_LP64

  void cmpoop(Register src1, Register src2);
  void cmpoop(Register src1, Address src2);
  void cmpoop(Register dst, jobject obj);

  // NOTE src2 must be the lval. This is NOT a mem-mem compare
  void cmpptr(Address src1, AddressLiteral src2);

  void cmpptr(Register src1, AddressLiteral src2);

  void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  // cmp64 to avoid hiding cmpq
  void cmp64(Register src1, AddressLiteral src);

  void cmpxchgptr(Register reg, Address adr);

  void locked_cmpxchgptr(Register reg, AddressLiteral adr);


  void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
  void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }


  void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }

  void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }

  void shlptr(Register dst, int32_t shift);
  void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }

  void shrptr(Register dst, int32_t shift);
  void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }

  void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
  void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }

  void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }

  void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
  void subptr(Register dst, int32_t src);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void subptr_imm32(Register dst, int32_t src);
  void subptr(Register dst, Register src);
  void subptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) subptr(dst, (int) src.as_constant());
    else                   subptr(dst,       src.as_register());
  }

  void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
  void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }

  void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
  void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }

  void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }



  // Helper functions for statistics gathering.
  // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
  void cond_inc32(Condition cond, AddressLiteral counter_addr);
  // Unconditional atomic increment.
  void atomic_incl(Address counter_addr);
  void atomic_incl(AddressLiteral counter_addr, Register scr = rscratch1);
#ifdef _LP64
  void atomic_incq(Address counter_addr);
  void atomic_incq(AddressLiteral counter_addr, Register scr = rscratch1);
#endif
  void atomic_incptr(AddressLiteral counter_addr, Register scr = rscratch1) { LP64_ONLY(atomic_incq(counter_addr, scr)) NOT_LP64(atomic_incl(counter_addr, scr)) ; }
  void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }
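
  // Illustrative sketch ('some_counter' is a hypothetical VM counter): bump a
  // statistics counter only when the preceding compare set ZF, without disturbing
  // the condition codes a later jcc may still need:
  //
  //   __ cond_inc32(Assembler::zero, ExternalAddress((address)&some_counter));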

  void lea(Register dst, AddressLiteral adr);
  void lea(Address dst, AddressLiteral adr);
  void lea(Register dst, Address adr) { Assembler::lea(dst, adr); }

  void leal32(Register dst, Address src) { leal(dst, src); }

  // Import other testl() methods from the parent class or else
  // they will be hidden by the following overriding declaration.
  using Assembler::testl;
  void testl(Register dst, AddressLiteral src);

  void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }

  void testptr(Register src, int32_t imm32) { LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
  void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
  void testptr(Register src1, Register src2);

  void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
  void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }

  // Calls

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register entry);
  void call(Address addr) { Assembler::call(addr); }

  // NOTE: this call transfers to the effective address of entry NOT
  // the address contained by entry. This is because this is more natural
  // for jumps/calls.
  void call(AddressLiteral entry);

  // Emit the CompiledIC call idiom
  void ic_call(address entry, jint method_index = 0);

  // Jumps

  // NOTE: these jumps transfer to the effective address of dst NOT
  // the address contained by dst. This is because this is more natural
  // for jumps/calls.
  void jump(AddressLiteral dst);
  void jump_cc(Condition cc, AddressLiteral dst);

  // 32bit can do a case table jump in one instruction but we no longer allow the base
  // to be installed in the Address class. This jump transfers to the address
  // contained in the location described by entry (not the address of entry).
  void jump(ArrayAddress entry);

  // Floating

  void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); }
  void andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
  void andpd(XMMRegister dst, XMMRegister src) { Assembler::andpd(dst, src); }

  void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); }
  void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); }
  void andps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

  void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, AddressLiteral src);

  void comisd(XMMRegister dst, XMMRegister src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, Address src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, AddressLiteral src);

#ifndef _LP64
  void fadd_s(Address src)        { Assembler::fadd_s(src); }
  void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }

  void fldcw(Address src) { Assembler::fldcw(src); }
  void fldcw(AddressLiteral src);

  void fld_s(int index)   { Assembler::fld_s(index); }
  void fld_s(Address src) { Assembler::fld_s(src); }
  void fld_s(AddressLiteral src);

  void fld_d(Address src) { Assembler::fld_d(src); }
  void fld_d(AddressLiteral src);

  void fmul_s(Address src)        { Assembler::fmul_s(src); }
  void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }
#endif // !_LP64
 964 
 965   void fld_x(Address src) { Assembler::fld_x(src); }
 966   void fld_x(AddressLiteral src);
 967 
 968   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
 969   void ldmxcsr(AddressLiteral src);
 970 
 971 #ifdef _LP64
 972  private:
 973   void sha256_AVX2_one_round_compute(
 974     Register  reg_old_h,
 975     Register  reg_a,
 976     Register  reg_b,
 977     Register  reg_c,
 978     Register  reg_d,
 979     Register  reg_e,
 980     Register  reg_f,
 981     Register  reg_g,
 982     Register  reg_h,
 983     int iter);
 984   void sha256_AVX2_four_rounds_compute_first(int start);
 985   void sha256_AVX2_four_rounds_compute_last(int start);
 986   void sha256_AVX2_one_round_and_sched(
 987         XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
 988         XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
 989         XMMRegister xmm_2,     /* ymm6 */
 990         XMMRegister xmm_3,     /* ymm7 */
 991         Register    reg_a,      /* == eax on 0 iteration, then rotate 8 register right on each next iteration */
 992         Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
 993         Register    reg_c,      /* edi */
 994         Register    reg_d,      /* esi */
 995         Register    reg_e,      /* r8d */
 996         Register    reg_f,      /* r9d */
 997         Register    reg_g,      /* r10d */
 998         Register    reg_h,      /* r11d */
 999         int iter);
1000 
1001   void addm(int disp, Register r1, Register r2);
1002   void gfmul(XMMRegister tmp0, XMMRegister t);
1003   void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
1004                      XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
1005   void generateHtbl_one_block(Register htbl);
1006   void generateHtbl_eight_blocks(Register htbl);
1007  public:
1008   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1009                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1010                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1011                    bool multi_block, XMMRegister shuf_mask);
1012   void avx_ghash(Register state, Register htbl, Register data, Register blocks);
1013 #endif
1014 
1015 #ifdef _LP64
1016  private:
1017   void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
1018                                      Register e, Register f, Register g, Register h, int iteration);
1019 
1020   void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1021                                           Register a, Register b, Register c, Register d, Register e, Register f,
1022                                           Register g, Register h, int iteration);
1023 
1024   void addmq(int disp, Register r1, Register r2);
1025  public:
1026   void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1027                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1028                    Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
1029                    XMMRegister shuf_mask);
1030 private:
1031   void roundEnc(XMMRegister key, int rnum);
1032   void lastroundEnc(XMMRegister key, int rnum);
1033   void roundDec(XMMRegister key, int rnum);
1034   void lastroundDec(XMMRegister key, int rnum);
1035   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
1036   void gfmul_avx512(XMMRegister ghash, XMMRegister hkey);
1037   void generateHtbl_48_block_zmm(Register htbl, Register avx512_subkeyHtbl);
1038   void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx,
1039                                   XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction,
1040                                   XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos,
1041                                   bool final_reduction, int index, XMMRegister counter_inc_mask);
1042 public:
1043   void aesecb_encrypt(Register source_addr, Register dest_addr, Register key, Register len);
1044   void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len);
1045   void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
1046                       Register len_reg, Register used, Register used_addr, Register saved_encCounter_start);
1047   void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
1048                       Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);
1049 
1050 #endif
1051 
1052   void fast_md5(Register buf, Address state, Address ofs, Address limit,
1053                 bool multi_block);
1054 
1055   void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
1056                  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
1057                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1058                  bool multi_block);
1059 
1060 #ifdef _LP64
1061   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1062                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1063                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1064                    bool multi_block, XMMRegister shuf_mask);
1065 #else
1066   void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1067                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1068                    Register buf, Register state, Register ofs, Register limit, Register rsp,
1069                    bool multi_block);
1070 #endif
1071 
1072   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1073                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1074                 Register rax, Register rcx, Register rdx, Register tmp);
1075 
1076 #ifdef _LP64
1077   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1078                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1079                 Register rax, Register rcx, Register rdx, Register tmp1, Register tmp2);
1080 
1081   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1082                   XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1083                   Register rax, Register rcx, Register rdx, Register r11);
1084 
1085   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1086                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1087                 Register rdx, Register tmp1, Register tmp2, Register tmp3, Register tmp4);
1088 
1089   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1090                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1091                 Register rax, Register rbx, Register rcx, Register rdx, Register tmp1, Register tmp2,
1092                 Register tmp3, Register tmp4);
1093 
1094   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1095                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1096                 Register rax, Register rcx, Register rdx, Register tmp1,
1097                 Register tmp2, Register tmp3, Register tmp4);
1098   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1099                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1100                 Register rax, Register rcx, Register rdx, Register tmp1,
1101                 Register tmp2, Register tmp3, Register tmp4);
1102 #else
1103   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1104                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1105                 Register rax, Register rcx, Register rdx, Register tmp1);
1106 
1107   void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1108                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1109                 Register rax, Register rcx, Register rdx, Register tmp);
1110 
1111   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1112                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
1113                 Register rdx, Register tmp);
1114 
1115   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1116                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1117                 Register rax, Register rbx, Register rdx);
1118 
1119   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1120                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1121                 Register rax, Register rcx, Register rdx, Register tmp);
1122 
1123   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1124                         Register edx, Register ebx, Register esi, Register edi,
1125                         Register ebp, Register esp);
1126 
1127   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
1128                          Register esi, Register edi, Register ebp, Register esp);
1129 
1130   void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
1131                         Register edx, Register ebx, Register esi, Register edi,
1132                         Register ebp, Register esp);
1133 
1134   void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
1135                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
1136                 Register rax, Register rcx, Register rdx, Register tmp);
1137 #endif
1138 
1139 private:
1140 
1141   // these are private because users should use movflt/movdbl instead
1142 
1143   void movss(XMMRegister dst, XMMRegister src) { Assembler::movss(dst, src); }
1144   void movss(Address dst, XMMRegister src)     { Assembler::movss(dst, src); }
1145   void movss(XMMRegister dst, Address src)     { Assembler::movss(dst, src); }
1146   void movss(XMMRegister dst, AddressLiteral src);
1147 
1148   void movlpd(XMMRegister dst, Address src)    {Assembler::movlpd(dst, src); }
1149   void movlpd(XMMRegister dst, AddressLiteral src);
1150 
1151 public:
1152 
1153   void addsd(XMMRegister dst, XMMRegister src)    { Assembler::addsd(dst, src); }
1154   void addsd(XMMRegister dst, Address src)        { Assembler::addsd(dst, src); }
1155   void addsd(XMMRegister dst, AddressLiteral src);
1156 
1157   void addss(XMMRegister dst, XMMRegister src)    { Assembler::addss(dst, src); }
1158   void addss(XMMRegister dst, Address src)        { Assembler::addss(dst, src); }
1159   void addss(XMMRegister dst, AddressLiteral src);
1160 
1161   void addpd(XMMRegister dst, XMMRegister src)    { Assembler::addpd(dst, src); }
1162   void addpd(XMMRegister dst, Address src)        { Assembler::addpd(dst, src); }
1163   void addpd(XMMRegister dst, AddressLiteral src);
1164 
1165   void divsd(XMMRegister dst, XMMRegister src)    { Assembler::divsd(dst, src); }
1166   void divsd(XMMRegister dst, Address src)        { Assembler::divsd(dst, src); }
1167   void divsd(XMMRegister dst, AddressLiteral src);
1168 
1169   void divss(XMMRegister dst, XMMRegister src)    { Assembler::divss(dst, src); }
1170   void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
1171   void divss(XMMRegister dst, AddressLiteral src);
1172 
1173   // Move Unaligned Double Quadword
1174   void movdqu(Address     dst, XMMRegister src);
1175   void movdqu(XMMRegister dst, Address src);
1176   void movdqu(XMMRegister dst, XMMRegister src);
1177   void movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg = rscratch1);
1178 
1179   void kmovwl(KRegister dst, Register src) { Assembler::kmovwl(dst, src); }
1180   void kmovwl(Register dst, KRegister src) { Assembler::kmovwl(dst, src); }
1181   void kmovwl(KRegister dst, Address src) { Assembler::kmovwl(dst, src); }
1182   void kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
1183   void kmovwl(Address dst,  KRegister src) { Assembler::kmovwl(dst, src); }
1184   void kmovwl(KRegister dst, KRegister src) { Assembler::kmovwl(dst, src); }
1185 
1186   void kmovql(KRegister dst, KRegister src) { Assembler::kmovql(dst, src); }
1187   void kmovql(KRegister dst, Register src) { Assembler::kmovql(dst, src); }
1188   void kmovql(Register dst, KRegister src) { Assembler::kmovql(dst, src); }
1189   void kmovql(KRegister dst, Address src) { Assembler::kmovql(dst, src); }
1190   void kmovql(Address  dst, KRegister src) { Assembler::kmovql(dst, src); }
1191   void kmovql(KRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
1192 
1193   // Safe move operation: lowers to 16-bit mask moves on targets that support
1194   // AVX512F and to 64-bit mask moves on targets that also support AVX512BW.
1195   void kmov(Address  dst, KRegister src);
1196   void kmov(KRegister dst, Address src);
1197   void kmov(KRegister dst, KRegister src);
1198   void kmov(Register dst, KRegister src);
1199   void kmov(KRegister dst, Register src);
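  // For example, kmov(k1, rax) is expected to lower to kmovql on AVX512BW-capable
  // targets and to kmovwl otherwise (an illustration of the selection described
  // above, not a guarantee of the exact instruction chosen).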
1200 
1201   // AVX Unaligned forms
1202   void vmovdqu(Address     dst, XMMRegister src);
1203   void vmovdqu(XMMRegister dst, Address src);
1204   void vmovdqu(XMMRegister dst, XMMRegister src);
1205   void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
1206   void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len);
1207 
1208 
1209   // AVX512 Unaligned
1210   void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);
1211   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
1212 
1213   void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
1214   void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
1215   void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
1216   void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1217   void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
1218   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
1219 
1220   void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); }
1221   void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1222   void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); }
1223   void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
1224   void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
1225 
1226   void evmovdqul(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1227   void evmovdqul(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
1228   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
1229     if (dst->encoding() == src->encoding()) return;
1230     Assembler::evmovdqul(dst, src, vector_len);
1231   }
1232   void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1233   void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
1234   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1235     if (dst->encoding() == src->encoding() && mask == k0) return;
1236     Assembler::evmovdqul(dst, mask, src, merge, vector_len);
1237   }
1238   void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
1239 
1240   void evmovdquq(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1241   void evmovdquq(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
1242   void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch);
1243   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
1244     if (dst->encoding() == src->encoding()) return;
1245     Assembler::evmovdquq(dst, src, vector_len);
1246   }
1247   void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1248   void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
1249   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
1250     if (dst->encoding() == src->encoding() && mask == k0) return;
1251     Assembler::evmovdquq(dst, mask, src, merge, vector_len);
1252   }
1253   void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
1254 
1255   // Move Aligned Double Quadword
1256   void movdqa(XMMRegister dst, Address src)       { Assembler::movdqa(dst, src); }
1257   void movdqa(XMMRegister dst, XMMRegister src)   { Assembler::movdqa(dst, src); }
1258   void movdqa(XMMRegister dst, AddressLiteral src);
1259 
1260   void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
1261   void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
1262   void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
1263   void movsd(XMMRegister dst, AddressLiteral src);
1264 
1265   using Assembler::vmovddup;
1266   void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
1267 
1268   void mulpd(XMMRegister dst, XMMRegister src)    { Assembler::mulpd(dst, src); }
1269   void mulpd(XMMRegister dst, Address src)        { Assembler::mulpd(dst, src); }
1270   void mulpd(XMMRegister dst, AddressLiteral src);
1271 
1272   void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
1273   void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
1274   void mulsd(XMMRegister dst, AddressLiteral src);
1275 
1276   void mulss(XMMRegister dst, XMMRegister src)    { Assembler::mulss(dst, src); }
1277   void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
1278   void mulss(XMMRegister dst, AddressLiteral src);
1279 
1280   // Carry-Less Multiplication Quadword
1281   void pclmulldq(XMMRegister dst, XMMRegister src) {
1282     // 0x00 - multiply lower 64 bits [0:63]
1283     Assembler::pclmulqdq(dst, src, 0x00);
1284   }
1285   void pclmulhdq(XMMRegister dst, XMMRegister src) {
1286     // 0x11 - multiply upper 64 bits [64:127]
1287     Assembler::pclmulqdq(dst, src, 0x11);
1288   }
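  // For example, pclmulldq(xmm0, xmm1) leaves in xmm0 the 128-bit carry-less
  // product of the low 64 bits of xmm0 and xmm1, while pclmulhdq multiplies the
  // two upper halves instead (a usage note based on the imm8 selectors above).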
1289 
1290   void pcmpeqb(XMMRegister dst, XMMRegister src);
1291   void pcmpeqw(XMMRegister dst, XMMRegister src);
1292 
1293   void pcmpestri(XMMRegister dst, Address src, int imm8);
1294   void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
1295 
1296   void pmovzxbw(XMMRegister dst, XMMRegister src);
1297   void pmovzxbw(XMMRegister dst, Address src);
1298 
1299   void pmovmskb(Register dst, XMMRegister src);
1300 
1301   void ptest(XMMRegister dst, XMMRegister src);
1302 
1303   void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
1304   void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
1305   void sqrtsd(XMMRegister dst, AddressLiteral src);
1306 
1307   void roundsd(XMMRegister dst, XMMRegister src, int32_t rmode)    { Assembler::roundsd(dst, src, rmode); }
1308   void roundsd(XMMRegister dst, Address src, int32_t rmode)        { Assembler::roundsd(dst, src, rmode); }
1309   void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg);
1310 
1311   void sqrtss(XMMRegister dst, XMMRegister src)    { Assembler::sqrtss(dst, src); }
1312   void sqrtss(XMMRegister dst, Address src)        { Assembler::sqrtss(dst, src); }
1313   void sqrtss(XMMRegister dst, AddressLiteral src);
1314 
1315   void subsd(XMMRegister dst, XMMRegister src)    { Assembler::subsd(dst, src); }
1316   void subsd(XMMRegister dst, Address src)        { Assembler::subsd(dst, src); }
1317   void subsd(XMMRegister dst, AddressLiteral src);
1318 
1319   void subss(XMMRegister dst, XMMRegister src)    { Assembler::subss(dst, src); }
1320   void subss(XMMRegister dst, Address src)        { Assembler::subss(dst, src); }
1321   void subss(XMMRegister dst, AddressLiteral src);
1322 
1323   void ucomiss(XMMRegister dst, XMMRegister src) { Assembler::ucomiss(dst, src); }
1324   void ucomiss(XMMRegister dst, Address src)     { Assembler::ucomiss(dst, src); }
1325   void ucomiss(XMMRegister dst, AddressLiteral src);
1326 
1327   void ucomisd(XMMRegister dst, XMMRegister src) { Assembler::ucomisd(dst, src); }
1328   void ucomisd(XMMRegister dst, Address src)     { Assembler::ucomisd(dst, src); }
1329   void ucomisd(XMMRegister dst, AddressLiteral src);
1330 
1331   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
1332   void xorpd(XMMRegister dst, XMMRegister src);
1333   void xorpd(XMMRegister dst, Address src)     { Assembler::xorpd(dst, src); }
1334   void xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
1335 
1336   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
1337   void xorps(XMMRegister dst, XMMRegister src);
1338   void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
1339   void xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
1340 
1341   // Shuffle Bytes
1342   void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); }
1343   void pshufb(XMMRegister dst, Address src)     { Assembler::pshufb(dst, src); }
1344   void pshufb(XMMRegister dst, AddressLiteral src);
1345   // AVX 3-operands instructions
1346 
1347   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
1348   void vaddsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddsd(dst, nds, src); }
1349   void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1350 
1351   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddss(dst, nds, src); }
1352   void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
1353   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1354 
1355   void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);
1356   void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);
1357 
1358   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1359   void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1360   void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch);
1361 
1362   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1363   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1364 
1365   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1366   void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
1367   void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch);
1368 
1369   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1370   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
1371   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);
1372 
1373   void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
1374   void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); }
1375 
1376   using Assembler::vbroadcastsd;
1377   void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
1378 
1379   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1380 
1381   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1382   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);
1383 
1384   // Vector compares
1385   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
1386                int comparison, bool is_signed, int vector_len) { Assembler::evpcmpd(kdst, mask, nds, src, comparison, is_signed, vector_len); }
1387   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
1388                int comparison, bool is_signed, int vector_len, Register scratch_reg);
1389   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
1390                int comparison, bool is_signed, int vector_len) { Assembler::evpcmpq(kdst, mask, nds, src, comparison, is_signed, vector_len); }
1391   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
1392                int comparison, bool is_signed, int vector_len, Register scratch_reg);
1393   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
1394                int comparison, bool is_signed, int vector_len) { Assembler::evpcmpb(kdst, mask, nds, src, comparison, is_signed, vector_len); }
1395   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
1396                int comparison, bool is_signed, int vector_len, Register scratch_reg);
1397   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
1398                int comparison, bool is_signed, int vector_len) { Assembler::evpcmpw(kdst, mask, nds, src, comparison, is_signed, vector_len); }
1399   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
1400                int comparison, bool is_signed, int vector_len, Register scratch_reg);
1401 
1402   void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
1403 
1404   // Emit a comparison instruction for the specified comparison predicate.
1405   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
1406   void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
1407 
1408   void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
1409   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
1410 
1411   void vpmovmskb(Register dst, XMMRegister src, int vector_len = Assembler::AVX_256bit);
1412 
1413   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1414   void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1415   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1416     Assembler::vpmulld(dst, nds, src, vector_len);
1417   }
1418   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1419     Assembler::vpmulld(dst, nds, src, vector_len);
1420   }
1421   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);
1422 
1423   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1424   void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1425 
1426   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1427   void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1428 
1429   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1430   void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1431 
1432   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1433   void evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1434 
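  // The following wrappers emit the uniform-count EVEX shift when is_varshift
  // is false and the variable (per-element) count form when it is true; 'src'
  // holds the shift count(s) in both cases, 'nds' the data being shifted.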
1435   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1436     if (!is_varshift) {
1437       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
1438     } else {
1439       Assembler::evpsllvw(dst, mask, nds, src, merge, vector_len);
1440     }
1441   }
1442   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1443     if (!is_varshift) {
1444       Assembler::evpslld(dst, mask, nds, src, merge, vector_len);
1445     } else {
1446       Assembler::evpsllvd(dst, mask, nds, src, merge, vector_len);
1447     }
1448   }
1449   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1450     if (!is_varshift) {
1451       Assembler::evpsllq(dst, mask, nds, src, merge, vector_len);
1452     } else {
1453       Assembler::evpsllvq(dst, mask, nds, src, merge, vector_len);
1454     }
1455   }
1456   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1457     if (!is_varshift) {
1458       Assembler::evpsrlw(dst, mask, nds, src, merge, vector_len);
1459     } else {
1460       Assembler::evpsrlvw(dst, mask, nds, src, merge, vector_len);
1461     }
1462   }
1463   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1464     if (!is_varshift) {
1465       Assembler::evpsrld(dst, mask, nds, src, merge, vector_len);
1466     } else {
1467       Assembler::evpsrlvd(dst, mask, nds, src, merge, vector_len);
1468     }
1469   }
1470   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1471     if (!is_varshift) {
1472       Assembler::evpsrlq(dst, mask, nds, src, merge, vector_len);
1473     } else {
1474       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
1475     }
1476   }
1477   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1478     if (!is_varshift) {
1479       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
1480     } else {
1481       Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
1482     }
1483   }
1484   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1485     if (!is_varshift) {
1486       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
1487     } else {
1488       Assembler::evpsravd(dst, mask, nds, src, merge, vector_len);
1489     }
1490   }
1491   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
1492     if (!is_varshift) {
1493       Assembler::evpsraq(dst, mask, nds, src, merge, vector_len);
1494     } else {
1495       Assembler::evpsravq(dst, mask, nds, src, merge, vector_len);
1496     }
1497   }
1498 
1499   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1500   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1501   void evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1502   void evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1503 
1504   void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1505   void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1506 
1507   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
1508   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
1509 
1510   void vptest(XMMRegister dst, XMMRegister src);
1511   void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }
1512 
1513   void punpcklbw(XMMRegister dst, XMMRegister src);
1514   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
1515 
1516   void pshufd(XMMRegister dst, Address src, int mode);
1517   void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }
1518 
1519   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1520   void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
1521 
1522   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
1523   void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandpd(dst, nds, src, vector_len); }
1524   void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);
1525 
1526   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
1527   void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandps(dst, nds, src, vector_len); }
1528   void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);
1529 
1530   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
1531 
1532   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
1533   void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
1534   void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1535 
1536   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivss(dst, nds, src); }
1537   void vdivss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivss(dst, nds, src); }
1538   void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1539 
1540   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulsd(dst, nds, src); }
1541   void vmulsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulsd(dst, nds, src); }
1542   void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1543 
1544   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulss(dst, nds, src); }
1545   void vmulss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulss(dst, nds, src); }
1546   void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1547 
1548   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubsd(dst, nds, src); }
1549   void vsubsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubsd(dst, nds, src); }
1550   void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1551 
1552   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubss(dst, nds, src); }
1553   void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
1554   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1555 
1556   void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1557   void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
1558 
1559   // AVX Vector instructions
1560 
1561   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1562   void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
1563   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);
1564 
1565   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1566   void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
1567   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);
1568 
1569   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1570     if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor is available only with AVX2
1571       Assembler::vpxor(dst, nds, src, vector_len);
1572     else
1573       Assembler::vxorpd(dst, nds, src, vector_len);
1574   }
1575   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1576     if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor is available only with AVX2
1577       Assembler::vpxor(dst, nds, src, vector_len);
1578     else
1579       Assembler::vxorpd(dst, nds, src, vector_len);
1580   }
1581   void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);
1582 
1583   // Simple version for AVX2 256bit vectors
1584   void vpxor(XMMRegister dst, XMMRegister src) {
1585     assert(UseAVX >= 2, "Should be at least AVX2");
1586     Assembler::vpxor(dst, dst, src, AVX_256bit);
1587   }
1588   void vpxor(XMMRegister dst, Address src) {
1589     assert(UseAVX >= 2, "Should be at least AVX2");
1590     Assembler::vpxor(dst, dst, src, AVX_256bit);
1591   }
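  // These two-operand forms compute dst ^= src over the full 256-bit register
  // and are intended for AVX2-only code paths (hence the asserts above).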
1592 
1593   void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
1594   void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);
1595 
1596   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
1597     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1598       Assembler::vinserti32x4(dst, nds, src, imm8);
1599     } else if (UseAVX > 1) {
1600       // vinserti128 is available only in AVX2
1601       Assembler::vinserti128(dst, nds, src, imm8);
1602     } else {
1603       Assembler::vinsertf128(dst, nds, src, imm8);
1604     }
1605   }
1606 
1607   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
1608     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1609       Assembler::vinserti32x4(dst, nds, src, imm8);
1610     } else if (UseAVX > 1) {
1611       // vinserti128 is available only in AVX2
1612       Assembler::vinserti128(dst, nds, src, imm8);
1613     } else {
1614       Assembler::vinsertf128(dst, nds, src, imm8);
1615     }
1616   }
1617 
1618   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
1619     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1620       Assembler::vextracti32x4(dst, src, imm8);
1621     } else if (UseAVX > 1) {
1622       // vextracti128 is available only in AVX2
1623       Assembler::vextracti128(dst, src, imm8);
1624     } else {
1625       Assembler::vextractf128(dst, src, imm8);
1626     }
1627   }
1628 
1629   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
1630     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1631       Assembler::vextracti32x4(dst, src, imm8);
1632     } else if (UseAVX > 1) {
1633       // vextracti128 is available only in AVX2
1634       Assembler::vextracti128(dst, src, imm8);
1635     } else {
1636       Assembler::vextractf128(dst, src, imm8);
1637     }
1638   }
1639 
1640   // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
1641   void vinserti128_high(XMMRegister dst, XMMRegister src) {
1642     vinserti128(dst, dst, src, 1);
1643   }
1644   void vinserti128_high(XMMRegister dst, Address src) {
1645     vinserti128(dst, dst, src, 1);
1646   }
1647   void vextracti128_high(XMMRegister dst, XMMRegister src) {
1648     vextracti128(dst, src, 1);
1649   }
1650   void vextracti128_high(Address dst, XMMRegister src) {
1651     vextracti128(dst, src, 1);
1652   }
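  // For example, vinserti128_high(xmm0, xmm1) copies the low 128 bits of xmm1
  // into the upper lane of the 256-bit xmm0, and vextracti128_high(xmm1, xmm0)
  // copies that upper lane back out into xmm1 (a usage note for the helpers above).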
1653 
1654   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
1655     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1656       Assembler::vinsertf32x4(dst, dst, src, 1);
1657     } else {
1658       Assembler::vinsertf128(dst, dst, src, 1);
1659     }
1660   }
1661 
1662   void vinsertf128_high(XMMRegister dst, Address src) {
1663     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1664       Assembler::vinsertf32x4(dst, dst, src, 1);
1665     } else {
1666       Assembler::vinsertf128(dst, dst, src, 1);
1667     }
1668   }
1669 
1670   void vextractf128_high(XMMRegister dst, XMMRegister src) {
1671     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1672       Assembler::vextractf32x4(dst, src, 1);
1673     } else {
1674       Assembler::vextractf128(dst, src, 1);
1675     }
1676   }
1677 
1678   void vextractf128_high(Address dst, XMMRegister src) {
1679     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1680       Assembler::vextractf32x4(dst, src, 1);
1681     } else {
1682       Assembler::vextractf128(dst, src, 1);
1683     }
1684   }
1685 
1686   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
1687   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
1688     Assembler::vinserti64x4(dst, dst, src, 1);
1689   }
1690   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
1691     Assembler::vinsertf64x4(dst, dst, src, 1);
1692   }
1693   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
1694     Assembler::vextracti64x4(dst, src, 1);
1695   }
1696   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
1697     Assembler::vextractf64x4(dst, src, 1);
1698   }
1699   void vextractf64x4_high(Address dst, XMMRegister src) {
1700     Assembler::vextractf64x4(dst, src, 1);
1701   }
1702   void vinsertf64x4_high(XMMRegister dst, Address src) {
1703     Assembler::vinsertf64x4(dst, dst, src, 1);
1704   }
1705 
1706   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
1707   void vinserti128_low(XMMRegister dst, XMMRegister src) {
1708     vinserti128(dst, dst, src, 0);
1709   }
1710   void vinserti128_low(XMMRegister dst, Address src) {
1711     vinserti128(dst, dst, src, 0);
1712   }
1713   void vextracti128_low(XMMRegister dst, XMMRegister src) {
1714     vextracti128(dst, src, 0);
1715   }
1716   void vextracti128_low(Address dst, XMMRegister src) {
1717     vextracti128(dst, src, 0);
1718   }
1719 
1720   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
1721     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1722       Assembler::vinsertf32x4(dst, dst, src, 0);
1723     } else {
1724       Assembler::vinsertf128(dst, dst, src, 0);
1725     }
1726   }
1727 
1728   void vinsertf128_low(XMMRegister dst, Address src) {
1729     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1730       Assembler::vinsertf32x4(dst, dst, src, 0);
1731     } else {
1732       Assembler::vinsertf128(dst, dst, src, 0);
1733     }
1734   }
1735 
1736   void vextractf128_low(XMMRegister dst, XMMRegister src) {
1737     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1738       Assembler::vextractf32x4(dst, src, 0);
1739     } else {
1740       Assembler::vextractf128(dst, src, 0);
1741     }
1742   }
1743 
1744   void vextractf128_low(Address dst, XMMRegister src) {
1745     if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
1746       Assembler::vextractf32x4(dst, src, 0);
1747     } else {
1748       Assembler::vextractf128(dst, src, 0);
1749     }
1750   }
1751 
1752   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
1753   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
1754     Assembler::vinserti64x4(dst, dst, src, 0);
1755   }
1756   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
1757     Assembler::vinsertf64x4(dst, dst, src, 0);
1758   }
1759   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
1760     Assembler::vextracti64x4(dst, src, 0);
1761   }
1762   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
1763     Assembler::vextractf64x4(dst, src, 0);
1764   }
1765   void vextractf64x4_low(Address dst, XMMRegister src) {
1766     Assembler::vextractf64x4(dst, src, 0);
1767   }
1768   void vinsertf64x4_low(XMMRegister dst, Address src) {
1769     Assembler::vinsertf64x4(dst, dst, src, 0);
1770   }
1771 
1772   // Carry-Less Multiplication Quadword
1773   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1774     // 0x00 - multiply lower 64 bits [0:63]
1775     Assembler::vpclmulqdq(dst, nds, src, 0x00);
1776   }
1777   void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1778     // 0x11 - multiply upper 64 bits [64:127]
1779     Assembler::vpclmulqdq(dst, nds, src, 0x11);
1780   }
1781   void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1782     // 0x10 - multiply nds[0:63] and src[64:127]
1783     Assembler::vpclmulqdq(dst, nds, src, 0x10);
1784   }
1785   void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
1786     //0x01 - multiply nds[64:127] and src[0:63]
1787     Assembler::vpclmulqdq(dst, nds, src, 0x01);
1788   }
1789 
1790   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1791     // 0x00 - multiply lower 64 bits [0:63]
1792     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
1793   }
1794   void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1795     // 0x11 - multiply upper 64 bits [64:127]
1796     Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
1797   }
1798 
1799   // AVX-512 mask operations.
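  // masklen gives the number of mask bits operated on and selects the byte,
  // word, dword or qword form of the underlying k-instruction (a reading of
  // these declarations, not a guarantee).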
1800   void kand(BasicType etype, KRegister dst, KRegister src1, KRegister src2);
1801   void kor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1802   void knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp = knoreg, Register rtmp = noreg);
1803   void kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2);
1804   void kortest(uint masklen, KRegister src1, KRegister src2);
1805   void ktest(uint masklen, KRegister src1, KRegister src2);
1806 
1807   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1808   void evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1809 
1810   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1811   void evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1812 
1813   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1814   void evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1815 
1816   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
1817   void evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
1818 
1819   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1820   void evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1821   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc);
1822   void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc);
1823 
1824   void alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch);
1825   void anytrue(Register dst, uint masklen, KRegister src, KRegister kscratch);
1826 
1827   void cmov32( Condition cc, Register dst, Address  src);
1828   void cmov32( Condition cc, Register dst, Register src);
1829 
1830   void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }
1831 
1832   void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1833   void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
1834 
1835   void movoop(Register dst, jobject obj);
1836   void movoop(Address dst, jobject obj);
1837 
1838   void mov_metadata(Register dst, Metadata* obj);
1839   void mov_metadata(Address dst, Metadata* obj);
1840 
1841   void movptr(ArrayAddress dst, Register src);
1842   // can this do an lea?
1843   void movptr(Register dst, ArrayAddress src);
1844 
1845   void movptr(Register dst, Address src);
1846 
1847 #ifdef _LP64
1848   void movptr(Register dst, AddressLiteral src, Register scratch=rscratch1);
1849 #else
1850   void movptr(Register dst, AddressLiteral src, Register scratch=noreg); // Scratch reg is ignored in 32-bit
1851 #endif
1852 
1853   void movptr(Register dst, intptr_t src);
1854   void movptr(Register dst, Register src);
1855   void movptr(Address dst, intptr_t src);
1856 
1857   void movptr(Address dst, Register src);
1858 
1859   void movptr(Register dst, RegisterOrConstant src) {
1860     if (src.is_constant()) movptr(dst, src.as_constant());
1861     else                   movptr(dst, src.as_register());
1862   }
1863 
1864 #ifdef _LP64
1865   // Generally the next two are only used for moving NULL, although there
1866   // are situations when initializing the mark word where they could be
1867   // used. They are dangerous.
1868 
1869   // They only exist on LP64, where int32_t and intptr_t are distinct types;
1870   // on 32-bit they would be the same and the declarations would be ambiguous.
1871 
1872   void movptr(Address dst, int32_t imm32);
1873   void movptr(Register dst, int32_t imm32);
1874 #endif // _LP64
1875 
1876   // to avoid hiding movl
1877   void mov32(AddressLiteral dst, Register src);
1878   void mov32(Register dst, AddressLiteral src);
1879 
1880   // to avoid hiding movb
1881   void movbyte(ArrayAddress dst, int src);
1882 
1883   // Import other mov() methods from the parent class or else
1884   // they will be hidden by the following overriding declarations.
1885   using Assembler::movdl;
1886   using Assembler::movq;
1887   void movdl(XMMRegister dst, AddressLiteral src);
1888   void movq(XMMRegister dst, AddressLiteral src);
1889 
1890   // Can push value or effective address
1891   void pushptr(AddressLiteral src);
1892 
1893   void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
1894   void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }
1895 
1896   void pushoop(jobject obj);
1897   void pushklass(Metadata* obj);
1898 
1899   // sign extend as needed from an 'l' (32-bit) value to a ptr sized element
1900   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
1901   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
1902 
1903 
1904  public:
1905   // C2 compiled method's prolog code.
1906   void verified_entry(Compile* C, int sp_inc = 0);
1907 
1908   // Inline type specific methods
1909   #include "asm/macroAssembler_common.hpp"
1910 
1911   int store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter = true);
1912   bool move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]);
1913   bool unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
1914                             VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
1915                             RegState reg_state[]);
1916   bool pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
1917                           VMRegPair* from, int from_count, int& from_index, VMReg to,
1918                           RegState reg_state[], Register val_array);
1919   int extend_stack_for_inline_args(int args_on_stack);
1920   void remove_frame(int initial_framesize, bool needs_stack_repair);
1921   VMReg spill_reg_for(VMReg reg);
1922 
1923   // clear memory of size 'cnt' qwords, starting at 'base';
1924   // if 'is_large' is set, do not try to produce a short loop
1925   void clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only, KRegister mask=knoreg);
1926 
1927   // clear memory initialization sequence for constant size;
1928   void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1929 
1930   // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
1931   void xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
1932 
1933   // Fill primitive arrays
1934   void generate_fill(BasicType t, bool aligned,
1935                      Register to, Register value, Register count,
1936                      Register rtmp, XMMRegister xtmp);
1937 
1938   void encode_iso_array(Register src, Register dst, Register len,
1939                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
1940                         XMMRegister tmp4, Register tmp5, Register result, bool ascii);
1941 
1942 #ifdef _LP64
1943   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
1944   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
1945                              Register y, Register y_idx, Register z,
1946                              Register carry, Register product,
1947                              Register idx, Register kdx);
1948   void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
1949                               Register yz_idx, Register idx,
1950                               Register carry, Register product, int offset);
1951   void multiply_128_x_128_bmi2_loop(Register y, Register z,
1952                                     Register carry, Register carry2,
1953                                     Register idx, Register jdx,
1954                                     Register yz_idx1, Register yz_idx2,
1955                                     Register tmp, Register tmp3, Register tmp4);
1956   void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
1957                                Register yz_idx, Register idx, Register jdx,
1958                                Register carry, Register product,
1959                                Register carry2);
1960   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
1961                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
1962   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
1963                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1964   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
1965                             Register tmp2);
1966   void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
1967                        Register rdxReg, Register raxReg);
1968   void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
1969   void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1970                        Register tmp3, Register tmp4);
1971   void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
1972                      Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
1973 
1974   void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
1975                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1976                Register raxReg);
1977   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
1978                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
1979                Register raxReg);
1980   void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
1981                            Register result, Register tmp1, Register tmp2,
1982                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
1983 #endif
1984 
1985   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
1986   void update_byte_crc32(Register crc, Register val, Register table);
1987   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
1988 
1989 
1990 #ifdef _LP64
1991   void kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2);
1992   void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
1993                                 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
1994                                 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
1995   void updateBytesAdler32(Register adler32, Register buf, Register length, XMMRegister shuf0, XMMRegister shuf1, ExternalAddress scale);
1996 #endif // _LP64
1997 
1998   // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
1999   // Note on a naming convention:
2000   // Prefix w = register only used on a Westmere+ architecture
2001   // Prefix n = register only used on a Nehalem architecture
2002 #ifdef _LP64
2003   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2004                        Register tmp1, Register tmp2, Register tmp3);
2005 #else
2006   void crc32c_ipl_alg4(Register in_out, uint32_t n,
2007                        Register tmp1, Register tmp2, Register tmp3,
2008                        XMMRegister xtmp1, XMMRegister xtmp2);
2009 #endif
2010   void crc32c_pclmulqdq(XMMRegister w_xtmp1,
2011                         Register in_out,
2012                         uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
2013                         XMMRegister w_xtmp2,
2014                         Register tmp1,
2015                         Register n_tmp2, Register n_tmp3);
2016   void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
2017                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2018                        Register tmp1, Register tmp2,
2019                        Register n_tmp3);
2020   void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
2021                          Register in_out1, Register in_out2, Register in_out3,
2022                          Register tmp1, Register tmp2, Register tmp3,
2023                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2024                          Register tmp4, Register tmp5,
2025                          Register n_tmp6);
2026   void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
2027                             Register tmp1, Register tmp2, Register tmp3,
2028                             Register tmp4, Register tmp5, Register tmp6,
2029                             XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
2030                             bool is_pclmulqdq_supported);
2031   // Fold 128-bit data chunk
2032   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
2033   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
2034 #ifdef _LP64
2035   // Fold 512-bit data chunk
2036   void fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, Register pos, int offset);
2037 #endif // _LP64
2038   // Fold 8-bit data
2039   void fold_8bit_crc32(Register crc, Register table, Register tmp);
2040   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
2041 
2042   // Compress char[] array to byte[].
2043   void char_array_compress(Register src, Register dst, Register len,
2044                            XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
2045                            XMMRegister tmp4, Register tmp5, Register result,
2046                            KRegister mask1 = knoreg, KRegister mask2 = knoreg);
2047 
2048   // Inflate byte[] array to char[].
2049   void byte_array_inflate(Register src, Register dst, Register len,
2050                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
2051 
2052   void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
2053                    Register length, Register temp, int vec_enc);
2054 
2055   void fill64_masked(uint shift, Register dst, int disp,
2056                          XMMRegister xmm, KRegister mask, Register length,
2057                          Register temp, bool use64byteVector = false);
2058 
2059   void fill32_masked(uint shift, Register dst, int disp,
2060                          XMMRegister xmm, KRegister mask, Register length,
2061                          Register temp);
2062 
2063   void fill32(Register dst, int disp, XMMRegister xmm);
2064 
2065   void fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector = false);
2066 
2067 #ifdef _LP64
2068   void convert_f2i(Register dst, XMMRegister src);
2069   void convert_d2i(Register dst, XMMRegister src);
2070   void convert_f2l(Register dst, XMMRegister src);
2071   void convert_d2l(Register dst, XMMRegister src);
2072 
2073   void cache_wb(Address line);
2074   void cache_wbsync(bool is_pre);
2075 
2076 #if COMPILER2_OR_JVMCI
2077   void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
2078                                     Register to, Register count, int shift,
2079                                     Register index, Register temp,
2080                                     bool use64byteVector, Label& L_entry, Label& L_exit);
2081 
2082   void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
2083                                              Register to, Register start_index, Register end_index,
2084                                              Register count, int shift, Register temp,
2085                                              bool use64byteVector, Label& L_entry, Label& L_exit);
2086 
2087   void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
2088                          KRegister mask, Register length, Register index,
2089                          Register temp, int shift = Address::times_1, int offset = 0,
2090                          bool use64byteVector = false);
2091 
2092   void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
2093                          KRegister mask, Register length, Register index,
2094                          Register temp, int shift = Address::times_1, int offset = 0);
2095 
2096   void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
2097                   int shift = Address::times_1, int offset = 0);
2098 
2099   void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
2100                   bool conjoint, int shift = Address::times_1, int offset = 0,
2101                   bool use64byteVector = false);
2102 
2103   void generate_fill_avx3(BasicType type, Register to, Register value,
2104                           Register count, Register rtmp, XMMRegister xtmp);
2105 
2106 #endif // COMPILER2_OR_JVMCI
2107 
2108 #endif // _LP64
2109 
2110   void vallones(XMMRegister dst, int vector_len);
2111 };
2112 
2113 /**
2114  * class SkipIfEqual:
2115  *
2116  * Instantiating this class will result in assembly code being output that will
2117  * jump around any code emitted between the creation of the instance and its
2118  * automatic destruction at the end of a scope block, depending on the value of
2119  * the flag passed to the constructor, which will be checked at run-time.
2120  */
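//
// As the class name suggests, the guarded code is skipped when the byte at
// flag_addr equals 'value'. A minimal usage sketch (the flag name below is
// hypothetical):
//
//   {
//     SkipIfEqual skip_if(masm, &SomeDiagnosticFlag, false);
//     // ... instrumentation emitted here runs only when the flag is true ...
//   }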
2121 class SkipIfEqual {
2122  private:
2123   MacroAssembler* _masm;
2124   Label _label;
2125 
2126  public:
2127    SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
2128    ~SkipIfEqual();
2129 };
2130 
2131 #endif // CPU_X86_MACROASSEMBLER_X86_HPP