/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_ASSEMBLER_X86_HPP
#define CPU_X86_ASSEMBLER_X86_HPP

#include "asm/register.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/powerOfTwo.hpp"

// Contains all the definitions needed for x86 assembly code generation.

// Calling convention
class Argument {
 public:
  enum {
#ifdef _LP64
#ifdef _WIN64
    n_int_register_parameters_c   = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 4,  // xmm0 - xmm3 (c_farg0, c_farg1, ... )
    n_int_register_returns_c = 1, // rax
    n_float_register_returns_c = 1, // xmm0
#else
    n_int_register_parameters_c   = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 8,  // xmm0 - xmm7 (c_farg0, c_farg1, ... )
    n_int_register_returns_c = 2, // rax, rdx
    n_float_register_returns_c = 2, // xmm0, xmm1
#endif // _WIN64
    n_int_register_parameters_j   = 6, // j_rarg0, j_rarg1, ...
    n_float_register_parameters_j = 8  // j_farg0, j_farg1, ...
#else
    n_register_parameters = 0,   // 0 registers used to pass arguments
    n_int_register_parameters_j   = 0,
    n_float_register_parameters_j = 0
#endif // _LP64
  };
};


#ifdef _LP64
// Symbolically name the register arguments used by the c calling convention.
// Windows is different from linux/solaris. So much for standards...

#ifdef _WIN64

constexpr Register c_rarg0 = rcx;
constexpr Register c_rarg1 = rdx;
constexpr Register c_rarg2 =  r8;
constexpr Register c_rarg3 =  r9;

constexpr XMMRegister c_farg0 = xmm0;
constexpr XMMRegister c_farg1 = xmm1;
constexpr XMMRegister c_farg2 = xmm2;
constexpr XMMRegister c_farg3 = xmm3;

#else

constexpr Register c_rarg0 = rdi;
constexpr Register c_rarg1 = rsi;
constexpr Register c_rarg2 = rdx;
constexpr Register c_rarg3 = rcx;
constexpr Register c_rarg4 =  r8;
constexpr Register c_rarg5 =  r9;

constexpr XMMRegister c_farg0 = xmm0;
constexpr XMMRegister c_farg1 = xmm1;
constexpr XMMRegister c_farg2 = xmm2;
constexpr XMMRegister c_farg3 = xmm3;
constexpr XMMRegister c_farg4 = xmm4;
constexpr XMMRegister c_farg5 = xmm5;
constexpr XMMRegister c_farg6 = xmm6;
constexpr XMMRegister c_farg7 = xmm7;

#endif // _WIN64

// Symbolically name the register arguments used by the Java calling convention.
// We have control over the convention for java so we can do what we please.
// What pleases us is to offset the java calling convention so that when
// we call a suitable jni method the arguments are lined up and we don't
// have to do any shuffling. A suitable jni method is non-static and takes a
// small number of arguments (two fewer args on windows).
//
//        |-------------------------------------------------------|
//        | c_rarg0   c_rarg1  c_rarg2 c_rarg3 c_rarg4 c_rarg5    |
//        |-------------------------------------------------------|
//        | rcx       rdx      r8      r9      rdi*    rsi*       | windows (* not a c_rarg)
//        | rdi       rsi      rdx     rcx     r8      r9         | solaris/linux
//        |-------------------------------------------------------|
//        | j_rarg5   j_rarg0  j_rarg1 j_rarg2 j_rarg3 j_rarg4    |
//        |-------------------------------------------------------|
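//
// For example, a call to a non-static JNI method receives JNIEnv* in c_rarg0
// and the receiver in c_rarg1. Since j_rarg0 is aliased to c_rarg1 below, a
// receiver passed in j_rarg0 is already in the right C register, and only the
// JNIEnv* slot (c_rarg0 == j_rarg5) needs to be filled in before the call.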

constexpr Register j_rarg0 = c_rarg1;
constexpr Register j_rarg1 = c_rarg2;
constexpr Register j_rarg2 = c_rarg3;
// Windows runs out of register args here
#ifdef _WIN64
constexpr Register j_rarg3 = rdi;
constexpr Register j_rarg4 = rsi;
#else
constexpr Register j_rarg3 = c_rarg4;
constexpr Register j_rarg4 = c_rarg5;
#endif /* _WIN64 */
constexpr Register j_rarg5 = c_rarg0;

constexpr XMMRegister j_farg0 = xmm0;
constexpr XMMRegister j_farg1 = xmm1;
constexpr XMMRegister j_farg2 = xmm2;
constexpr XMMRegister j_farg3 = xmm3;
constexpr XMMRegister j_farg4 = xmm4;
constexpr XMMRegister j_farg5 = xmm5;
constexpr XMMRegister j_farg6 = xmm6;
constexpr XMMRegister j_farg7 = xmm7;

constexpr Register rscratch1 = r10;  // volatile
constexpr Register rscratch2 = r11;  // volatile

constexpr Register r12_heapbase = r12; // callee-saved
constexpr Register r15_thread   = r15; // callee-saved

#else
// rscratch1 will appear in 32bit code that is dead but of course must compile
// Using noreg ensures if the dead code is incorrectly live and executed it
// will cause an assertion failure
#define rscratch1 noreg
#define rscratch2 noreg

#endif // _LP64

// JSR 292
// On x86, the SP does not have to be saved when invoking method handle intrinsics
// or compiled lambda forms. We indicate that by setting rbp_mh_SP_save to noreg.
constexpr Register rbp_mh_SP_save = noreg;

// Address is an abstraction used to represent a memory location
// using any of the amd64 addressing modes with one object.
//
// Note: A register location is represented via a Register, not
//       via an address for efficiency & simplicity reasons.
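//
// A sketch of typical usage (illustrative only):
//
//   Address(rbp, -8)                                  // [rbp - 8]
//   Address(rbx, rcx, Address::times_8, 16)           // [rbx + rcx*8 + 16]
//   Address(rsi, rdx, Address::times(sizeof(jint)))   // [rsi + rdx*4]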

class ArrayAddress;

class Address {
 public:
  enum ScaleFactor {
    no_scale = -1,
    times_1  =  0,
    times_2  =  1,
    times_4  =  2,
    times_8  =  3,
    times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
  };
  static ScaleFactor times(int size) {
    assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
    if (size == 8)  return times_8;
    if (size == 4)  return times_4;
    if (size == 2)  return times_2;
    return times_1;
  }
  static int scale_size(ScaleFactor scale) {
    assert(scale != no_scale, "");
    assert(((1 << (int)times_1) == 1 &&
            (1 << (int)times_2) == 2 &&
            (1 << (int)times_4) == 4 &&
            (1 << (int)times_8) == 8), "");
    return (1 << (int)scale);
  }
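  // For instance, times(sizeof(jint)) yields times_4 and scale_size(times_4)
  // returns 4, so an int-array element lives at base + index*4 + disp.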

 private:
  Register         _base;
  Register         _index;
  XMMRegister      _xmmindex;
  ScaleFactor      _scale;
  int              _disp;
  bool             _isxmmindex;
  RelocationHolder _rspec;

  // Easily misused constructors make them private
  // %%% can we make these go away?
  NOT_LP64(Address(address loc, RelocationHolder spec);)
  Address(int disp, address loc, relocInfo::relocType rtype);
  Address(int disp, address loc, RelocationHolder spec);

 public:

  int disp() { return _disp; }
  // creation
  Address()
    : _base(noreg),
      _index(noreg),
      _xmmindex(xnoreg),
      _scale(no_scale),
      _disp(0),
      _isxmmindex(false){
  }

  explicit Address(Register base, int disp = 0)
    : _base(base),
      _index(noreg),
      _xmmindex(xnoreg),
      _scale(no_scale),
      _disp(disp),
      _isxmmindex(false){
  }

  Address(Register base, Register index, ScaleFactor scale, int disp = 0)
    : _base (base),
      _index(index),
      _xmmindex(xnoreg),
      _scale(scale),
      _disp (disp),
      _isxmmindex(false) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
    : _base (base),
      _index(index.register_or_noreg()),
      _xmmindex(xnoreg),
      _scale(scale),
      _disp (disp + checked_cast<int>(index.constant_or_zero() * scale_size(scale))),
      _isxmmindex(false){
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, XMMRegister index, ScaleFactor scale, int disp = 0)
    : _base (base),
      _index(noreg),
      _xmmindex(index),
      _scale(scale),
      _disp(disp),
      _isxmmindex(true) {
      assert(!index->is_valid() == (scale == Address::no_scale),
             "inconsistent address");
  }

  // The following overloads are used in connection with the
  // ByteSize type (see sizes.hpp).  They simplify the use of
  // ByteSize'd arguments in assembly code.

  Address(Register base, ByteSize disp)
    : Address(base, in_bytes(disp)) {}

  Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
    : Address(base, index, scale, in_bytes(disp)) {}

  Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
    : Address(base, index, scale, in_bytes(disp)) {}

  Address plus_disp(int disp) const {
    Address a = (*this);
    a._disp += disp;
    return a;
  }
  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
    Address a = (*this);
    a._disp += checked_cast<int>(disp.constant_or_zero() * scale_size(scale));
    if (disp.is_register()) {
      assert(!a.index()->is_valid(), "competing indexes");
      a._index = disp.as_register();
      a._scale = scale;
    }
    return a;
  }
  bool is_same_address(Address a) const {
    // disregard _rspec
    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
  }

  // accessors
  bool        uses(Register reg) const { return _base == reg || _index == reg; }
  Register    base()             const { return _base;  }
  Register    index()            const { return _index; }
  XMMRegister xmmindex()         const { return _xmmindex; }
  ScaleFactor scale()            const { return _scale; }
  int         disp()             const { return _disp;  }
  bool        isxmmindex()       const { return _isxmmindex; }

  // Convert the raw encoding form into the form expected by the constructor for
  // Address.  An index of 4 (rsp) corresponds to having no index, so convert
  // that to noreg for the Address constructor.
  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);

  static Address make_array(ArrayAddress);

 private:
  bool base_needs_rex() const {
    return _base->is_valid() && ((_base->encoding() & 8) == 8);
  }

  bool base_needs_rex2() const {
    return _base->is_valid() && _base->encoding() >= 16;
  }

  bool index_needs_rex() const {
    return _index->is_valid() && ((_index->encoding() & 8) == 8);
  }

  bool index_needs_rex2() const {
    return _index->is_valid() && _index->encoding() >= 16;
  }

  bool xmmindex_needs_rex() const {
    return _xmmindex->is_valid() && ((_xmmindex->encoding() & 8) == 8);
  }

  bool xmmindex_needs_rex2() const {
    return _xmmindex->is_valid() && _xmmindex->encoding() >= 16;
  }

  relocInfo::relocType reloc() const { return _rspec.type(); }

  friend class Assembler;
  friend class MacroAssembler;
  friend class LIR_Assembler; // base/index/scale/disp
};

//
// AddressLiteral has been split out from Address because operands of this type
// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out
// the few instructions that need to deal with address literals are unique and the
// MacroAssembler does not have to implement every instruction in the Assembler
// in order to search for address literals that may need special handling depending
// on the instruction and the platform. As a small step on the way to merging the
// i486/amd64 directories.
//
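// A usage sketch (illustrative; the helpers shown live in the MacroAssembler,
// and 'stub_entry' and 'table' are placeholder addresses):
//
//   __ jump(RuntimeAddress(stub_entry));        // branch to a possibly 64-bit target
//   __ lea(rscratch1, ExternalAddress(table));  // materialize a far address in a register
//
// On 64-bit, whether the target is reached rip-relatively or through a scratch
// register is decided by the MacroAssembler based on reachability.
//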
class AddressLiteral {
  friend class ArrayAddress;
  RelocationHolder _rspec;
  // Typically, when we use an AddressLiteral we want its rval.
  // However, in some situations we want the lval (effective address) of the item.
  // We provide a special factory for making those lvals.
  bool _is_lval;

  // If the target is far we'll need to load the ea of this to
  // a register to reach it. Otherwise if near we can do rip
  // relative addressing.

  address          _target;

 protected:
  // creation
  AddressLiteral()
    : _is_lval(false),
      _target(nullptr)
  {}

  public:


  AddressLiteral(address target, relocInfo::relocType rtype);

  AddressLiteral(address target, RelocationHolder const& rspec)
    : _rspec(rspec),
      _is_lval(false),
      _target(target)
  {}

  AddressLiteral addr() {
    AddressLiteral ret = *this;
    ret._is_lval = true;
    return ret;
  }


 private:

  address target() { return _target; }
  bool is_lval() const { return _is_lval; }

  relocInfo::relocType reloc() const { return _rspec.type(); }
  const RelocationHolder& rspec() const { return _rspec; }

  friend class Assembler;
  friend class MacroAssembler;
  friend class Address;
  friend class LIR_Assembler;
};

// Convenience classes
class RuntimeAddress: public AddressLiteral {

  public:

  RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}

};

class ExternalAddress: public AddressLiteral {
 private:
  static relocInfo::relocType reloc_for_target(address target) {
    // Sometimes ExternalAddress is used for values which aren't
    // exactly addresses, like the card table base.
    // external_word_type can't be used for values in the first page
    // so just skip the reloc in that case.
    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
  }

 public:

  ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}

};

class InternalAddress: public AddressLiteral {

  public:

  InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}

};

// 32-bit x86 can do array addressing as a single operation since disp can be an
// absolute address; amd64 can't. We create a class that expresses the concept but
// does extra magic on amd64 to get the final result.
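//
// An illustrative use (with 'table' and 'i_reg' as placeholders): the address
// of element i of a statically allocated table can be expressed as
//
//   ArrayAddress(ExternalAddress((address)table), Address(noreg, i_reg, Address::times_8))
//
// On 32-bit this folds into a single operand; on 64-bit the MacroAssembler
// first materializes the base in a scratch register.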

class ArrayAddress {
  private:

  AddressLiteral _base;
  Address        _index;

  public:

  ArrayAddress() {};
  ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {};
  AddressLiteral base() { return _base; }
  Address index() { return _index; }

};

class InstructionAttr;

// The 64-bit value reflects the fxsave size, which is 512 bytes, plus the new xsave
// area used with EVEX, which is another 2176 bytes (512 + 2176 = 2688 bytes in total).
// See the fxsave and xsave (EVEX enabled) documentation for the layout.
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);

// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
// is what you get. The Assembler is generating code into a CodeBuffer.

class Assembler : public AbstractAssembler  {
  friend class AbstractAssembler; // for the non-virtual hack
  friend class LIR_Assembler; // as_Address()
  friend class StubGenerator;

 public:
  enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
    zero          = 0x4,
    notZero       = 0x5,
    equal         = 0x4,
    notEqual      = 0x5,
    less          = 0xc,
    lessEqual     = 0xe,
    greater       = 0xf,
    greaterEqual  = 0xd,
    below         = 0x2,
    belowEqual    = 0x6,
    above         = 0x7,
    aboveEqual    = 0x3,
    overflow      = 0x0,
    noOverflow    = 0x1,
    carrySet      = 0x2,
    carryClear    = 0x3,
    negative      = 0x8,
    positive      = 0x9,
    parity        = 0xa,
    noParity      = 0xb
  };

  enum Prefix {
    // segment overrides
    CS_segment = 0x2e,
    SS_segment = 0x36,
    DS_segment = 0x3e,
    ES_segment = 0x26,
    FS_segment = 0x64,
    GS_segment = 0x65,

    REX        = 0x40,

    REX_B      = 0x41,
    REX_X      = 0x42,
    REX_XB     = 0x43,
    REX_R      = 0x44,
    REX_RB     = 0x45,
    REX_RX     = 0x46,
    REX_RXB    = 0x47,

    REX_W      = 0x48,

    REX_WB     = 0x49,
    REX_WX     = 0x4A,
    REX_WXB    = 0x4B,
    REX_WR     = 0x4C,
    REX_WRB    = 0x4D,
    REX_WRX    = 0x4E,
    REX_WRXB   = 0x4F,

    REX2       = 0xd5,
    WREX2      = REX2 << 8,

    VEX_3bytes = 0xC4,
    VEX_2bytes = 0xC5,
    EVEX_4bytes = 0x62,
    Prefix_EMPTY = 0x0
  };

  enum PrefixBits {
    REX2BIT_B  = 0x01,
    REX2BIT_X  = 0x02,
    REX2BIT_R  = 0x04,
    REX2BIT_W  = 0x08,
    REX2BIT_B4 = 0x10,
    REX2BIT_X4 = 0x20,
    REX2BIT_R4 = 0x40,
    REX2BIT_M0 = 0x80,
    REX2BIT_WB = 0x09,
    REX2BIT_WB4 = 0x18,
  };

  enum VexPrefix {
    VEX_B = 0x20,
    VEX_X = 0x40,
    VEX_R = 0x80,
    VEX_W = 0x80
  };

  enum ExexPrefix {
    EVEX_F  = 0x04,
    EVEX_V  = 0x08,
    EVEX_Rb = 0x10,
    EVEX_B  = 0x20,
    EVEX_X  = 0x40,
    EVEX_Z  = 0x80
  };

  enum ExtEvexPrefix {
    EEVEX_R = 0x10,
    EEVEX_B = 0x08,
    EEVEX_X = 0x04,
    EEVEX_V = 0x08
  };

  enum EvexRoundPrefix {
    EVEX_RNE = 0x0,
    EVEX_RD  = 0x1,
    EVEX_RU  = 0x2,
    EVEX_RZ  = 0x3
  };

  enum VexSimdPrefix {
    VEX_SIMD_NONE = 0x0,
    VEX_SIMD_66   = 0x1,
    VEX_SIMD_F3   = 0x2,
    VEX_SIMD_F2   = 0x3,
  };

  enum VexOpcode {
    VEX_OPCODE_NONE  = 0x0,
    VEX_OPCODE_0F    = 0x1,
    VEX_OPCODE_0F_38 = 0x2,
    VEX_OPCODE_0F_3A = 0x3,
    VEX_OPCODE_0F_3C = 0x4,
    VEX_OPCODE_MASK  = 0x1F
  };

  enum AvxVectorLen {
    AVX_128bit = 0x0,
    AVX_256bit = 0x1,
    AVX_512bit = 0x2,
    AVX_NoVec  = 0x4
  };

  enum EvexTupleType {
    EVEX_FV   = 0,
    EVEX_HV   = 4,
    EVEX_FVM  = 6,
    EVEX_T1S  = 7,
    EVEX_T1F  = 11,
    EVEX_T2   = 13,
    EVEX_T4   = 15,
    EVEX_T8   = 17,
    EVEX_HVM  = 18,
    EVEX_QVM  = 19,
    EVEX_OVM  = 20,
    EVEX_M128 = 21,
    EVEX_DUP  = 22,
    EVEX_NOSCALE = 23,
    EVEX_ETUP = 24
  };

  enum EvexInputSizeInBits {
    EVEX_8bit  = 0,
    EVEX_16bit = 1,
    EVEX_32bit = 2,
    EVEX_64bit = 3,
    EVEX_NObit = 4
  };

  enum WhichOperand {
    // input to locate_operand, and format code for relocations
    imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
    disp32_operand = 1,          // embedded 32-bit displacement or address
    call32_operand = 2,          // embedded 32-bit self-relative displacement
#ifndef _LP64
    _WhichOperand_limit = 3
#else
    narrow_oop_operand = 3,      // embedded 32-bit immediate narrow oop
    _WhichOperand_limit = 4
#endif
  };

  // Comparison predicates for integral types & FP types when using SSE
  enum ComparisonPredicate {
    eq = 0,
    lt = 1,
    le = 2,
    _false = 3,
    neq = 4,
    nlt = 5,
    nle = 6,
    _true = 7
  };

  // Comparison predicates for FP types when using AVX
  // O means ordered. U is unordered. When using ordered, any NaN comparison is false. Otherwise, it is true.
  // S means signaling. Q means non-signaling. When signaling is true, instruction signals #IA on NaN.
  enum ComparisonPredicateFP {
    EQ_OQ = 0,
    LT_OS = 1,
    LE_OS = 2,
    UNORD_Q = 3,
    NEQ_UQ = 4,
    NLT_US = 5,
    NLE_US = 6,
    ORD_Q = 7,
    EQ_UQ = 8,
    NGE_US = 9,
    NGT_US = 0xA,
    FALSE_OQ = 0xB,
    NEQ_OQ = 0xC,
    GE_OS = 0xD,
    GT_OS = 0xE,
    TRUE_UQ = 0xF,
    EQ_OS = 0x10,
    LT_OQ = 0x11,
    LE_OQ = 0x12,
    UNORD_S = 0x13,
    NEQ_US = 0x14,
    NLT_UQ = 0x15,
    NLE_UQ = 0x16,
    ORD_S = 0x17,
    EQ_US = 0x18,
    NGE_UQ = 0x19,
    NGT_UQ = 0x1A,
    FALSE_OS = 0x1B,
    NEQ_OS = 0x1C,
    GE_OQ = 0x1D,
    GT_OQ = 0x1E,
    TRUE_US = 0x1F
  };

  enum Width {
    B = 0,
    W = 1,
    D = 2,
    Q = 3
  };

  //---<  calculate length of instruction  >---
  // As instruction size can't be found out easily on x86/x64,
  // we just use '4' for len and maxlen.
  // instruction must start at passed address
  static unsigned int instr_len(unsigned char *instr) { return 4; }

  //---<  longest instructions  >---
  // Max instruction length is not specified in architecture documentation.
  // We could use a "safe enough" estimate (15), but just default to
  // instruction length guess from above.
  static unsigned int instr_maxlen() { return 4; }

  // NOTE: The general philosophy of the declarations here is that 64bit versions
  // of instructions are freely declared without the need for wrapping them in an ifdef.
  // (Some dangerous instructions are ifdef'd out of inappropriate jvm's.)
  // In the .cpp file the implementations are wrapped so that they are dropped out
  // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
  // to the size it was prior to merging up the 32bit and 64bit assemblers.
  //
  // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
  // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.

private:

  bool _legacy_mode_bw;
  bool _legacy_mode_dq;
  bool _legacy_mode_vl;
  bool _legacy_mode_vlbw;
  NOT_LP64(bool _is_managed;)

  InstructionAttr *_attributes;
  void set_attributes(InstructionAttr* attributes);

  int get_base_prefix_bits(int enc);
  int get_index_prefix_bits(int enc);
  int get_base_prefix_bits(Register base);
  int get_index_prefix_bits(Register index);
  int get_reg_prefix_bits(int enc);

  // 64bit prefixes
  void prefix(Register reg);
  void prefix(Register dst, Register src, Prefix p);
  void prefix_rex2(Register dst, Register src);
  void prefix(Register dst, Address adr, Prefix p);
  void prefix_rex2(Register dst, Address adr);

  // The is_map1 bool indicates an x86 map1 instruction which, when
  // legacy encoded, uses a 0x0F opcode prefix.  By specification, the
  // opcode prefix is omitted when using rex2 encoding in support
  // of APX extended GPRs.
  void prefix(Address adr, bool is_map1 = false);
  void prefix_rex2(Address adr, bool is_map1 = false);
  void prefix(Address adr, Register reg,  bool byteinst = false, bool is_map1 = false);
  void prefix_rex2(Address adr, Register reg,  bool byteinst = false, bool is_map1 = false);
  void prefix(Address adr, XMMRegister reg);
  void prefix_rex2(Address adr, XMMRegister reg);

  int prefix_and_encode(int reg_enc, bool byteinst = false, bool is_map1 = false);
  int prefix_and_encode_rex2(int reg_enc, bool is_map1 = false);
  int prefix_and_encode(int dst_enc, int src_enc, bool is_map1 = false) {
    return prefix_and_encode(dst_enc, false, src_enc, false, is_map1);
  }
  int prefix_and_encode(int dst_enc, bool dst_is_byte, int src_enc, bool src_is_byte, bool is_map1 = false);

  int prefix_and_encode_rex2(int dst_enc, int src_enc, int init_bits = 0);
  // Some prefixq variants always emit exactly one prefix byte, so besides a
  // prefix-emitting method we provide a method to get the prefix byte to emit,
  // which can then be folded into a byte stream.
  int get_prefixq(Address adr, bool is_map1 = false);
  int get_prefixq_rex2(Address adr, bool is_map1 = false);
  int get_prefixq(Address adr, Register reg, bool is_map1 = false);
  int get_prefixq_rex2(Address adr, Register reg, bool ismap1 = false);

  void prefixq(Address adr);
  void prefixq(Address adr, Register reg, bool is_map1 = false);
  void prefixq(Address adr, XMMRegister reg);
  void prefixq_rex2(Address adr, XMMRegister src);

  bool prefix_is_rex2(int prefix);

  int prefixq_and_encode(int reg_enc, bool is_map1 = false);
  int prefixq_and_encode_rex2(int reg_enc, bool is_map1 = false);
  int prefixq_and_encode(int dst_enc, int src_enc, bool is_map1 = false);
  int prefixq_and_encode_rex2(int dst_enc, int src_enc, bool is_map1 = false);

  bool needs_rex2(Register reg1, Register reg2 = noreg, Register reg3 = noreg);

  bool needs_eevex(Register reg1, Register reg2 = noreg, Register reg3 = noreg);
  bool needs_eevex(int enc1, int enc2 = -1, int enc3 = -1);
  NOT_PRODUCT(bool needs_evex(XMMRegister reg1, XMMRegister reg2 = xnoreg, XMMRegister reg3 = xnoreg);)

  void rex_prefix(Address adr, XMMRegister xreg,
                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
  int  rex_prefix_and_encode(int dst_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);

  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc);

  void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_v, bool evex_r, bool evex_b,
                       bool eevex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool no_flags = false);

  void evex_prefix_ndd(Address adr, int ndd_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc,
                       InstructionAttr *attributes, bool no_flags = false);

  void evex_prefix_nf(Address adr, int ndd_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc,
                      InstructionAttr *attributes, bool no_flags = false);

  void vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc,
                  InstructionAttr *attributes, bool nds_is_ndd = false, bool no_flags = false);

  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc,
                             InstructionAttr *attributes, bool src_is_gpr = false, bool nds_is_ndd = false, bool no_flags = false);

  int  evex_prefix_and_encode_ndd(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
                                  InstructionAttr *attributes, bool no_flags = false);

  int  evex_prefix_and_encode_nf(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
                                 InstructionAttr *attributes, bool no_flags = false);

  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
                   VexOpcode opc, InstructionAttr *attributes);

  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
                             VexOpcode opc, InstructionAttr *attributes, bool src_is_gpr = false);

  // Helper functions for groups of instructions
  void emit_arith_b(int op1, int op2, Register dst, int imm8);

  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
  void emit_arith(int op1, int op2, Register dst, Register src);

  bool emit_compressed_disp_byte(int &disp);

  void emit_modrm(int mod, int dst_enc, int src_enc);
  void emit_modrm_disp8(int mod, int dst_enc, int src_enc,
                        int disp);
  void emit_modrm_sib(int mod, int dst_enc, int src_enc,
                      Address::ScaleFactor scale, int index_enc, int base_enc);
  void emit_modrm_sib_disp8(int mod, int dst_enc, int src_enc,
                            Address::ScaleFactor scale, int index_enc, int base_enc,
                            int disp);

  void emit_operand_helper(int reg_enc,
                           int base_enc, int index_enc, Address::ScaleFactor scale,
                           int disp,
                           RelocationHolder const& rspec,
                           int post_addr_length);

  void emit_operand(Register reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int post_addr_length);

  void emit_operand(Register reg,
                    Register base, XMMRegister index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int post_addr_length);

  void emit_operand(XMMRegister xreg,
                    Register base, XMMRegister xindex, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int post_addr_length);

  void emit_operand(Register reg, Address adr,
                    int post_addr_length);

  void emit_operand(XMMRegister reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int post_addr_length);

  void emit_operand_helper(KRegister kreg,
                           int base_enc, int index_enc, Address::ScaleFactor scale,
                           int disp,
                           RelocationHolder const& rspec,
                           int post_addr_length);

  void emit_operand(KRegister kreg, Address adr,
                    int post_addr_length);

  void emit_operand(KRegister kreg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int post_addr_length);

  void emit_operand(XMMRegister reg, Address adr, int post_addr_length);

  // Immediate-to-memory forms
  void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);
  void emit_arith_operand_imm32(int op1, Register rm, Address adr, int32_t imm32);

 protected:
#ifdef ASSERT
  void check_relocation(RelocationHolder const& rspec, int format);
#endif

  void emit_data(jint data, relocInfo::relocType    rtype, int format = 0);
  void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

  void emit_prefix_and_int8(int prefix, int b1);
  void emit_opcode_prefix_and_encoding(int byte1, int ocp_and_encoding);
  void emit_opcode_prefix_and_encoding(int byte1, int byte2, int ocp_and_encoding);
  void emit_opcode_prefix_and_encoding(int byte1, int byte2, int ocp_and_encoding, int byte3);
  bool always_reachable(AddressLiteral adr) NOT_LP64( { return true; } );
  bool        reachable(AddressLiteral adr) NOT_LP64( { return true; } );


  // These are all easily abused and hence protected

 public:
  // 32BIT ONLY SECTION
#ifndef _LP64
  // Make these disappear in 64bit mode since they would never be correct
  void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec);   // 32BIT ONLY
  void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY

  void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
  void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec);     // 32BIT ONLY

  void push_literal32(int32_t imm32, RelocationHolder const& rspec);                 // 32BIT ONLY
#else
  // 64BIT ONLY SECTION
  void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec);   // 64BIT ONLY

  void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
  void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);

  void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
  void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
#endif // _LP64

 protected:
  // These are unique in that we are ensured by the caller that the 32bit
  // relative in these instructions will always be able to reach the potentially
  // 64bit address described by entry. Since they can take a 64bit address they
  // don't have the 32 suffix like the other instructions in this class.

  void call_literal(address entry, RelocationHolder const& rspec);
  void jmp_literal(address entry, RelocationHolder const& rspec);

  // Avoid using directly section
  // Instructions in this section are actually usable by anyone without danger
  // of failure but have performance issues that are addressed by enhanced
  // instructions which will do the proper thing based on the particular cpu.
  // We protect them because we don't trust you...

  // Don't use the following inc() and dec() methods directly. INC & DEC instructions
  // could cause a partial flag stall since they don't set the CF flag.
  // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
  // which call inc() & dec() or add() & sub() in accordance with
  // the product flag UseIncDec value.
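  //
  // A minimal sketch of that selection (illustrative, not the exact
  // MacroAssembler code):
  //
  //   void increment(Register reg) {
  //     if (UseIncDec) incl(reg);      // compact encoding, leaves CF untouched
  //     else           addl(reg, 1);   // writes all flags, avoids the stall
  //   }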

  void decl(Register dst);
  void edecl(Register dst, Register src, bool no_flags);
  void decl(Address dst);
  void edecl(Register dst, Address src, bool no_flags);
  void decq(Address dst);
  void edecq(Register dst, Address src, bool no_flags);

  void incl(Register dst);
  void eincl(Register dst, Register src, bool no_flags);
  void incl(Address dst);
  void eincl(Register dst, Address src, bool no_flags);
  void incq(Register dst);
  void eincq(Register dst, Register src, bool no_flags);
  void incq(Address dst);
  void eincq(Register dst, Address src, bool no_flags);

  // New cpus require use of movsd and movss to avoid partial register stall
  // when loading from memory. But for old Opteron use movlpd instead of movsd.
  // The selection is done in MacroAssembler::movdbl() and movflt().
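  //
  // A rough sketch of that selection (illustrative, not the exact code; the
  // actual cpu test lives in the MacroAssembler):
  //
  //   void movdbl(XMMRegister dst, Address src) {
  //     if (/* cpu handles movsd loads well */) movsd(dst, src);
  //     else                                    movlpd(dst, src);  // old Opteron
  //   }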

  // Move Scalar Single-Precision Floating-Point Values
  void movss(XMMRegister dst, Address src);
  void movss(XMMRegister dst, XMMRegister src);
  void movss(Address dst, XMMRegister src);

  // Move Scalar Double-Precision Floating-Point Values
  void movsd(XMMRegister dst, Address src);
  void movsd(XMMRegister dst, XMMRegister src);
  void movsd(Address dst, XMMRegister src);
  void movlpd(XMMRegister dst, Address src);

  void vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2);

  // New cpus require use of movaps and movapd to avoid partial register stall
  // when moving between registers.
  void movaps(XMMRegister dst, XMMRegister src);
  void movapd(XMMRegister dst, XMMRegister src);

  // End avoid using directly


  // Instruction prefixes
  void prefix(Prefix p);

  void prefix16(int p);

  public:

  // Creation
  Assembler(CodeBuffer* code) : AbstractAssembler(code) {
    init_attributes();
  }

  // Decoding
  static address locate_operand(address inst, WhichOperand which);
  static address locate_next_instruction(address inst);

  // Utilities
  static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
                                         int cur_tuple_type, int in_size_in_bits, int cur_encoding);

  // Generic instructions
  // Does 32bit or 64bit as needed for the platform. In some sense these
  // belong in the macro assembler, but there is no need for both varieties to exist.

  void init_attributes(void);
  void clear_attributes(void) { _attributes = nullptr; }

  void set_managed(void) { NOT_LP64(_is_managed = true;) }
  void clear_managed(void) { NOT_LP64(_is_managed = false;) }
  bool is_managed(void) {
    NOT_LP64(return _is_managed;)
    LP64_ONLY(return false;) }

  void lea(Register dst, Address src);

  void mov(Register dst, Register src);

#ifdef _LP64
  // support caching the result of some routines

  // must be called before pusha(), popa(), vzeroupper() - checked with asserts
  static void precompute_instructions();

  void pusha_uncached();
  void popa_uncached();

  // APX ISA Extensions for register save/restore optimizations.
  void push2(Register src1, Register src2, bool with_ppx = false);
  void pop2(Register src1, Register src2, bool with_ppx = false);
  void push2p(Register src1, Register src2);
  void pop2p(Register src1, Register src2);
  void pushp(Register src);
  void popp(Register src);

  // New Zero Upper setcc instruction.
  void esetzucc(Condition cc, Register dst);

#endif
  void vzeroupper_uncached();
  void decq(Register dst);
  void edecq(Register dst, Register src, bool no_flags);

  void pusha();
  void popa();

  void pushf();
  void popf();

  void push(int32_t imm32);

  void push(Register src);

  void pop(Register dst);

  // These do register sized moves/scans
  void rep_mov();
  void rep_stos();
  void rep_stosb();
  void repne_scan();
#ifdef _LP64
  void repne_scanl();
#endif

  // Vanilla instructions in lexical order

  void adcl(Address dst, int32_t imm32);
  void adcl(Address dst, Register src);
  void adcl(Register dst, int32_t imm32);
  void adcl(Register dst, Address src);
  void adcl(Register dst, Register src);

  void adcq(Register dst, int32_t imm32);
  void adcq(Register dst, Address src);
  void adcq(Register dst, Register src);

  void addb(Address dst, int imm8);
  void addb(Address dst, Register src);
  void addb(Register dst, int imm8);
  void addw(Address dst, int imm16);
  void addw(Address dst, Register src);

  void addl(Address dst, int32_t imm32);
  void eaddl(Register dst, Address src, int32_t imm32, bool no_flags);
  void addl(Address dst, Register src);
  void eaddl(Register dst, Address src1, Register src2, bool no_flags);
  void addl(Register dst, int32_t imm32);
  void eaddl(Register dst, Register src, int32_t imm32, bool no_flags);
  void addl(Register dst, Address src);
  void eaddl(Register dst, Register src1, Address src2, bool no_flags);
  void addl(Register dst, Register src);
  void eaddl(Register dst, Register src1, Register src2, bool no_flags);

  void addq(Address dst, int32_t imm32);
  void eaddq(Register dst, Address src, int32_t imm32, bool no_flags);
  void addq(Address dst, Register src);
  void eaddq(Register dst, Address src1, Register src2, bool no_flags);
  void addq(Register dst, int32_t imm32);
  void eaddq(Register dst, Register src, int32_t imm32, bool no_flags);
  void addq(Register dst, Address src);
  void eaddq(Register dst, Register src1, Address src2, bool no_flags);
  void addq(Register dst, Register src);
  void eaddq(Register dst, Register src1, Register src2, bool no_flags);

#ifdef _LP64
  // Add Unsigned Integers with Carry Flag
  void adcxq(Register dst, Register src);
  void eadcxq(Register dst, Register src1, Register src2);

  // Add Unsigned Integers with Overflow Flag
  void adoxq(Register dst, Register src);
  void eadoxq(Register dst, Register src1, Register src2);
#endif

  void addr_nop_4();
  void addr_nop_5();
  void addr_nop_7();
  void addr_nop_8();

  // Add Scalar Double-Precision Floating-Point Values
  void addsd(XMMRegister dst, Address src);
  void addsd(XMMRegister dst, XMMRegister src);

  // Add Scalar Single-Precision Floating-Point Values
  void addss(XMMRegister dst, Address src);
  void addss(XMMRegister dst, XMMRegister src);

  // AES instructions
  void aesdec(XMMRegister dst, Address src);
  void aesdec(XMMRegister dst, XMMRegister src);
  void aesdeclast(XMMRegister dst, Address src);
  void aesdeclast(XMMRegister dst, XMMRegister src);
  void aesenc(XMMRegister dst, Address src);
  void aesenc(XMMRegister dst, XMMRegister src);
  void aesenclast(XMMRegister dst, Address src);
  void aesenclast(XMMRegister dst, XMMRegister src);
  // Vector AES instructions
  void vaesenc(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaesenclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void andb(Address dst, Register src);

  void andl(Address  dst, int32_t imm32);
  void eandl(Register dst, Address  src, int32_t imm32, bool no_flags);
  void andl(Register dst, int32_t imm32);
  void eandl(Register dst, Register src, int32_t imm32, bool no_flags);
  void andl(Register dst, Address src);
  void eandl(Register dst, Register src1, Address src2, bool no_flags);
  void andl(Register dst, Register src);
  void eandl(Register dst, Register src1, Register src2, bool no_flags);
  void andl(Address dst, Register src);

  void andq(Address  dst, int32_t imm32);
  void eandq(Register dst, Address  src, int32_t imm32, bool no_flags);
  void andq(Register dst, int32_t imm32);
  void eandq(Register dst, Register src, int32_t imm32, bool no_flags);
  void andq(Register dst, Address src);
  void eandq(Register dst, Register src1, Address src2, bool no_flags);
  void andq(Register dst, Register src);
  void eandq(Register dst, Register src1, Register src2, bool no_flags);
  void andq(Address dst, Register src);
  void eandq(Register dst, Address src1, Register src2, bool no_flags);

  // BMI instructions
  void andnl(Register dst, Register src1, Register src2);
  void andnl(Register dst, Register src1, Address src2);
  void andnq(Register dst, Register src1, Register src2);
  void andnq(Register dst, Register src1, Address src2);

  void blsil(Register dst, Register src);
  void blsil(Register dst, Address src);
  void blsiq(Register dst, Register src);
  void blsiq(Register dst, Address src);

  void blsmskl(Register dst, Register src);
  void blsmskl(Register dst, Address src);
  void blsmskq(Register dst, Register src);
  void blsmskq(Register dst, Address src);

  void blsrl(Register dst, Register src);
  void blsrl(Register dst, Address src);
  void blsrq(Register dst, Register src);
  void blsrq(Register dst, Address src);

  void bsfl(Register dst, Register src);
  void bsrl(Register dst, Register src);

#ifdef _LP64
  void bsfq(Register dst, Register src);
  void bsrq(Register dst, Register src);
#endif

  void bswapl(Register reg);

  void bswapq(Register reg);

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register reg);  // push pc; pc <- reg
  void call(Address adr);   // push pc; pc <- adr

  void cdql();

  void cdqq();
  void cdqe();

  void cld();

  void clflush(Address adr);
  void clflushopt(Address adr);
  void clwb(Address adr);

  void cmovl(Condition cc, Register dst, Register src);
  void ecmovl(Condition cc, Register dst, Register src1, Register src2);
  void cmovl(Condition cc, Register dst, Address src);
  void ecmovl(Condition cc, Register dst, Register src1, Address src2);

  void cmovq(Condition cc, Register dst, Register src);
  void ecmovq(Condition cc, Register dst, Register src1, Register src2);
  void cmovq(Condition cc, Register dst, Address src);
  void ecmovq(Condition cc, Register dst, Register src1, Address src2);


  void cmpb(Address dst, int imm8);
  void cmpb(Address dst, Register reg);
  void cmpb(Register reg, Address dst);
  void cmpb(Register reg, int imm8);

  void cmpl(Address dst, int32_t imm32);
  void cmpl(Register dst, int32_t imm32);
  void cmpl(Register dst, Register src);
  void cmpl(Register dst, Address src);
  void cmpl_imm32(Address dst, int32_t imm32);
  void cmpl(Address dst,  Register reg);

  void cmpq(Address dst, int32_t imm32);
  void cmpq(Address dst, Register src);
  void cmpq(Register dst, int32_t imm32);
  void cmpq(Register dst, Register src);
  void cmpq(Register dst, Address src);

  void cmpw(Address dst, int imm16);
  void cmpw(Address dst, Register reg);

  void cmpxchg8 (Address adr);

  void cmpxchgb(Register reg, Address adr);
  void cmpxchgl(Register reg, Address adr);

  void cmpxchgq(Register reg, Address adr);
  void cmpxchgw(Register reg, Address adr);

  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void comisd(XMMRegister dst, Address src);
  void comisd(XMMRegister dst, XMMRegister src);

  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void comiss(XMMRegister dst, Address src);
  void comiss(XMMRegister dst, XMMRegister src);

  // Identify processor type and features
  void cpuid();

  // CRC32C
  void crc32(Register crc, Register v, int8_t sizeInBytes);
  void crc32(Register crc, Address adr, int8_t sizeInBytes);

  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
  void cvtsd2ss(XMMRegister dst, XMMRegister src);
  void cvtsd2ss(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
  void cvtsi2sdl(XMMRegister dst, Register src);
  void cvtsi2sdl(XMMRegister dst, Address src);
  void cvtsi2sdq(XMMRegister dst, Register src);
  void cvtsi2sdq(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
  void cvtsi2ssl(XMMRegister dst, Register src);
  void cvtsi2ssl(XMMRegister dst, Address src);
  void cvtsi2ssq(XMMRegister dst, Register src);
  void cvtsi2ssq(XMMRegister dst, Address src);

  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  void cvtdq2pd(XMMRegister dst, XMMRegister src);
  void vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert between half-precision and single-precision floating-point values
  void vcvtps2ph(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vcvtph2ps(XMMRegister dst, XMMRegister src, int vector_len);
  void evcvtps2ph(Address dst, KRegister mask, XMMRegister src, int imm8, int vector_len);
  void vcvtps2ph(Address dst, XMMRegister src, int imm8, int vector_len);
  void vcvtph2ps(XMMRegister dst, Address src, int vector_len);
1310 
1311   // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
1312   void cvtdq2ps(XMMRegister dst, XMMRegister src);
1313   void vcvtdq2ps(XMMRegister dst, XMMRegister src, int vector_len);
1314 
1315   // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
1316   void cvtss2sd(XMMRegister dst, XMMRegister src);
1317   void cvtss2sd(XMMRegister dst, Address src);
1318 
1319   // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
1320   void cvtsd2siq(Register dst, XMMRegister src);
1321   void cvttsd2sil(Register dst, Address src);
1322   void cvttsd2sil(Register dst, XMMRegister src);
1323   void cvttsd2siq(Register dst, Address src);
1324   void cvttsd2siq(Register dst, XMMRegister src);
1325 
1326   // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
1327   void cvttss2sil(Register dst, XMMRegister src);
1328   void cvttss2siq(Register dst, XMMRegister src);
1329   void cvtss2sil(Register dst, XMMRegister src);
1330 
1331   // Convert vector double to int
1332   void cvttpd2dq(XMMRegister dst, XMMRegister src);
1333 
1334   // Convert vector float and double
1335   void vcvtps2pd(XMMRegister dst, XMMRegister src, int vector_len);
1336   void vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len);
1337 
1338   // Convert vector float to int/long
1339   void vcvtps2dq(XMMRegister dst, XMMRegister src, int vector_len);
1340   void vcvttps2dq(XMMRegister dst, XMMRegister src, int vector_len);
1341   void evcvttps2qq(XMMRegister dst, XMMRegister src, int vector_len);
1342 
1343   // Convert vector long to vector FP
1344   void evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len);
1345   void evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len);
1346 
1347   // Convert vector double to long
1348   void evcvtpd2qq(XMMRegister dst, XMMRegister src, int vector_len);
1349   void evcvttpd2qq(XMMRegister dst, XMMRegister src, int vector_len);
1350 
1351   // Convert vector double to int
1352   void vcvttpd2dq(XMMRegister dst, XMMRegister src, int vector_len);
1353 
1354   // Evex casts with truncation
1355   void evpmovwb(XMMRegister dst, XMMRegister src, int vector_len);
1356   void evpmovdw(XMMRegister dst, XMMRegister src, int vector_len);
1357   void evpmovdb(XMMRegister dst, XMMRegister src, int vector_len);
1358   void evpmovqd(XMMRegister dst, XMMRegister src, int vector_len);
1359   void evpmovqb(XMMRegister dst, XMMRegister src, int vector_len);
1360   void evpmovqw(XMMRegister dst, XMMRegister src, int vector_len);
1361 
1362   // Evex casts with signed saturation
1363   void evpmovsqd(XMMRegister dst, XMMRegister src, int vector_len);
1364 
1365   //Abs of packed Integer values
1366   void pabsb(XMMRegister dst, XMMRegister src);
1367   void pabsw(XMMRegister dst, XMMRegister src);
1368   void pabsd(XMMRegister dst, XMMRegister src);
1369   void vpabsb(XMMRegister dst, XMMRegister src, int vector_len);
1370   void vpabsw(XMMRegister dst, XMMRegister src, int vector_len);
1371   void vpabsd(XMMRegister dst, XMMRegister src, int vector_len);
1372   void evpabsq(XMMRegister dst, XMMRegister src, int vector_len);
1373 
1374   // Divide Scalar Double-Precision Floating-Point Values
1375   void divsd(XMMRegister dst, Address src);
1376   void divsd(XMMRegister dst, XMMRegister src);
1377 
1378   // Divide Scalar Single-Precision Floating-Point Values
1379   void divss(XMMRegister dst, Address src);
1380   void divss(XMMRegister dst, XMMRegister src);
1381 
1382 
1383   void fnstsw_ax();
1384   void fprem();
1385   void fld_d(Address adr);
1386   void fstp_d(Address adr);
1387   void fstp_d(int index);
1388 
1389  private:
1390 
1391   void emit_farith(int b1, int b2, int i);
1392 
1393  public:
1394 #ifndef _LP64
1395   void emms();
1396 
1397   void fabs();
1398 
1399   void fadd(int i);
1400 
1401   void fadd_d(Address src);
1402   void fadd_s(Address src);
1403 
1404   // "Alternate" versions of x87 instructions place result down in FPU
1405   // stack instead of on TOS
1406 
1407   void fadda(int i); // "alternate" fadd
1408   void faddp(int i = 1);
1409 
1410   void fchs();
1411 
1412   void fcom(int i);
1413 
1414   void fcomp(int i = 1);
1415   void fcomp_d(Address src);
1416   void fcomp_s(Address src);
1417 
1418   void fcompp();
1419 
1420   void fcos();
1421 
1422   void fdecstp();
1423 
1424   void fdiv(int i);
1425   void fdiv_d(Address src);
1426   void fdivr_s(Address src);
1427   void fdiva(int i);  // "alternate" fdiv
1428   void fdivp(int i = 1);
1429 
1430   void fdivr(int i);
1431   void fdivr_d(Address src);
1432   void fdiv_s(Address src);
1433 
1434   void fdivra(int i); // "alternate" reversed fdiv
1435 
1436   void fdivrp(int i = 1);
1437 
1438   void ffree(int i = 0);
1439 
1440   void fild_d(Address adr);
1441   void fild_s(Address adr);
1442 
1443   void fincstp();
1444 
1445   void finit();
1446 
1447   void fist_s (Address adr);
1448   void fistp_d(Address adr);
1449   void fistp_s(Address adr);
1450 
1451   void fld1();
1452 
1453   void fld_s(Address adr);
1454   void fld_s(int index);
1455 
1456   void fldcw(Address src);
1457 
1458   void fldenv(Address src);
1459 
1460   void fldlg2();
1461 
1462   void fldln2();
1463 
1464   void fldz();
1465 
1466   void flog();
1467   void flog10();
1468 
1469   void fmul(int i);
1470 
1471   void fmul_d(Address src);
1472   void fmul_s(Address src);
1473 
1474   void fmula(int i);  // "alternate" fmul
1475 
1476   void fmulp(int i = 1);
1477 
1478   void fnsave(Address dst);
1479 
1480   void fnstcw(Address src);
1481   void fprem1();
1482 
1483   void frstor(Address src);
1484 
1485   void fsin();
1486 
1487   void fsqrt();
1488 
1489   void fst_d(Address adr);
1490   void fst_s(Address adr);
1491 
1492   void fstp_s(Address adr);
1493 
1494   void fsub(int i);
1495   void fsub_d(Address src);
1496   void fsub_s(Address src);
1497 
1498   void fsuba(int i);  // "alternate" fsub
1499 
1500   void fsubp(int i = 1);
1501 
1502   void fsubr(int i);
1503   void fsubr_d(Address src);
1504   void fsubr_s(Address src);
1505 
1506   void fsubra(int i); // "alternate" reversed fsub
1507 
1508   void fsubrp(int i = 1);
1509 
1510   void ftan();
1511 
1512   void ftst();
1513 
1514   void fucomi(int i = 1);
1515   void fucomip(int i = 1);
1516 
1517   void fwait();
1518 
1519   void fxch(int i = 1);
1520 
1521   void fyl2x();
1522   void frndint();
1523   void f2xm1();
1524   void fldl2e();
1525 #endif // !_LP64
1526 
1527   // operands that only take the original 32bit registers
1528   void emit_operand32(Register reg, Address adr, int post_addr_length);
1529 
1530   void fld_x(Address adr);  // extended-precision (80-bit) format
1531   void fstp_x(Address adr); // extended-precision (80-bit) format
1532   void fxrstor(Address src);
1533   void xrstor(Address src);
1534 
1535   void fxsave(Address dst);
1536   void xsave(Address dst);
1537 
1538   void hlt();
1539 
1540   void idivl(Register src);
1541   void eidivl(Register src, bool no_flags);
1542   void divl(Register src); // Unsigned division
1543   void edivl(Register src, bool no_flags); // Unsigned division
1544 
1545 #ifdef _LP64
1546   void idivq(Register src);
1547   void eidivq(Register src, bool no_flags);
1548   void divq(Register src); // Unsigned division
1549   void edivq(Register src, bool no_flags); // Unsigned division
1550 #endif
1551 
1552   void imull(Register src);
1553   void eimull(Register src, bool no_flags);
1554   void imull(Register dst, Register src);
1555   void eimull(Register dst, Register src1, Register src2, bool no_flags);
1556   void imull(Register dst, Register src, int value);
1557   void eimull(Register dst, Register src, int value, bool no_flags);
1558   void imull(Register dst, Address src, int value);
1559   void eimull(Register dst, Address src, int value, bool no_flags);
1560   void imull(Register dst, Address src);
1561   void eimull(Register dst, Register src1, Address src2, bool no_flags);
1562 
1563 #ifdef _LP64
1564   void imulq(Register dst, Register src);
1565   void eimulq(Register dst, Register src, bool no_flags);
1566   void eimulq(Register dst, Register src1, Register src2, bool no_flags);
1567   void imulq(Register dst, Register src, int value);
1568   void eimulq(Register dst, Register src, int value, bool no_flags);
1569   void imulq(Register dst, Address src, int value);
1570   void eimulq(Register dst, Address src, int value, bool no_flags);
1571   void imulq(Register dst, Address src);
1572   void eimulq(Register dst, Address src, bool no_flags);
1573   void eimulq(Register dst, Register src1, Address src2, bool no_flags);
1574   void imulq(Register dst);
1575   void eimulq(Register dst, bool no_flags);
1576 #endif
1577 
1578   // jcc is the generic conditional branch generator: it is used for
1579   // branches to run-time routines as well as for branches to labels. jcc
1580   // takes a branch opcode (cc) and a label (L) and generates either a
1581   // backward branch or a forward branch and links it to the label fixup
1582   // chain. Usage:
1583   //
1584   // Label L;      // unbound label
1585   // jcc(cc, L);   // forward branch to unbound label
1586   // bind(L);      // bind label to the current pc
1587   // jcc(cc, L);   // backward branch to bound label
1588   // bind(L);      // illegal: a label may be bound only once
1589   //
1590   // Note: The same Label can be used for forward and backward branches
1591   // but it may be bound only once.
1592 
1593   void jcc(Condition cc, Label& L, bool maybe_short = true);
1594 
1595   // Conditional jump with an 8-bit offset to L.
1596   // WARNING: be very careful using this for forward jumps.  If the label is
1597   // not bound within an 8-bit offset of this instruction, a run-time error
1598   // will occur.
1599 
1600   // Use macro to record file and line number.
1601   #define jccb(cc, L) jccb_0(cc, L, __FILE__, __LINE__)
1602 
1603   void jccb_0(Condition cc, Label& L, const char* file, int line);
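       // Illustrative sketch of a safe jccb use: a short backward branch to a
       // label that is already bound and known to be only a few bytes away
       // (the register choice and loop body are hypothetical):
       //
       //   Label L_loop;
       //   bind(L_loop);                       // bind label to the current pc
       //   decl(rcx);                          // small loop body
       //   jccb(Assembler::notZero, L_loop);   // short conditional backward branch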
1604 
1605   void jmp(Address entry);    // pc <- entry
1606 
1607   // Label operations & relative jumps (PPUM Appendix D)
1608   void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L
1609 
1610   void jmp(Register entry); // pc <- entry
1611 
1612   // Unconditional 8-bit offset jump to L.
1613   // WARNING: be very careful using this for forward jumps.  If the label is
1614   // not bound within an 8-bit offset of this instruction, a run-time error
1615   // will occur.
1616 
1617   // Use macro to record file and line number.
1618   #define jmpb(L) jmpb_0(L, __FILE__, __LINE__)
1619 
1620   void jmpb_0(Label& L, const char* file, int line);
1621 
1622   void ldmxcsr( Address src );
1623 
1624   void leal(Register dst, Address src);
1625 
1626   void leaq(Register dst, Address src);
1627 
1628 #ifdef _LP64
1629   void lea(Register dst, Label& L);
1630 #endif
1631 
1632   void lfence();
1633 
1634   void lock();
1635   void size_prefix();
1636 
1637   void lzcntl(Register dst, Register src);
1638   void elzcntl(Register dst, Register src, bool no_flags);
1639   void lzcntl(Register dst, Address src);
1640   void elzcntl(Register dst, Address src, bool no_flags);
1641 
1642 #ifdef _LP64
1643   void lzcntq(Register dst, Register src);
1644   void elzcntq(Register dst, Register src, bool no_flags);
1645   void lzcntq(Register dst, Address src);
1646   void elzcntq(Register dst, Address src, bool no_flags);
1647 #endif
1648 
1649   enum Membar_mask_bits {
1650     StoreStore = 1 << 3,
1651     LoadStore  = 1 << 2,
1652     StoreLoad  = 1 << 1,
1653     LoadLoad   = 1 << 0
1654   };
1655 
1656   // Serializes memory and blows flags
1657   void membar(Membar_mask_bits order_constraint);
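       // Illustrative sketch: the mask bits may be or-ed together to request
       // several orderings at once, e.g.
       //
       //   membar(Membar_mask_bits(LoadLoad | LoadStore | StoreStore));
       //
       // On x86 only a StoreLoad constraint typically requires an actual
       // fencing instruction; the remaining orderings already follow from the
       // hardware memory model.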
1658 
1659   void mfence();
1660   void sfence();
1661 
1662   // Moves
1663 
1664   void mov64(Register dst, int64_t imm64);
1665   void mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format);
1666 
1667   void movb(Address dst, Register src);
1668   void movb(Address dst, int imm8);
1669   void movb(Register dst, Address src);
1670 
1671   void movddup(XMMRegister dst, XMMRegister src);
1672   void movddup(XMMRegister dst, Address src);
1673   void vmovddup(XMMRegister dst, Address src, int vector_len);
1674 
1675   void kandbl(KRegister dst, KRegister src1, KRegister src2);
1676   void kandwl(KRegister dst, KRegister src1, KRegister src2);
1677   void kanddl(KRegister dst, KRegister src1, KRegister src2);
1678   void kandql(KRegister dst, KRegister src1, KRegister src2);
1679 
1680   void korbl(KRegister dst, KRegister src1, KRegister src2);
1681   void korwl(KRegister dst, KRegister src1, KRegister src2);
1682   void kordl(KRegister dst, KRegister src1, KRegister src2);
1683   void korql(KRegister dst, KRegister src1, KRegister src2);
1684 
1685   void kxnorwl(KRegister dst, KRegister src1, KRegister src2);
1686 
1687   void kxorbl(KRegister dst, KRegister src1, KRegister src2);
1688   void kxorwl(KRegister dst, KRegister src1, KRegister src2);
1689   void kxordl(KRegister dst, KRegister src1, KRegister src2);
1690   void kxorql(KRegister dst, KRegister src1, KRegister src2);
1691   void kmovbl(KRegister dst, Register src);
1692   void kmovbl(Register dst, KRegister src);
1693   void kmovbl(KRegister dst, KRegister src);
1694   void kmovwl(KRegister dst, Register src);
1695   void kmovwl(KRegister dst, Address src);
1696   void kmovwl(Register dst, KRegister src);
1697   void kmovwl(Address dst, KRegister src);
1698   void kmovwl(KRegister dst, KRegister src);
1699   void kmovdl(KRegister dst, Register src);
1700   void kmovdl(Register dst, KRegister src);
1701   void kmovql(KRegister dst, KRegister src);
1702   void kmovql(Address dst, KRegister src);
1703   void kmovql(KRegister dst, Address src);
1704   void kmovql(KRegister dst, Register src);
1705   void kmovql(Register dst, KRegister src);
1706 
1707   void knotbl(KRegister dst, KRegister src);
1708   void knotwl(KRegister dst, KRegister src);
1709   void knotdl(KRegister dst, KRegister src);
1710   void knotql(KRegister dst, KRegister src);
1711 
1712   void kortestbl(KRegister dst, KRegister src);
1713   void kortestwl(KRegister dst, KRegister src);
1714   void kortestdl(KRegister dst, KRegister src);
1715   void kortestql(KRegister dst, KRegister src);
1716 
1717   void kxnorbl(KRegister dst, KRegister src1, KRegister src2);
1718   void kshiftlbl(KRegister dst, KRegister src, int imm8);
1719   void kshiftlql(KRegister dst, KRegister src, int imm8);
1720   void kshiftrbl(KRegister dst, KRegister src, int imm8);
1721   void kshiftrwl(KRegister dst, KRegister src, int imm8);
1722   void kshiftrdl(KRegister dst, KRegister src, int imm8);
1723   void kshiftrql(KRegister dst, KRegister src, int imm8);
1724   void ktestq(KRegister src1, KRegister src2);
1725   void ktestd(KRegister src1, KRegister src2);
1726   void kunpckdql(KRegister dst, KRegister src1, KRegister src2);
1727 
1728 
1729   void ktestql(KRegister dst, KRegister src);
1730   void ktestdl(KRegister dst, KRegister src);
1731   void ktestwl(KRegister dst, KRegister src);
1732   void ktestbl(KRegister dst, KRegister src);
1733 
1734   void movdl(XMMRegister dst, Register src);
1735   void movdl(Register dst, XMMRegister src);
1736   void movdl(XMMRegister dst, Address src);
1737   void movdl(Address dst, XMMRegister src);
1738 
1739   // Move Double Quadword
1740   void movdq(XMMRegister dst, Register src);
1741   void movdq(Register dst, XMMRegister src);
1742 
1743   // Move Aligned Double Quadword
1744   void movdqa(XMMRegister dst, XMMRegister src);
1745   void movdqa(XMMRegister dst, Address src);
1746 
1747   // Move Unaligned Double Quadword
1748   void movdqu(Address     dst, XMMRegister src);
1749   void movdqu(XMMRegister dst, Address src);
1750   void movdqu(XMMRegister dst, XMMRegister src);
1751 
1752   // Move Unaligned 256bit Vector
1753   void vmovdqu(Address dst, XMMRegister src);
1754   void vmovdqu(XMMRegister dst, Address src);
1755   void vmovdqu(XMMRegister dst, XMMRegister src);
1756 
1757   // Move Unaligned 512bit Vector
1758   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
1759   void evmovdqub(XMMRegister dst, Address src, int vector_len);
1760   void evmovdqub(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1761   void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1762   void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1763 
1764   void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
1765   void evmovdquw(XMMRegister dst, Address src, int vector_len);
1766   void evmovdquw(Address dst, XMMRegister src, int vector_len);
1767   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1768   void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1769   void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1770 
1771   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
1772   void evmovdqul(XMMRegister dst, Address src, int vector_len);
1773   void evmovdqul(Address dst, XMMRegister src, int vector_len);
1774 
1775   void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1776   void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1777   void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1778 
1779   void evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1780   void evmovntdquq(Address dst, XMMRegister src, int vector_len);
1781 
1782   void evmovdquq(Address dst, XMMRegister src, int vector_len);
1783   void evmovdquq(XMMRegister dst, Address src, int vector_len);
1784   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
1785 
1786   void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1787   void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
1788   void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
1789 
1790   // Move the lower 64bit of src to the upper 64bit of dst (128bit registers)
1791   void movlhps(XMMRegister dst, XMMRegister src);
1792 
1793   void movl(Register dst, int32_t imm32);
1794   void movl(Address dst, int32_t imm32);
1795   void movl(Register dst, Register src);
1796   void movl(Register dst, Address src);
1797   void movl(Address dst, Register src);
1798 
1799 #ifdef _LP64
1800   void movq(Register dst, Register src);
1801   void movq(Register dst, Address src);
1802   void movq(Address  dst, Register src);
1803   void movq(Address  dst, int32_t imm32);
1804   void movq(Register  dst, int32_t imm32);
1805 #endif
1806 
1807   // Move Quadword
1808   void movq(Address     dst, XMMRegister src);
1809   void movq(XMMRegister dst, Address src);
1810   void movq(XMMRegister dst, XMMRegister src);
1811   void movq(Register dst, XMMRegister src);
1812   void movq(XMMRegister dst, Register src);
1813 
1814   void movsbl(Register dst, Address src);
1815   void movsbl(Register dst, Register src);
1816 
1817 #ifdef _LP64
1818   void movsbq(Register dst, Address src);
1819   void movsbq(Register dst, Register src);
1820 
1821   // Move signed 32bit immediate to a 64bit destination, sign-extending it
1822   void movslq(Address  dst, int32_t imm64);
1823 
1824   void movslq(Register dst, Address src);
1825   void movslq(Register dst, Register src);
1826 #endif
1827 
1828   void movswl(Register dst, Address src);
1829   void movswl(Register dst, Register src);
1830 
1831 #ifdef _LP64
1832   void movswq(Register dst, Address src);
1833   void movswq(Register dst, Register src);
1834 #endif
1835 
1836   void movups(XMMRegister dst, Address src);
1837   void vmovups(XMMRegister dst, Address src, int vector_len);
1838   void movups(Address dst, XMMRegister src);
1839   void vmovups(Address dst, XMMRegister src, int vector_len);
1840 
1841   void movw(Address dst, int imm16);
1842   void movw(Register dst, Address src);
1843   void movw(Address dst, Register src);
1844 
1845   void movzbl(Register dst, Address src);
1846   void movzbl(Register dst, Register src);
1847 
1848 #ifdef _LP64
1849   void movzbq(Register dst, Address src);
1850   void movzbq(Register dst, Register src);
1851 #endif
1852 
1853   void movzwl(Register dst, Address src);
1854   void movzwl(Register dst, Register src);
1855 
1856 #ifdef _LP64
1857   void movzwq(Register dst, Address src);
1858   void movzwq(Register dst, Register src);
1859 #endif
1860 
1861   // Unsigned multiply with RAX destination register
1862   void mull(Address src);
1863   void emull(Address src, bool no_flags);
1864   void mull(Register src);
1865   void emull(Register src, bool no_flags);
1866 
1867 #ifdef _LP64
1868   void mulq(Address src);
1869   void emulq(Address src, bool no_flags);
1870   void mulq(Register src);
1871   void emulq(Register src, bool no_flags);
1872   void mulxq(Register dst1, Register dst2, Register src);
1873 #endif
1874 
1875   // Multiply Scalar Double-Precision Floating-Point Values
1876   void mulsd(XMMRegister dst, Address src);
1877   void mulsd(XMMRegister dst, XMMRegister src);
1878 
1879   // Multiply Scalar Single-Precision Floating-Point Values
1880   void mulss(XMMRegister dst, Address src);
1881   void mulss(XMMRegister dst, XMMRegister src);
1882 
1883   void negl(Register dst);
1884   void enegl(Register dst, Register src, bool no_flags);
1885   void negl(Address dst);
1886   void enegl(Register dst, Address src, bool no_flags);
1887 
1888 #ifdef _LP64
1889   void negq(Register dst);
1890   void enegq(Register dst, Register src, bool no_flags);
1891   void negq(Address dst);
1892   void enegq(Register dst, Address src, bool no_flags);
1893 #endif
1894 
1895   void nop(uint i = 1);
1896 
1897   void notl(Register dst);
1898   void enotl(Register dst, Register src);
1899 
1900 #ifdef _LP64
1901   void notq(Register dst);
1902   void enotq(Register dst, Register src);
1903 
1904   void btsq(Address dst, int imm8);
1905   void btrq(Address dst, int imm8);
1906   void btq(Register src, int imm8);
1907 #endif
1908   void btq(Register dst, Register src);
1909 
1910   void eorw(Register dst, Register src1, Register src2, bool no_flags);
1911 
1912   void orl(Address dst, int32_t imm32);
1913   void eorl(Register dst, Address  src, int32_t imm32, bool no_flags);
1914   void orl(Register dst, int32_t imm32);
1915   void eorl(Register dst, Register src, int32_t imm32, bool no_flags);
1916   void orl(Register dst, Address src);
1917   void eorl(Register dst, Register src1, Address src2, bool no_flags);
1918   void orl(Register dst, Register src);
1919   void eorl(Register dst, Register src1, Register src2, bool no_flags);
1920   void orl(Address dst, Register src);
1921   void eorl(Register dst, Address src1, Register src2, bool no_flags);
1922 
1923   void orb(Address dst, int imm8);
1924   void eorb(Register dst, Address  src, int imm8, bool no_flags);
1925   void orb(Address dst, Register src);
1926   void eorb(Register dst, Address src1, Register src2, bool no_flags);
1927 
1928   void orq(Address dst, int32_t imm32);
1929   void eorq(Register dst, Address  src, int32_t imm32, bool no_flags);
1930   void orq(Address dst, Register src);
1931   void eorq(Register dst, Address src1, Register src2, bool no_flags);
1932   void orq(Register dst, int32_t imm32);
1933   void eorq(Register dst, Register src, int32_t imm32, bool no_flags);
1934   void orq_imm32(Register dst, int32_t imm32);
1935   void eorq_imm32(Register dst, Register src, int32_t imm32, bool no_flags);
1936   void orq(Register dst, Address src);
1937   void eorq(Register dst, Register src1, Address src2, bool no_flags);
1938   void orq(Register dst, Register src);
1939   void eorq(Register dst, Register src1, Register src2, bool no_flags);
1940 
1941   // Pack with signed saturation
1942   void packsswb(XMMRegister dst, XMMRegister src);
1943   void vpacksswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1944   void packssdw(XMMRegister dst, XMMRegister src);
1945   void vpackssdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1946 
1947   // Pack with unsigned saturation
1948   void packuswb(XMMRegister dst, XMMRegister src);
1949   void packuswb(XMMRegister dst, Address src);
1950   void packusdw(XMMRegister dst, XMMRegister src);
1951   void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1952   void vpackusdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1953 
1954   // Permutations
1955   void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1956   void vpermq(XMMRegister dst, XMMRegister src, int imm8);
1957   void vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1958   void vpermb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1959   void vpermb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1960   void vpermw(XMMRegister dst,  XMMRegister nds, XMMRegister src, int vector_len);
1961   void vpermd(XMMRegister dst,  XMMRegister nds, Address src, int vector_len);
1962   void vpermd(XMMRegister dst,  XMMRegister nds, XMMRegister src, int vector_len);
1963   void vpermps(XMMRegister dst,  XMMRegister nds, XMMRegister src, int vector_len);
1964   void vperm2i128(XMMRegister dst,  XMMRegister nds, XMMRegister src, int imm8);
1965   void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
1966   void vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1967   void vpermilps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1968   void vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1969   void vpermpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1970   void evpmultishiftqb(XMMRegister dst, XMMRegister ctl, XMMRegister src, int vector_len);
1971   void evpermi2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1972   void evpermi2w(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1973   void evpermi2d(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1974   void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1975   void evpermi2ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1976   void evpermi2pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1977   void evpermt2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1978   void evpermt2w(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1979   void evpermt2d(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1980   void evpermt2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1981 
1982   void pause();
1983 
1984   // Undefined Instruction
1985   void ud2();
1986 
1987   // SSE4.2 string instructions
1988   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1989   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1990 
1991   void pcmpeqb(XMMRegister dst, XMMRegister src);
1992   void vpcmpCCbwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len);
1993 
1994   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1995   void vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
1996   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1997   void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1998   void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
1999 
2000   void vpcmpgtb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2001   void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
2002   void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
2003 
2004   void evpcmpub(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
2005 
2006   void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
2007   void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
2008 
2009   void evpcmpud(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
2010   void evpcmpuq(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
2011 
2012   void pcmpeqw(XMMRegister dst, XMMRegister src);
2013   void vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2014   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2015   void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
2016   void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
2017 
2018   void vpcmpgtw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2019 
2020   void pcmpeqd(XMMRegister dst, XMMRegister src);
2021   void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2022   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, int vector_len);
2023   void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
2024 
2025   void pcmpeqq(XMMRegister dst, XMMRegister src);
2026   void evpcmpeqq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, int vector_len);
2027   void vpcmpCCq(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len);
2028   void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2029   void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
2030   void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);
2031 
2032   void pcmpgtq(XMMRegister dst, XMMRegister src);
2033   void vpcmpgtq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2034 
2035   void pmovmskb(Register dst, XMMRegister src);
2036   void vpmovmskb(Register dst, XMMRegister src, int vec_enc);
2037   void vmovmskps(Register dst, XMMRegister src, int vec_enc);
2038   void vmovmskpd(Register dst, XMMRegister src, int vec_enc);
2039   void vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2040   void vpmaskmovq(XMMRegister dst, XMMRegister mask, Address src, int vector_len);
2041 
2042 
2043   void vmaskmovps(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
2044   void vmaskmovpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
2045   void vmaskmovps(Address dst, XMMRegister src, XMMRegister mask, int vector_len);
2046   void vmaskmovpd(Address dst, XMMRegister src, XMMRegister mask, int vector_len);
2047 
2048   // SSE 4.1 extract
2049   void pextrd(Register dst, XMMRegister src, int imm8);
2050   void pextrq(Register dst, XMMRegister src, int imm8);
2051   void pextrd(Address dst, XMMRegister src, int imm8);
2052   void pextrq(Address dst, XMMRegister src, int imm8);
2053   void pextrb(Register dst, XMMRegister src, int imm8);
2054   void pextrb(Address dst, XMMRegister src, int imm8);
2055   // SSE 2 extract
2056   void pextrw(Register dst, XMMRegister src, int imm8);
2057   void pextrw(Address dst, XMMRegister src, int imm8);
2058 
2059   // SSE 4.1 insert
2060   void pinsrd(XMMRegister dst, Register src, int imm8);
2061   void pinsrq(XMMRegister dst, Register src, int imm8);
2062   void pinsrb(XMMRegister dst, Register src, int imm8);
2063   void pinsrd(XMMRegister dst, Address src, int imm8);
2064   void pinsrq(XMMRegister dst, Address src, int imm8);
2065   void pinsrb(XMMRegister dst, Address src, int imm8);
2066   void insertps(XMMRegister dst, XMMRegister src, int imm8);
2067   // SSE 2 insert
2068   void pinsrw(XMMRegister dst, Register src, int imm8);
2069   void pinsrw(XMMRegister dst, Address src, int imm8);
2070 
2071   // AVX insert
2072   void vpinsrd(XMMRegister dst, XMMRegister nds, Register src, int imm8);
2073   void vpinsrb(XMMRegister dst, XMMRegister nds, Register src, int imm8);
2074   void vpinsrq(XMMRegister dst, XMMRegister nds, Register src, int imm8);
2075   void vpinsrw(XMMRegister dst, XMMRegister nds, Register src, int imm8);
2076   void vinsertps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
2077 
2078   // Zero extend moves
2079   void pmovzxbw(XMMRegister dst, XMMRegister src);
2080   void pmovzxbw(XMMRegister dst, Address src);
2081   void pmovzxbd(XMMRegister dst, XMMRegister src);
2082   void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
2083   void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len);
2084   void vpmovzxbd(XMMRegister dst, XMMRegister src, int vector_len);
2085   void vpmovzxbq(XMMRegister dst, XMMRegister src, int vector_len);
2086   void vpmovzxwd(XMMRegister dst, XMMRegister src, int vector_len);
2087   void vpmovzxwq(XMMRegister dst, XMMRegister src, int vector_len);
2088   void pmovzxdq(XMMRegister dst, XMMRegister src);
2089   void vpmovzxdq(XMMRegister dst, XMMRegister src, int vector_len);
2090   void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);
2091   void evpmovzxbd(XMMRegister dst, KRegister mask, Address src, int vector_len);
2092   void evpmovzxbd(XMMRegister dst, Address src, int vector_len);
2093 
2094   // Sign extend moves
2095   void pmovsxbd(XMMRegister dst, XMMRegister src);
2096   void pmovsxbq(XMMRegister dst, XMMRegister src);
2097   void pmovsxbw(XMMRegister dst, XMMRegister src);
2098   void pmovsxwd(XMMRegister dst, XMMRegister src);
2099   void vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len);
2100   void vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len);
2101   void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len);
2102   void vpmovsxwd(XMMRegister dst, XMMRegister src, int vector_len);
2103   void vpmovsxwq(XMMRegister dst, XMMRegister src, int vector_len);
2104   void vpmovsxdq(XMMRegister dst, XMMRegister src, int vector_len);
2105 
2106   void evpmovwb(Address dst, XMMRegister src, int vector_len);
2107   void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);
2108   void evpmovdb(Address dst, XMMRegister src, int vector_len);
2109 
2110   // Multiply add
2111   void pmaddwd(XMMRegister dst, XMMRegister src);
2112   void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2113   void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2114   void vpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2115   void vpmadd52luq(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2116   void evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2117   void evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
2118   void vpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2119   void vpmadd52huq(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2120   void evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2121   void evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
2122 
2123   // Multiply add accumulate
2124   void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2125 
2126 #ifndef _LP64 // no 32bit push/pop on amd64
2127   void popl(Address dst);
2128 #endif
2129 
2130 #ifdef _LP64
2131   void popq(Address dst);
2132   void popq(Register dst);
2133 #endif
2134 
2135   void popcntl(Register dst, Address src);
2136   void epopcntl(Register dst, Address src, bool no_flags);
2137   void popcntl(Register dst, Register src);
2138   void epopcntl(Register dst, Register src, bool no_flags);
2139 
2140   void evpopcntb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2141   void evpopcntw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2142   void evpopcntd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2143   void evpopcntq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2144 
2145 #ifdef _LP64
2146   void popcntq(Register dst, Address src);
2147   void epopcntq(Register dst, Address src, bool no_flags);
2148   void popcntq(Register dst, Register src);
2149   void epopcntq(Register dst, Register src, bool no_flags);
2150 #endif
2151 
2152   // Prefetches (SSE, SSE2, 3DNOW only)
2153 
2154   void prefetchnta(Address src);
2155   void prefetchr(Address src);
2156   void prefetcht0(Address src);
2157   void prefetcht1(Address src);
2158   void prefetcht2(Address src);
2159   void prefetchw(Address src);
2160 
2161   // Shuffle Bytes
2162   void pshufb(XMMRegister dst, XMMRegister src);
2163   void pshufb(XMMRegister dst, Address src);
2164   void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2165   void vpshufb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2166   void evpshufb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2167 
2168 
2169   // Shuffle Packed Doublewords
2170   void pshufd(XMMRegister dst, XMMRegister src, int mode);
2171   void pshufd(XMMRegister dst, Address src,     int mode);
2172   void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len);
2173 
2174   // Shuffle Packed High/Low Words
2175   void pshufhw(XMMRegister dst, XMMRegister src, int mode);
2176   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
2177   void pshuflw(XMMRegister dst, Address src,     int mode);
2178   void vpshufhw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
2179   void vpshuflw(XMMRegister dst, XMMRegister src, int mode, int vector_len);
2180 
2181   // Shuffle floats and doubles
2182   void shufps(XMMRegister, XMMRegister, int);
2183   void shufpd(XMMRegister, XMMRegister, int);
2184   void vshufps(XMMRegister, XMMRegister, XMMRegister, int, int);
2185   void vshufpd(XMMRegister, XMMRegister, XMMRegister, int, int);
2186 
2187   // Shuffle packed values at 128 bit granularity
2188   void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
2189 
2190   // Shift Right by bytes Logical DoubleQuadword Immediate
2191   void psrldq(XMMRegister dst, int shift);
2192   // Shift Left by bytes Logical DoubleQuadword Immediate
2193   void pslldq(XMMRegister dst, int shift);
2194 
2195   // Logical Compare 128bit
2196   void ptest(XMMRegister dst, XMMRegister src);
2197   void ptest(XMMRegister dst, Address src);
2198   // Logical Compare 256bit
2199   void vptest(XMMRegister dst, XMMRegister src);
2200   void vptest(XMMRegister dst, Address src);
2201 
2202   void evptestmb(KRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2203   void evptestmd(KRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2204   void evptestnmd(KRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2205 
2206   // Vector compare
2207   void vptest(XMMRegister dst, XMMRegister src, int vector_len);
2208   void vtestps(XMMRegister dst, XMMRegister src, int vector_len);
2209 
2210   // Interleave Low Bytes
2211   void punpcklbw(XMMRegister dst, XMMRegister src);
2212   void punpcklbw(XMMRegister dst, Address src);
2213 
2214   // Interleave Low Doublewords
2215   void punpckldq(XMMRegister dst, XMMRegister src);
2216   void punpckldq(XMMRegister dst, Address src);
2217   void vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2218   void vpunpcklqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2219 
2220 
2221   // Interleave High Word
2222   void vpunpckhwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2223 
2224   // Interleave Low Word
2225   void vpunpcklwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2226 
2227   // Interleave High Doublewords
2228   void vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2229   void vpunpckhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2230 
2231   // Interleave Low Quadwords
2232   void punpcklqdq(XMMRegister dst, XMMRegister src);
2233 
2234   void evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2235   void evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
2236   void evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2237   void evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
2238 
2239   // Vector sum of absolute differences.
2240   void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2241 
2242 #ifndef _LP64 // no 32bit push/pop on amd64
2243   void pushl(Address src);
2244 #endif
2245 
2246   void pushq(Address src);
2247 
2248   void rcll(Register dst, int imm8);
2249   void ercll(Register dst, Register src, int imm8);
2250 
2251   void rclq(Register dst, int imm8);
2252   void erclq(Register dst, Register src, int imm8);
2253 
2254   void rcrq(Register dst, int imm8);
2255   void ercrq(Register dst, Register src, int imm8);
2256 
2257   void rcpps(XMMRegister dst, XMMRegister src);
2258 
2259   void rcpss(XMMRegister dst, XMMRegister src);
2260 
2261   void rdtsc();
2262 
2263   void ret(int imm16);
2264 
2265   void roll(Register dst);
2266   void eroll(Register dst, Register src, bool no_flags);
2267 
2268   void roll(Register dst, int imm8);
2269   void eroll(Register dst, Register src, int imm8, bool no_flags);
2270 
2271   void rorl(Register dst);
2272   void erorl(Register dst, Register src, bool no_flags);
2273 
2274   void rorl(Register dst, int imm8);
2275   void erorl(Register dst, Register src, int imm8, bool no_flags);
2276 
2277 #ifdef _LP64
2278   void rolq(Register dst);
2279   void erolq(Register dst, Register src, bool no_flags);
2280   void rolq(Register dst, int imm8);
2281   void erolq(Register dst, Register src, int imm8, bool no_flags);
2282   void rorq(Register dst);
2283   void erorq(Register dst, Register src, bool no_flags);
2284   void rorq(Register dst, int imm8);
2285   void erorq(Register dst, Register src, int imm8, bool no_flags);
2286   void rorxl(Register dst, Register src, int imm8);
2287   void rorxl(Register dst, Address src, int imm8);
2288   void rorxq(Register dst, Register src, int imm8);
2289   void rorxq(Register dst, Address src, int imm8);
2290 #endif
2291 
2292   void sahf();
2293 
2294   void sall(Register dst, int imm8);
2295   void esall(Register dst, Register src, int imm8, bool no_flags);
2296   void sall(Register dst);
2297   void esall(Register dst, Register src, bool no_flags);
2298   void sall(Address dst, int imm8);
2299   void esall(Register dst, Address src, int imm8, bool no_flags);
2300   void sall(Address dst);
2301   void esall(Register dst, Address src, bool no_flags);
2302 
2303   void sarl(Address dst, int imm8);
2304   void esarl(Register dst, Address src, int imm8, bool no_flags);
2305   void sarl(Address dst);
2306   void esarl(Register dst, Address src, bool no_flags);
2307   void sarl(Register dst, int imm8);
2308   void esarl(Register dst, Register src, int imm8, bool no_flags);
2309   void sarl(Register dst);
2310   void esarl(Register dst, Register src, bool no_flags);
2311 
2312 #ifdef _LP64
2313   void salq(Register dst, int imm8);
2314   void esalq(Register dst, Register src, int imm8, bool no_flags);
2315   void salq(Register dst);
2316   void esalq(Register dst, Register src, bool no_flags);
2317   void salq(Address dst, int imm8);
2318   void esalq(Register dst, Address src, int imm8, bool no_flags);
2319   void salq(Address dst);
2320   void esalq(Register dst, Address src, bool no_flags);
2321 
2322   void sarq(Address dst, int imm8);
2323   void esarq(Register dst, Address src, int imm8, bool no_flags);
2324   void sarq(Address dst);
2325   void esarq(Register dst, Address src, bool no_flags);
2326   void sarq(Register dst, int imm8);
2327   void esarq(Register dst, Register src, int imm8, bool no_flags);
2328   void sarq(Register dst);
2329   void esarq(Register dst, Register src, bool no_flags);
2330 #endif
2331 
2332   void sbbl(Address dst, int32_t imm32);
2333   void sbbl(Register dst, int32_t imm32);
2334   void sbbl(Register dst, Address src);
2335   void sbbl(Register dst, Register src);
2336 
2337   void sbbq(Address dst, int32_t imm32);
2338   void sbbq(Register dst, int32_t imm32);
2339   void sbbq(Register dst, Address src);
2340   void sbbq(Register dst, Register src);
2341 
2342   void setb(Condition cc, Register dst);
2343 
2344   void palignr(XMMRegister dst, XMMRegister src, int imm8);
2345   void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
2346   void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
2347 
2348   void pblendw(XMMRegister dst, XMMRegister src, int imm8);
2349   void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
2350 
2351   void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
2352   void sha1nexte(XMMRegister dst, XMMRegister src);
2353   void sha1msg1(XMMRegister dst, XMMRegister src);
2354   void sha1msg2(XMMRegister dst, XMMRegister src);
2355   // xmm0 is an implicit additional source to the following instruction.
2356   void sha256rnds2(XMMRegister dst, XMMRegister src);
2357   void sha256msg1(XMMRegister dst, XMMRegister src);
2358   void sha256msg2(XMMRegister dst, XMMRegister src);
2359   void sha512rnds2(XMMRegister dst, XMMRegister nds, XMMRegister src);
2360   void sha512msg1(XMMRegister dst, XMMRegister src);
2361   void sha512msg2(XMMRegister dst, XMMRegister src);
2362 
2363   void shldl(Register dst, Register src);
2364   void eshldl(Register dst, Register src1, Register src2, bool no_flags);
2365   void shldl(Register dst, Register src, int8_t imm8);
2366   void eshldl(Register dst, Register src1, Register src2, int8_t imm8, bool no_flags);
2367   void shrdl(Register dst, Register src);
2368   void eshrdl(Register dst, Register src1, Register src2, bool no_flags);
2369   void shrdl(Register dst, Register src, int8_t imm8);
2370   void eshrdl(Register dst, Register src1, Register src2, int8_t imm8, bool no_flags);
2371 #ifdef _LP64
2372   void shldq(Register dst, Register src, int8_t imm8);
2373   void eshldq(Register dst, Register src1, Register src2, int8_t imm8, bool no_flags);
2374   void shrdq(Register dst, Register src, int8_t imm8);
2375   void eshrdq(Register dst, Register src1, Register src2, int8_t imm8, bool no_flags);
2376 #endif
2377 
2378   void shll(Register dst, int imm8);
2379   void eshll(Register dst, Register src, int imm8, bool no_flags);
2380   void shll(Register dst);
2381   void eshll(Register dst, Register src, bool no_flags);
2382 
2383   void shlq(Register dst, int imm8);
2384   void eshlq(Register dst, Register src, int imm8, bool no_flags);
2385   void shlq(Register dst);
2386   void eshlq(Register dst, Register src, bool no_flags);
2387 
2388   void shrl(Register dst, int imm8);
2389   void eshrl(Register dst, Register src, int imm8, bool no_flags);
2390   void shrl(Register dst);
2391   void eshrl(Register dst, Register src, bool no_flags);
2392   void shrl(Address dst);
2393   void eshrl(Register dst, Address src, bool no_flags);
2394   void shrl(Address dst, int imm8);
2395   void eshrl(Register dst, Address src, int imm8, bool no_flags);
2396 
2397   void shrq(Register dst, int imm8);
2398   void eshrq(Register dst, Register src, int imm8, bool no_flags);
2399   void shrq(Register dst);
2400   void eshrq(Register dst, Register src, bool no_flags);
2401   void shrq(Address dst);
2402   void eshrq(Register dst, Address src, bool no_flags);
2403   void shrq(Address dst, int imm8);
2404   void eshrq(Register dst, Address src, int imm8, bool no_flags);
2405 
2406   void smovl(); // QQQ generic?
2407 
2408   // Compute Square Root of Scalar Double-Precision Floating-Point Value
2409   void sqrtsd(XMMRegister dst, Address src);
2410   void sqrtsd(XMMRegister dst, XMMRegister src);
2411 
2412   void roundsd(XMMRegister dst, Address src, int32_t rmode);
2413   void roundsd(XMMRegister dst, XMMRegister src, int32_t rmode);
2414 
2415   // Compute Square Root of Scalar Single-Precision Floating-Point Value
2416   void sqrtss(XMMRegister dst, Address src);
2417   void sqrtss(XMMRegister dst, XMMRegister src);
2418 
2419   void std();
2420 
2421   void stmxcsr( Address dst );
2422 
2423   void subl(Address dst, int32_t imm32);
2424   void esubl(Register dst, Address src, int32_t imm32, bool no_flags);
2425   void subl(Address dst, Register src);
2426   void esubl(Register dst, Address src1, Register src2, bool no_flags);
2427   void subl(Register dst, int32_t imm32);
2428   void esubl(Register dst, Register src, int32_t imm32, bool no_flags);
2429   void subl(Register dst, Address src);
2430   void esubl(Register dst, Register src1, Address src2, bool no_flags);
2431   void subl(Register dst, Register src);
2432   void esubl(Register dst, Register src1, Register src2, bool no_flags);
2433 
2434   void subq(Address dst, int32_t imm32);
2435   void esubq(Register dst, Address src, int32_t imm32, bool no_flags);
2436   void subq(Address dst, Register src);
2437   void esubq(Register dst, Address src1, Register src2, bool no_flags);
2438   void subq(Register dst, int32_t imm32);
2439   void esubq(Register dst, Register src, int32_t imm32, bool no_flags);
2440   void subq(Register dst, Address src);
2441   void esubq(Register dst, Register src1, Address src2, bool no_flags);
2442   void subq(Register dst, Register src);
2443   void esubq(Register dst, Register src1, Register src2, bool no_flags);
2444 
2445   // Force generation of a 4 byte immediate value even if it fits into 8bit
2446   void subl_imm32(Register dst, int32_t imm32);
2447   void esubl_imm32(Register dst, Register src, int32_t imm32, bool no_flags);
2448   void subq_imm32(Register dst, int32_t imm32);
2449   void esubq_imm32(Register dst, Register src, int32_t imm32, bool no_flags);
2450 
2451   // Subtract Scalar Double-Precision Floating-Point Values
2452   void subsd(XMMRegister dst, Address src);
2453   void subsd(XMMRegister dst, XMMRegister src);
2454 
2455   // Subtract Scalar Single-Precision Floating-Point Values
2456   void subss(XMMRegister dst, Address src);
2457   void subss(XMMRegister dst, XMMRegister src);
2458 
2459   void testb(Address dst, int imm8);
2460   void testb(Register dst, int imm8, bool use_ral = true);
2461 
2462   void testl(Address dst, int32_t imm32);
2463   void testl(Register dst, int32_t imm32);
2464   void testl(Register dst, Register src);
2465   void testl(Register dst, Address src);
2466 
2467   void testq(Address dst, int32_t imm32);
2468   void testq(Register dst, int32_t imm32);
2469   void testq(Register dst, Register src);
2470   void testq(Register dst, Address src);
2471 
2472   // BMI - count trailing zeros
2473   void tzcntl(Register dst, Register src);
2474   void etzcntl(Register dst, Register src, bool no_flags);
2475   void tzcntl(Register dst, Address src);
2476   void etzcntl(Register dst, Address src, bool no_flags);
2477   void tzcntq(Register dst, Register src);
2478   void etzcntq(Register dst, Register src, bool no_flags);
2479   void tzcntq(Register dst, Address src);
2480   void etzcntq(Register dst, Address src, bool no_flags);
2481 
2482   // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
2483   void ucomisd(XMMRegister dst, Address src);
2484   void ucomisd(XMMRegister dst, XMMRegister src);
2485 
2486   // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
2487   void ucomiss(XMMRegister dst, Address src);
2488   void ucomiss(XMMRegister dst, XMMRegister src);
2489 
2490   void xabort(int8_t imm8);
2491 
2492   void xaddb(Address dst, Register src);
2493   void xaddw(Address dst, Register src);
2494   void xaddl(Address dst, Register src);
2495   void xaddq(Address dst, Register src);
2496 
2497   void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);
2498 
2499   void xchgb(Register reg, Address adr);
2500   void xchgw(Register reg, Address adr);
2501   void xchgl(Register reg, Address adr);
2502   void xchgl(Register dst, Register src);
2503 
2504   void xchgq(Register reg, Address adr);
2505   void xchgq(Register dst, Register src);
2506 
2507   void xend();
2508 
2509   // Get Value of Extended Control Register
2510   void xgetbv();
2511 
2512   void xorl(Register dst, int32_t imm32);
2513   void exorl(Register dst, Register src, int32_t imm32, bool no_flags);
2514   void xorl(Address dst, int32_t imm32);
2515   void exorl(Register dst, Address  src, int32_t imm32, bool no_flags);
2516   void xorl(Register dst, Address src);
2517   void exorl(Register dst, Register src1, Address src2, bool no_flags);
2518   void xorl(Register dst, Register src);
2519   void exorl(Register dst, Register src1, Register src2, bool no_flags);
2520   void xorl(Address dst, Register src);
2521   void exorl(Register dst, Address src1, Register src2, bool no_flags);
2522 
2523   void xorb(Address dst, Register src);
2524   void exorb(Register dst, Address src1, Register src2, bool no_flags);
2525   void xorb(Register dst, Address src);
2526   void exorb(Register dst, Register src1, Address src2, bool no_flags);
2527   void xorw(Register dst, Address src);
2528   void exorw(Register dst, Register src1, Address src2, bool no_flags);
2529 
2530   void xorq(Register dst, Address src);
2531   void exorq(Register dst, Register src1, Address src2, bool no_flags);
2532   void xorq(Address dst, int32_t imm32);
2533   void exorq(Register dst, Address  src, int32_t imm32, bool no_flags);
2534   void xorq(Register dst, Register src);
2535   void exorq(Register dst, Register src1, Register src2, bool no_flags);
2536   void xorq(Register dst, int32_t imm32);
2537   void exorq(Register dst, Register src, int32_t imm32, bool no_flags);
2538   void xorq(Address dst, Register src);
2539   void exorq(Register dst, Address src1, Register src2, bool no_flags);
2540 
2541   // AVX 3-operands scalar instructions (encoded with VEX prefix)
2542 
2543   void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
2544   void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2545   void vaddss(XMMRegister dst, XMMRegister nds, Address src);
2546   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2547   void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
2548   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2549   void evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode);
2550   void vdivss(XMMRegister dst, XMMRegister nds, Address src);
2551   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2552   void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2553   void vfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2554   void evfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode);
2555   void vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
2556   void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2557   void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
2558   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2559   void vmulss(XMMRegister dst, XMMRegister nds, Address src);
2560   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2561   void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
2562   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2563   void vsubss(XMMRegister dst, XMMRegister nds, Address src);
2564   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2565 
2566   void vmaxss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2567   void vmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2568   void vminss(XMMRegister dst, XMMRegister nds, XMMRegister src);
2569   void vminsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
2570 
2571   void sarxl(Register dst, Register src1, Register src2);
2572   void sarxl(Register dst, Address src1, Register src2);
2573   void sarxq(Register dst, Register src1, Register src2);
2574   void sarxq(Register dst, Address src1, Register src2);
2575   void shlxl(Register dst, Register src1, Register src2);
2576   void shlxl(Register dst, Address src1, Register src2);
2577   void shlxq(Register dst, Register src1, Register src2);
2578   void shlxq(Register dst, Address src1, Register src2);
2579   void shrxl(Register dst, Register src1, Register src2);
2580   void shrxl(Register dst, Address src1, Register src2);
2581   void shrxq(Register dst, Register src1, Register src2);
2582   void shrxq(Register dst, Address src1, Register src2);
2583 
2584   void bzhiq(Register dst, Register src1, Register src2);
2585   void bzhil(Register dst, Register src1, Register src2);
2586 
2587   void pextl(Register dst, Register src1, Register src2);
2588   void pdepl(Register dst, Register src1, Register src2);
2589   void pextq(Register dst, Register src1, Register src2);
2590   void pdepq(Register dst, Register src1, Register src2);
2591   void pextl(Register dst, Register src1, Address src2);
2592   void pdepl(Register dst, Register src1, Address src2);
2593   void pextq(Register dst, Register src1, Address src2);
2594   void pdepq(Register dst, Register src1, Address src2);
2595 
2596 
2597   //====================VECTOR ARITHMETIC=====================================
2598   // Add Packed Floating-Point Values
2599   void addpd(XMMRegister dst, XMMRegister src);
2600   void addpd(XMMRegister dst, Address src);
2601   void addps(XMMRegister dst, XMMRegister src);
2602   void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2603   void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2604   void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2605   void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2606 
2607   // Subtract Packed Floating-Point Values
2608   void subpd(XMMRegister dst, XMMRegister src);
2609   void subps(XMMRegister dst, XMMRegister src);
2610   void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2611   void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2612   void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2613   void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2614 
2615   // Multiply Packed Floating-Point Values
2616   void mulpd(XMMRegister dst, XMMRegister src);
2617   void mulpd(XMMRegister dst, Address src);
2618   void mulps(XMMRegister dst, XMMRegister src);
2619   void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2620   void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2621   void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2622   void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2623 
2624   void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2625   void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2626   void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2627   void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2628 
2629   // Divide Packed Floating-Point Values
2630   void divpd(XMMRegister dst, XMMRegister src);
2631   void divps(XMMRegister dst, XMMRegister src);
2632   void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2633   void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2634   void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2635   void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2636 
2637   // Sqrt Packed Floating-Point Values
2638   void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
2639   void vsqrtpd(XMMRegister dst, Address src, int vector_len);
2640   void vsqrtps(XMMRegister dst, XMMRegister src, int vector_len);
2641   void vsqrtps(XMMRegister dst, Address src, int vector_len);
2642 
2643   // Round Packed Double-Precision Values.
2644   void vroundpd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
2645   void vroundpd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
2646   void vrndscalesd(XMMRegister dst,  XMMRegister src1,  XMMRegister src2, int32_t rmode);
2647   void vrndscalepd(XMMRegister dst,  XMMRegister src,  int32_t rmode, int vector_len);
2648   void vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
2649   void vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode);
2650   void vroundsd(XMMRegister dst, XMMRegister src, Address src2, int32_t rmode);
2651 
2652   // Bitwise Logical AND of Packed Floating-Point Values
2653   void andpd(XMMRegister dst, XMMRegister src);
2654   void andnpd(XMMRegister dst, XMMRegister src);
2655   void andps(XMMRegister dst, XMMRegister src);
2656   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2657   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2658   void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2659   void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2660 
2661   void unpckhpd(XMMRegister dst, XMMRegister src);
2662   void unpcklpd(XMMRegister dst, XMMRegister src);
2663 
2664   // Bitwise Logical XOR of Packed Floating-Point Values
2665   void xorpd(XMMRegister dst, XMMRegister src);
2666   void xorps(XMMRegister dst, XMMRegister src);
2667   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2668   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2669   void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2670   void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2671 
2672   // Add horizontal packed integers
2673   void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2674   void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2675   void phaddw(XMMRegister dst, XMMRegister src);
2676   void phaddd(XMMRegister dst, XMMRegister src);
2677 
2678   // Add packed integers
2679   void paddb(XMMRegister dst, XMMRegister src);
2680   void paddw(XMMRegister dst, XMMRegister src);
2681   void paddd(XMMRegister dst, XMMRegister src);
2682   void paddd(XMMRegister dst, Address src);
2683   void paddq(XMMRegister dst, XMMRegister src);
2684   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2685   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2686   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2687   void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2688   void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2689   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2690   void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2691   void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2692 
2693   // Saturating packed instructions.
2694   void vpaddsb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2695   void vpaddsw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2696   void vpaddusb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2697   void vpaddusw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2698   void evpaddsb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2699   void evpaddsw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2700   void evpaddusb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2701   void evpaddusw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2702   void vpsubsb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2703   void vpsubsw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2704   void vpsubusb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2705   void vpsubusw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2706   void evpsubsb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2707   void evpsubsw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2708   void evpsubusb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2709   void evpsubusw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2710   void vpaddsb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2711   void vpaddsw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2712   void vpaddusb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2713   void vpaddusw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2714   void evpaddsb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2715   void evpaddsw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2716   void evpaddusb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2717   void evpaddusw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2718   void vpsubsb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2719   void vpsubsw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2720   void vpsubusb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2721   void vpsubusw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2722   void evpsubsb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2723   void evpsubsw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2724   void evpsubusb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2725   void evpsubusw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2726 
2727   // Leaf level assembler routines for masked operations.
2728   void evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2729   void evpaddb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2730   void evpaddw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2731   void evpaddw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2732   void evpaddd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2733   void evpaddd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2734   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2735   void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2736   void evaddps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2737   void evaddps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2738   void evaddpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2739   void evaddpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2740   void evpsubb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2741   void evpsubb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2742   void evpsubw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2743   void evpsubw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2744   void evpsubd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2745   void evpsubd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2746   void evpsubq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2747   void evpsubq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2748   void evsubps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2749   void evsubps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2750   void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2751   void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2752   void evpmulhw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2753   void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2754   void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2755   void evpmulld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2756   void evpmulld(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2757   void evpmullq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2758   void evpmullq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2759   void evmulps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2760   void evmulps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2761   void evmulpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2762   void evmulpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2763   void evdivps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2764   void evdivps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2765   void evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2766   void evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2767   void evpabsb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2768   void evpabsb(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2769   void evpabsw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2770   void evpabsw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2771   void evpabsd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2772   void evpabsd(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2773   void evpabsq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2774   void evpabsq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
2775   void evpfma213ps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2776   void evpfma213ps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2777   void evpfma213pd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2778   void evpfma213pd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2779   void evpermb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2780   void evpermb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2781   void evpermw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2782   void evpermw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2783   void evpermd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2784   void evpermd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2785   void evpermq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2786   void evpermq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2787   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2788   void evpslld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2789   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2790   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2791   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2792   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2793   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2794   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2795   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2796   void evsqrtps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2797   void evsqrtps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2798   void evsqrtpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2799   void evsqrtpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2800 
2801   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2802   void evpslld(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2803   void evpsllq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2804   void evpsrlw(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2805   void evpsrld(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2806   void evpsrlq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2807   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2808   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2809   void evpsraq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2810 
2811   void evpsllvw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2812   void evpsllvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2813   void evpsllvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2814   void evpsrlvw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2815   void evpsrlvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2816   void evpsrlvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2817   void evpsravw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2818   void evpsravd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2819   void evpsravq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2820   void evpmaxsb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2821   void evpmaxsw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2822   void evpmaxsd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2823   void evpmaxsq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2824   void evpminsb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2825   void evpminsw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2826   void evpminsd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2827   void evpminsq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2828   void evpmaxsb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2829   void evpmaxsw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2830   void evpmaxsd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2831   void evpmaxsq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2832   void evpminsb(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2833   void evpminsw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2834   void evpminsd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2835   void evpminsq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2836   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2837   void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2838   void evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2839   void evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2840   void evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2841   void evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2842   void evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2843   void evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2844   void evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2845   void evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2846   void evpxorq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2847   void evpxorq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2848 
2849   void evprold(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2850   void evprolq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2851   void evprolvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2852   void evprolvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2853   void evprord(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2854   void evprorq(XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vector_len);
2855   void evprorvd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2856   void evprorvq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2857 
2858   void evpternlogd(XMMRegister dst, int imm8, KRegister mask, XMMRegister src2, XMMRegister src3, bool merge, int vector_len);
2859   void evpternlogd(XMMRegister dst, int imm8, KRegister mask, XMMRegister src2, Address src3, bool merge, int vector_len);
2860   void evpternlogq(XMMRegister dst, int imm8, KRegister mask, XMMRegister src2, XMMRegister src3, bool merge, int vector_len);
2861   void evpternlogq(XMMRegister dst, int imm8, KRegister mask, XMMRegister src2, Address src3, bool merge, int vector_len);
2862 
2863   void evplzcntd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2864   void evplzcntq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
2865 
2866   // Subtract packed integers
2867   void psubb(XMMRegister dst, XMMRegister src);
2868   void psubw(XMMRegister dst, XMMRegister src);
2869   void psubd(XMMRegister dst, XMMRegister src);
2870   void psubq(XMMRegister dst, XMMRegister src);
2871   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2872   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2873   void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2874   void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2875   void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2876   void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2877   void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2878   void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2879 
2880   // Multiply packed integers (words and dwords; quadword forms require AVX-512)
2881   void pmullw(XMMRegister dst, XMMRegister src);
2882   void pmulld(XMMRegister dst, XMMRegister src);
2883   void pmuludq(XMMRegister dst, XMMRegister src);
2884   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2885   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2886   void evpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2887   void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2888   void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2889   void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2890   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2891   void evpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2892   void vpmulhuw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2893 
2894   // Minimum of packed integers
2895   void pminsb(XMMRegister dst, XMMRegister src);
2896   void vpminsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2897   void pminsw(XMMRegister dst, XMMRegister src);
2898   void vpminsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2899   void pminsd(XMMRegister dst, XMMRegister src);
2900   void vpminsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2901   void vpminsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2902   void minps(XMMRegister dst, XMMRegister src);
2903   void vminps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2904   void minpd(XMMRegister dst, XMMRegister src);
2905   void vminpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2906 
2907   // Maximum of packed integers
2908   void pmaxsb(XMMRegister dst, XMMRegister src);
2909   void vpmaxsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2910   void pmaxsw(XMMRegister dst, XMMRegister src);
2911   void vpmaxsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2912   void pmaxsd(XMMRegister dst, XMMRegister src);
2913   void vpmaxsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2914   void vpmaxsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2915   void maxps(XMMRegister dst, XMMRegister src);
2916   void vmaxps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2917   void maxpd(XMMRegister dst, XMMRegister src);
2918   void vmaxpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2919 
2920   // Unsigned maximum packed integers.
2921   void vpmaxub(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2922   void vpmaxuw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2923   void vpmaxud(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2924   void vpmaxub(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2925   void vpmaxuw(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2926   void vpmaxud(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2927   void evpmaxub(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2928   void evpmaxuw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2929   void evpmaxud(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2930   void evpmaxuq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2931   void evpmaxub(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2932   void evpmaxuw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2933   void evpmaxud(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2934   void evpmaxuq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2935 
2936   // Unsigned minimum packed integers.
2937   void vpminub(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2938   void vpminuw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2939   void vpminud(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
2940   void vpminub(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2941   void vpminuw(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2942   void vpminud(XMMRegister dst, XMMRegister src1, Address src2, int vector_len);
2943   void evpminub(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2944   void evpminuw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2945   void evpminud(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2946   void evpminuq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
2947   void evpminub(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2948   void evpminuw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2949   void evpminud(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2950   void evpminuq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
2951 
2952   // Shift left packed integers
2953   void psllw(XMMRegister dst, int shift);
2954   void pslld(XMMRegister dst, int shift);
2955   void psllq(XMMRegister dst, int shift);
2956   void psllw(XMMRegister dst, XMMRegister shift);
2957   void pslld(XMMRegister dst, XMMRegister shift);
2958   void psllq(XMMRegister dst, XMMRegister shift);
2959   void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2960   void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2961   void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2962   void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2963   void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2964   void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2965   void vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2966 
2967   // Logical shift right packed integers
2968   void psrlw(XMMRegister dst, int shift);
2969   void psrld(XMMRegister dst, int shift);
2970   void psrlq(XMMRegister dst, int shift);
2971   void psrlw(XMMRegister dst, XMMRegister shift);
2972   void psrld(XMMRegister dst, XMMRegister shift);
2973   void psrlq(XMMRegister dst, XMMRegister shift);
2974   void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2975   void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2976   void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2977   void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2978   void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2979   void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2980   void vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2981   void evpsrlvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2982   void evpsllvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2983 
2984   // Arithmetic shift right packed integers (shorts and ints; quadword shifts require AVX-512 evpsraq)
2985   void psraw(XMMRegister dst, int shift);
2986   void psrad(XMMRegister dst, int shift);
2987   void psraw(XMMRegister dst, XMMRegister shift);
2988   void psrad(XMMRegister dst, XMMRegister shift);
2989   void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2990   void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2991   void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2992   void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2993   void evpsravw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2994   void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2995   void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2996 
2997   // Variable shift left packed integers
2998   void vpsllvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2999   void vpsllvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3000 
3001   // Variable shift right packed integers
3002   void vpsrlvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3003   void vpsrlvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3004 
3005   // Variable shift right arithmetic packed integers
3006   void vpsravd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3007   void evpsravq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3008 
3009   void vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3010   void vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3011 
3012   // And packed integers
3013   void pand(XMMRegister dst, XMMRegister src);
3014   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3015   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
3016   void evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3017   void evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
3018 
3019   // Andn packed integers
3020   void pandn(XMMRegister dst, XMMRegister src);
3021   void vpandn(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3022 
3023   // Or packed integers
3024   void por(XMMRegister dst, XMMRegister src);
3025   void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3026   void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
3027   void evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3028   void evporq(XMMRegister dst, XMMRegister nds, Address     src, int vector_len);
3029 
3030   // Xor packed integers
3031   void pxor(XMMRegister dst, XMMRegister src);
3032   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3033   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
3034   void vpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3035   void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3036   void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
3037 
3038   // Ternary logic instruction.
3039   void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
3040   void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address     src3, int vector_len);
3041   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
3042   void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address     src3, int vector_len);
3043 
3044   // Vector compress/expand instructions.
3045   void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3046   void evpcompressw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3047   void evpcompressd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3048   void evpcompressq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3049   void evcompressps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3050   void evcompresspd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3051 
3052   void evpexpandb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3053   void evpexpandw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3054   void evpexpandd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3055   void evpexpandq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3056   void evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3057   void evexpandpd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
3058 
3059   // Vector Rotate Left/Right instruction.
3060   void evprolvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3061   void evprolvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3062   void evprorvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3063   void evprorvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
3064   void evprold(XMMRegister dst, XMMRegister src, int shift, int vector_len);
3065   void evprolq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
3066   void evprord(XMMRegister dst, XMMRegister src, int shift, int vector_len);
3067   void evprorq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
3068 
3069   // vinserti forms
3070   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
3071   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
3072   void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
3073   void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
3074   void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
3075   void evinserti64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len);
3076 
3077   // vinsertf forms
3078   void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
3079   void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
3080   void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
3081   void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
3082   void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
3083   void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
3084 
3085   // vextracti forms
3086   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
3087   void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
3088   void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
3089   void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
3090   void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
3091   void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
3092   void vextracti64x4(Address dst, XMMRegister src, uint8_t imm8);
3093 
3094   // vextractf forms
3095   void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
3096   void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
3097   void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
3098   void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
3099   void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
3100   void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
3101   void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
3102 
3103   void extractps(Register dst, XMMRegister src, uint8_t imm8);
3104 
3105   // xmm/mem sourced byte/word/dword/qword replicate
3106   void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
3107   void vpbroadcastb(XMMRegister dst, Address src, int vector_len);
3108   void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
3109   void vpbroadcastw(XMMRegister dst, Address src, int vector_len);
3110   void vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
3111   void vpbroadcastd(XMMRegister dst, Address src, int vector_len);
3112   void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
3113   void vpbroadcastq(XMMRegister dst, Address src, int vector_len);
3114 
3115   void evbroadcasti32x4(XMMRegister dst, Address src, int vector_len);
3116   void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
3117   void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
3118   void vbroadcasti128(XMMRegister dst, Address src, int vector_len);
3119 
3120   // scalar single/double/128-bit precision replicate
3121   void vbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
3122   void vbroadcastss(XMMRegister dst, Address src, int vector_len);
3123   void vbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
3124   void vbroadcastsd(XMMRegister dst, Address src, int vector_len);
3125   void vbroadcastf128(XMMRegister dst, Address src, int vector_len);
3126   void evbroadcastf64x2(XMMRegister dst, Address src, int vector_len);
3127 
3128   // gpr sourced byte/word/dword/qword replicate
3129   void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
3130   void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
3131   void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
3132   void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
3133 
3134   // Gather AVX2 and AVX3
3135   void vpgatherdd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
3136   void vpgatherdq(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
3137   void vgatherdpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
3138   void vgatherdps(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
3139   void evpgatherdd(XMMRegister dst, KRegister mask, Address src, int vector_len);
3140   void evpgatherdq(XMMRegister dst, KRegister mask, Address src, int vector_len);
3141   void evgatherdpd(XMMRegister dst, KRegister mask, Address src, int vector_len);
3142   void evgatherdps(XMMRegister dst, KRegister mask, Address src, int vector_len);
3143 
3144   // Scatter AVX3 only
3145   void evpscatterdd(Address dst, KRegister mask, XMMRegister src, int vector_len);
3146   void evpscatterdq(Address dst, KRegister mask, XMMRegister src, int vector_len);
3147   void evscatterdps(Address dst, KRegister mask, XMMRegister src, int vector_len);
3148   void evscatterdpd(Address dst, KRegister mask, XMMRegister src, int vector_len);
3149 
3150   // Carry-Less Multiplication Quadword
3151   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
3152   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
3153   void evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len);
3154   // AVX instruction which clears the upper 128 bits of YMM registers and
3155   // thereby avoids the AVX-SSE transition penalty. There is no penalty if
3156   // legacy SSE instructions are encoded using a VEX prefix because they
3157   // always clear the upper 128 bits. It should be used before calling
3158   // runtime code and native libraries.
3159   void vzeroupper();
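  // A minimal usage sketch (hypothetical call site; masm and native_entry are
  // illustrative names, not part of this header): emit vzeroupper before
  // leaving AVX-encoded code so the callee's legacy SSE instructions do not
  // pay the transition penalty.
  //
  //   masm->vzeroupper();                        // clear the upper YMM state
  //   masm->call(RuntimeAddress(native_entry));  // callee may use legacy SSE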
3160 
3161   void vzeroall();
3162 
3163   // Vector double compares
3164   void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
3165   void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
3166                ComparisonPredicateFP comparison, int vector_len);
3167 
3168   // Vector float compares
3169   void vcmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int comparison, int vector_len);
3170   void evcmpps(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
3171                ComparisonPredicateFP comparison, int vector_len);
3172 
3173   // Vector integer compares
3174   void vpcmpgtd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
3175   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
3176                int comparison, bool is_signed, int vector_len);
3177   void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
3178                int comparison, bool is_signed, int vector_len);
3179 
3180   // Vector long compares
3181   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
3182                int comparison, bool is_signed, int vector_len);
3183   void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
3184                int comparison, bool is_signed, int vector_len);
3185 
3186   // Vector byte compares
3187   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
3188                int comparison, bool is_signed, int vector_len);
3189   void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
3190                int comparison, bool is_signed, int vector_len);
3191 
3192   // Vector short compares
3193   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
3194                int comparison, bool is_signed, int vector_len);
3195   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
3196                int comparison, bool is_signed, int vector_len);
3197 
3198   void evpmovb2m(KRegister dst, XMMRegister src, int vector_len);
3199   void evpmovw2m(KRegister dst, XMMRegister src, int vector_len);
3200   void evpmovd2m(KRegister dst, XMMRegister src, int vector_len);
3201   void evpmovq2m(KRegister dst, XMMRegister src, int vector_len);
3202   void evpmovm2b(XMMRegister dst, KRegister src, int vector_len);
3203   void evpmovm2w(XMMRegister dst, KRegister src, int vector_len);
3204   void evpmovm2d(XMMRegister dst, KRegister src, int vector_len);
3205   void evpmovm2q(XMMRegister dst, KRegister src, int vector_len);
3206 
3207   // floating point class tests
3208   void vfpclassss(KRegister kdst, XMMRegister src, uint8_t imm8);
3209   void vfpclasssd(KRegister kdst, XMMRegister src, uint8_t imm8);
3210 
3211   // Vector blends
3212   void blendvps(XMMRegister dst, XMMRegister src);
3213   void blendvpd(XMMRegister dst, XMMRegister src);
3214   void pblendvb(XMMRegister dst, XMMRegister src);
3215   void blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
3216   void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len);
3217   void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
3218   void vpblendvb(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len);
3219   void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
3220   void evblendmpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
3221   void evblendmps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
3222   void evpblendmb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
3223   void evpblendmw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
3224   void evpblendmd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
3225   void evpblendmq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
3226 
3227   // Galois field affine transformation instructions.
3228   void gf2p8affineqb(XMMRegister dst, XMMRegister src, int imm8);
3229   void vgf2p8affineqb(XMMRegister dst, XMMRegister src2, XMMRegister src3, int imm8, int vector_len);
3230 
3231  protected:
3232   // The following instructions require a 16-byte aligned memory operand in SSE mode. They
3233   // should be called only from the corresponding MacroAssembler instructions (sketch below).
3234   void andpd(XMMRegister dst, Address src);
3235   void andps(XMMRegister dst, Address src);
3236   void xorpd(XMMRegister dst, Address src);
3237   void xorps(XMMRegister dst, Address src);
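  // A rough wrapper sketch (illustrative only; the real MacroAssembler code may
  // differ): the higher-level form typically takes an AddressLiteral that points
  // at a suitably aligned constant and handles reachability before delegating to
  // the raw instruction above.
  //
  //   void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  //     if (reachable(src)) {
  //       Assembler::andpd(dst, as_Address(src));
  //     } else {
  //       lea(rscratch, src);
  //       Assembler::andpd(dst, Address(rscratch, 0));
  //     }
  //   }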
3238 
3239 };
3240 
3241 // The Intel x86/AMD64 assembler attributes: all fields enclosed here guide encoding-level decisions.
3242 // The specific set functions are for specialized use; otherwise the defaults, or whatever was supplied
3243 // at object construction, are applied.
3244 class InstructionAttr {
3245 public:
3246   InstructionAttr(
3247     int vector_len,     // The length of vector to be applied in encoding - for both AVX and EVEX
3248     bool rex_vex_w,     // Width of data: false if 32 bits or less, true if 64-bit or specially defined
3249     bool legacy_mode,   // When true, the instruction is encoded with AVX (or earlier) encodings; otherwise EVEX may be chosen
3250     bool no_reg_mask,   // When true, k0 is used if EVEX encoding is chosen; otherwise embedded_opmask_register_specifier is used
3251     bool uses_vl)       // The instruction may have legacy constraints based on vector length for EVEX
3252     :
3253       _rex_vex_w(rex_vex_w),
3254       _legacy_mode(legacy_mode || UseAVX < 3),
3255       _no_reg_mask(no_reg_mask),
3256       _uses_vl(uses_vl),
3257       _rex_vex_w_reverted(false),
3258       _is_evex_instruction(false),
3259       _is_clear_context(true),
3260       _is_extended_context(false),
3261       _avx_vector_len(vector_len),
3262       _tuple_type(Assembler::EVEX_ETUP),
3263       _input_size_in_bits(Assembler::EVEX_NObit),
3264       _evex_encoding(0),
3265       _embedded_opmask_register_specifier(0), // hard code k0
3266       _current_assembler(nullptr) { }
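  // A minimal construction sketch (values are illustrative, not taken from a real
  // call site; mask and merge are assumed locals): build attributes for a 256-bit
  // operation with VEX.W = 0, EVEX permitted (legacy_mode = false), an embedded
  // opmask register (no_reg_mask = false), and vector-length-aware encoding; masked
  // EVEX emitters additionally install the opmask and, when merging, reset the
  // clear-context (zeroing) behaviour.
  //
  //   InstructionAttr attributes(AVX_256bit, /* rex_vex_w */ false,
  //                              /* legacy_mode */ false, /* no_reg_mask */ false,
  //                              /* uses_vl */ true);
  //   attributes.set_embedded_opmask_register_specifier(mask);
  //   if (merge) {
  //     attributes.reset_is_clear_context();
  //   }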
3267 
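  // On destruction, detach from the owning assembler so stale attributes are not
  // carried over to a subsequently emitted instruction.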
3268   ~InstructionAttr() {
3269     if (_current_assembler != nullptr) {
3270       _current_assembler->clear_attributes();
3271     }
3272   }
3273 
3274 private:
3275   bool _rex_vex_w;
3276   bool _legacy_mode;
3277   bool _no_reg_mask;
3278   bool _uses_vl;
3279   bool _rex_vex_w_reverted;
3280   bool _is_evex_instruction;
3281   bool _is_clear_context;
3282   bool _is_extended_context;
3283   int  _avx_vector_len;
3284   int  _tuple_type;
3285   int  _input_size_in_bits;
3286   int  _evex_encoding;
3287   int _embedded_opmask_register_specifier;
3288 
3289   Assembler *_current_assembler;
3290 
3291 public:
3292   // query functions for field accessors
3293   bool is_rex_vex_w(void) const { return _rex_vex_w; }
3294   bool is_legacy_mode(void) const { return _legacy_mode; }
3295   bool is_no_reg_mask(void) const { return _no_reg_mask; }
3296   bool uses_vl(void) const { return _uses_vl; }
3297   bool is_rex_vex_w_reverted(void) { return _rex_vex_w_reverted; }
3298   bool is_evex_instruction(void) const { return _is_evex_instruction; }
3299   bool is_clear_context(void) const { return _is_clear_context; }
3300   bool is_extended_context(void) const { return _is_extended_context; }
3301   int  get_vector_len(void) const { return _avx_vector_len; }
3302   int  get_tuple_type(void) const { return _tuple_type; }
3303   int  get_input_size(void) const { return _input_size_in_bits; }
3304   int  get_evex_encoding(void) const { return _evex_encoding; }
3305   int  get_embedded_opmask_register_specifier(void) const { return _embedded_opmask_register_specifier; }
3306 
3307   // Set the vector len manually
3308   void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }
3309 
3310   // Mark rex_vex_w as reverted for AVX encoding
3311   void set_rex_vex_w_reverted(void) { _rex_vex_w_reverted = true; }
3312 
3313   // Set rex_vex_w based on state
3314   void set_rex_vex_w(bool state) { _rex_vex_w = state; }
3315 
3316   // Set the instruction to be encoded in AVX mode
3317   void set_is_legacy_mode(void) { _legacy_mode = true; }
3318 
3319   // Set the current instruction to be encoded as an EVEX instruction
3320   void set_is_evex_instruction(void) { _is_evex_instruction = true; }
3321 
3322   // Internal encoding data used in compressed immediate offset programming
3323   void set_evex_encoding(int value) { _evex_encoding = value; }
3324 
3325   // When the EVEX.z field is set (true), it clears all XMM/YMM/ZMM lanes not selected by the opmask.
3326   // This method unsets it so that merge semantics are used instead.
3327   void reset_is_clear_context(void) { _is_clear_context = false; }
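  // For example (illustrative, assuming a dword operation under opmask 0b0101):
  // with zeroing (EVEX.z = 1) the unselected lanes of dst become 0, while with
  // merging (EVEX.z = 0) they keep their previous contents.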
3328 
3329   // Map back to current assembler so that we can manage object level association
3330   void set_current_assembler(Assembler *current_assembler) { _current_assembler = current_assembler; }
3331 
3332   // Address modifiers used for compressed displacement calculation
3333   void set_address_attributes(int tuple_type, int input_size_in_bits);
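  // (EVEX memory operands use a compressed 8-bit displacement that is implicitly
  // scaled by a factor derived from the tuple type and input size recorded here.)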
3334 
3335   // Set embedded opmask register specifier.
3336   void set_embedded_opmask_register_specifier(KRegister mask) {
3337     _embedded_opmask_register_specifier = mask->encoding() & 0x7;
3338   }
3339 
3340   void set_extended_context(void) { _is_extended_context = true; }
3341 };
3342 
3343 #endif // CPU_X86_ASSEMBLER_X86_HPP