1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
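  // Rough layout of the frame this prolog establishes (an illustrative sketch only;
  // exact offsets depend on the 'framesize' adjustments made below):
  //
  //   caller's return address
  //   saved rbp                                  <-- rbp, if PreserveFramePointer
  //   spill slots / outgoing argument area
  //   bottom of the new frame                    <-- rsp when the prolog is done
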
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack; the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
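//
// A minimal sketch (pseudo code, not emitted; stub names are illustrative) of how the
// ZF contract above is consumed, assuming C2 places the conditional branch right after
// the node:
//
//   fast_lock(obj, box, ...)       // ZF = 1 on success, ZF = 0 on failure
//   jne   slow_enter_stub          // branch emitted by C2 after cmpFastLock
//   ...                            // critical section
//   fast_unlock(obj, box, ...)     // same ZF protocol
//   jne   slow_exit_stub           // branch emitted by C2 after cmpFastUnlock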
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 281     jcc(Assembler::notZero, DONE_LABEL);
 282   }
 283 
 284   if (LockingMode == LM_MONITOR) {
 285     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 286     testptr(objReg, objReg);
 287   } else {
 288     assert(LockingMode == LM_LEGACY, "must be");
 289 
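    // Pseudo-code sketch (illustration only) of the legacy stack-lock fast path below:
    //   box->displaced_header = obj->mark | unlocked_value;
    //   if (CAS(obj->mark: unlocked mark -> box))  succeed;        // now stack-locked by us
    //   if (obj->mark is a stack address within a page of rsp) {   // already stack-locked by us
    //     box->displaced_header = 0;  succeed;                     // recursive enter
    //   }
    //   fail;                                                      // ZF = 0, go to slow path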
 290     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 291     testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 292     jcc(Assembler::notZero, IsInflated);
 293 
 294     // Attempt stack-locking ...
 295     orptr (tmpReg, markWord::unlocked_value);
 296     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 297     lock();
 298     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 299     jcc(Assembler::equal, COUNT);           // Success
 300 
 301     // Recursive locking.
 302     // The object is stack-locked: markword contains stack pointer to BasicLock.
 303     // Locked by current thread if difference with current SP is less than one page.
 304     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 306     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 307     movptr(Address(boxReg, 0), tmpReg);
 308   }
 309   jmp(DONE_LABEL);
 310 
 311   bind(IsInflated);
 312   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 313 
 314 #ifndef _LP64
 315   // The object is inflated.
 316 
 317   // boxReg refers to the on-stack BasicLock in the current frame.
 318   // We'd like to write:
 319   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 321   // additional latency as we have another ST in the store buffer that must drain.
 322 
 323   // avoid ST-before-CAS
 324   // register juggle because we need tmpReg for cmpxchgptr below
 325   movptr(scrReg, boxReg);
 326   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 327 
 328   // Optimistic form: consider XORL tmpReg,tmpReg
 329   movptr(tmpReg, NULL_WORD);
 330 
 331   // Appears unlocked - try to swing _owner from null to non-null.
 332   // Ideally, I'd manifest "Self" with get_thread and then attempt
 333   // to CAS the register containing thread id into m->Owner.
 334   // But we don't have enough registers, so instead we can either try to CAS
 335   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 336   // we later store thread id into m->Owner.  Transiently storing a stack address
 337   // (rsp or the address of the box) into  m->owner is harmless.
 338   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 339   lock();
 340   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 341   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 342   // If we weren't able to swing _owner from null to the BasicLock
 343   // then take the slow path.
 344   jccb  (Assembler::notZero, NO_COUNT);
 345   // update _owner from BasicLock to thread
 346   get_thread (scrReg);                    // beware: clobbers ICCs
 347   movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset()));
 348   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 349   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 350 
 351   // If the CAS fails we can either retry or pass control to the slow path.
 352   // We use the latter tactic.
 353   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 354   // If the CAS was successful ...
 355   //   Self has acquired the lock
 356   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 357   // Intentional fall-through into DONE_LABEL ...
 358 #else // _LP64
 359   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 360   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 361   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 362 
 363   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 364   movq(scrReg, tmpReg);
 365   xorq(tmpReg, tmpReg);
 366   movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
 367   lock();
 368   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 369 
 370   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 371   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 372 
 373   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 374   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 375   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 376   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 377 #endif // _LP64
 378   bind(DONE_LABEL);
 379 
 380   // ZFlag == 1 count in fast path
 381   // ZFlag == 0 count in slow path
 382   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 383 
 384   bind(COUNT);
 385   // Count monitors in fast path
 386   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 387 
 388   xorl(tmpReg, tmpReg); // Set ZF == 1
 389 
 390   bind(NO_COUNT);
 391 
 392   // At NO_COUNT the icc ZFlag is set as follows ...
 393   // fast_unlock uses the same protocol.
 394   // ZFlag == 1 -> Success
 395   // ZFlag == 0 -> Failure - force control through the slow path
 396 }
 397 
 398 // obj: object to unlock
 399 // box: box address (displaced header location), killed.  Must be EAX.
 400 // tmp: killed, cannot be obj nor box.
 401 //
 402 // Some commentary on balanced locking:
 403 //
 404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 405 // Methods that don't have provably balanced locking are forced to run in the
 406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 407 // The interpreter provides two properties:
 408 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 410 //      interpreter maintains an on-stack list of locks currently held by
 411 //      a frame.
 412 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 414 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 416 // B() doesn't have provably balanced locking so it runs in the interpreter.
 417 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 418 // is still locked by A().
 419 //
 420 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 421 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 422 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 423 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 427 // A perfectly viable alternative is to elide the owner check except when
 428 // Xcheck:jni is enabled.
 429 
 430 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) {
 431   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 432   assert(boxReg == rax, "");
 433   assert_different_registers(objReg, boxReg, tmpReg);
 434 
 435   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 436 
 437   if (LockingMode == LM_LEGACY) {
 438     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 439     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 440   }
 441   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 442   if (LockingMode != LM_MONITOR) {
 443     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 444     jcc(Assembler::zero, Stacked);
 445   }
 446 
 447   // It's inflated.
 448 
 449   // Despite our balanced locking property we still check that m->_owner == Self
 450   // as java routines or native JNI code called by this thread might
 451   // have released the lock.
 452   // Refer to the comments in synchronizer.cpp for how we might encode extra
 453   // state in _succ so we can avoid fetching EntryList|cxq.
 454   //
 455   // If there's no contention try a 1-0 exit.  That is, exit without
 456   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 457   // we detect and recover from the race that the 1-0 exit admits.
 458   //
 459   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 460   // before it STs null into _owner, releasing the lock.  Updates
 461   // to data protected by the critical section must be visible before
 462   // we drop the lock (and thus before any other thread could acquire
 463   // the lock and observe the fields protected by the lock).
 464   // IA32's memory-model is SPO, so STs are ordered with respect to
 465   // each other and there's no need for an explicit barrier (fence).
 466   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
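  //
  // Pseudo-code sketch (illustration only) of the inflated-unlock path emitted below:
  //   if (m->_recursions != 0) { m->_recursions--; succeed; }
  //   m->_owner = nullptr;                          // release the lock
  //   StoreLoad fence;                              // avoid stranding a successor
  //   if (m->_cxq == nullptr && m->_EntryList == nullptr)  succeed;
  //   if (m->_succ != nullptr)                              succeed;
  //   thread->_unlocked_inflated_monitor = m;       // slow path may try to reacquire
  //   fail;                                         // ZF = 0, go to slow path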
 467   Label LSuccess, LNotRecursive;
 468 
 469   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 470   jccb(Assembler::equal, LNotRecursive);
 471 
 472   // Recursive inflated unlock
 473   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 474   jmpb(LSuccess);
 475 
 476   bind(LNotRecursive);
 477 
 478   // Set owner to null.
 479   // Release to satisfy the JMM
 480   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 481   // We need a full fence after clearing owner to avoid stranding.
 482   // StoreLoad achieves this.
 483   membar(StoreLoad);
 484 
 485   // Check if the entry lists are empty.
 486   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 487   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 488   jccb(Assembler::zero, LSuccess);    // If so we are done.
 489 
 490   // Check if there is a successor.
 491   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 492   jccb(Assembler::notZero, LSuccess); // If so we are done.
 493 
 494   // Save the monitor pointer in the current thread, so we can try to
 495   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 496   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 497 #ifndef _LP64
 498   get_thread(boxReg);
 499   movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 500 #else // _LP64
 501   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 502 #endif
 503 
 504   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 505   jmpb  (DONE_LABEL);
 506 
 507   bind  (LSuccess);
 508   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 509   jmpb  (DONE_LABEL);
 510 
 511   if (LockingMode == LM_LEGACY) {
 512     bind  (Stacked);
 513     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 514     lock();
 515     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-through into DONE_LABEL
 517   }
 518 
 519   bind(DONE_LABEL);
 520 
 521   // ZFlag == 1 count in fast path
 522   // ZFlag == 0 count in slow path
 523   jccb(Assembler::notZero, NO_COUNT);
 524 
 525   bind(COUNT);
 526 
 527   if (LockingMode == LM_LEGACY) {
 528     // Count monitors in fast path
 529 #ifndef _LP64
 530     get_thread(tmpReg);
 531     decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 532 #else // _LP64
 533     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 534 #endif
 535   }
 536 
 537   xorl(tmpReg, tmpReg); // Set ZF == 1
 538 
 539   bind(NO_COUNT);
 540 }
 541 
 542 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 543                                               Register t, Register thread) {
 544   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 545   assert(rax_reg == rax, "Used for CAS");
 546   assert_different_registers(obj, box, rax_reg, t, thread);
 547 
 548   // Handle inflated monitor.
 549   Label inflated;
 550   // Finish fast lock successfully. ZF value is irrelevant.
 551   Label locked;
 552   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 553   Label slow_path;
 554 
 555   if (UseObjectMonitorTable) {
 556     // Clear cache in case fast locking succeeds.
 557     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 558   }
 559 
 560   if (DiagnoseSyncOnValueBasedClasses != 0) {
 561     load_klass(rax_reg, obj, t);
 562     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 563     jcc(Assembler::notZero, slow_path);
 564   }
 565 
 566   const Register mark = t;
 567 
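  // Pseudo-code sketch (illustration only) of the lightweight-lock fast path below:
  //   mark = obj->mark();
  //   if (mark is a monitor)          goto inflated;
  //   if (lock-stack is full)         goto slow_path;
  //   if (lock-stack top != obj &&                                 // not a recursive enter
  //       !CAS(obj->mark: lock bits 0b01 -> 0b00))  goto slow_path;
  //   lock-stack.push(obj);           // success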
 568   { // Lightweight Lock
 569 
 570     Label push;
 571 
 572     const Register top = UseObjectMonitorTable ? rax_reg : box;
 573 
 574     // Load the mark.
 575     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 576 
 577     // Prefetch top.
 578     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 579 
 580     // Check for monitor (0b10).
 581     testptr(mark, markWord::monitor_value);
 582     jcc(Assembler::notZero, inflated);
 583 
 584     // Check if lock-stack is full.
 585     cmpl(top, LockStack::end_offset() - 1);
 586     jcc(Assembler::greater, slow_path);
 587 
 588     // Check if recursive.
 589     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 590     jccb(Assembler::equal, push);
 591 
 592     // Try to lock. Transition lock bits 0b01 => 0b00
 593     movptr(rax_reg, mark);
 594     orptr(rax_reg, markWord::unlocked_value);
 595     andptr(mark, ~(int32_t)markWord::unlocked_value);
 596     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 597     jcc(Assembler::notEqual, slow_path);
 598 
 599     if (UseObjectMonitorTable) {
 600       // Need to reload top, clobbered by CAS.
 601       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 602     }
 603     bind(push);
 604     // After successful lock, push object on lock-stack.
 605     movptr(Address(thread, top), obj);
 606     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 607     jmpb(locked);
 608   }
 609 
 610   { // Handle inflated monitor.
 611     bind(inflated);
 612 
 613     const Register monitor = t;
 614 
 615     if (!UseObjectMonitorTable) {
 616       assert(mark == monitor, "should be the same here");
 617     } else {
 618       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 619       // Fetch ObjectMonitor* from the cache or take the slow-path.
 620       Label monitor_found;
 621 
 622       // Load cache address
 623       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 624 
 625       const int num_unrolled = 2;
 626       for (int i = 0; i < num_unrolled; i++) {
 627         cmpptr(obj, Address(t));
 628         jccb(Assembler::equal, monitor_found);
 629         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 630       }
 631 
 632       Label loop;
 633 
 634       // Search for obj in cache.
 635       bind(loop);
 636 
 637       // Check for match.
 638       cmpptr(obj, Address(t));
 639       jccb(Assembler::equal, monitor_found);
 640 
 641       // Search until null encountered, guaranteed _null_sentinel at end.
 642       cmpptr(Address(t), 1);
 643       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 644       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 645       jmpb(loop);
 646 
 647       // Cache hit.
 648       bind(monitor_found);
 649       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 650     }
 651     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 652     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 653     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 654 
 655     Label monitor_locked;
 656     // Lock the monitor.
 657 
 658     if (UseObjectMonitorTable) {
 659       // Cache the monitor for unlock before trashing box. On failure to acquire
 660       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 661       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 662     }
 663 
 664     // CAS owner (null => current thread).
 665     xorptr(rax_reg, rax_reg);
 666     movptr(box, Address(thread, JavaThread::lock_id_offset()));
 667     lock(); cmpxchgptr(box, owner_address);
 668     jccb(Assembler::equal, monitor_locked);
 669 
 670     // Check if recursive.
 671     cmpptr(box, rax_reg);
 672     jccb(Assembler::notEqual, slow_path);
 673 
 674     // Recursive.
 675     increment(recursions_address);
 676 
 677     bind(monitor_locked);
 678   }
 679 
 680   bind(locked);
 681   // Set ZF = 1
 682   xorl(rax_reg, rax_reg);
 683 
 684 #ifdef ASSERT
 685   // Check that locked label is reached with ZF set.
 686   Label zf_correct;
 687   Label zf_bad_zero;
 688   jcc(Assembler::zero, zf_correct);
 689   jmp(zf_bad_zero);
 690 #endif
 691 
 692   bind(slow_path);
 693 #ifdef ASSERT
 694   // Check that slow_path label is reached with ZF not set.
 695   jcc(Assembler::notZero, zf_correct);
 696   stop("Fast Lock ZF != 0");
 697   bind(zf_bad_zero);
 698   stop("Fast Lock ZF != 1");
 699   bind(zf_correct);
 700 #endif
 701   // C2 uses the value of ZF to determine the continuation.
 702 }
 703 
 704 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) {
 705   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 706   assert(reg_rax == rax, "Used for CAS");
 707   assert_different_registers(obj, reg_rax, t1, t2);
 708 
 709   // Handle inflated monitor.
 710   Label inflated, inflated_check_lock_stack;
 711   // Finish fast unlock successfully.  MUST jump with ZF == 1
 712   Label unlocked, slow_path;
 713 
 714   const Register mark = t1;
 715   const Register monitor = t1;
 716   const Register top = UseObjectMonitorTable ? t1 : reg_rax;
 717   const Register box = reg_rax;
 718 
 719   Label dummy;
 720   C2FastUnlockLightweightStub* stub = nullptr;
 721 
 722   if (!Compile::current()->output()->in_scratch_emit_size()) {
 723     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread);
 724     Compile::current()->output()->add_stub(stub);
 725   }
 726 
 727   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 728 
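  // Pseudo-code sketch (illustration only) of the lightweight-unlock fast path below:
  //   if (lock-stack top != obj)      goto inflated_check_lock_stack;
  //   lock-stack.pop();
  //   if (lock-stack top == obj)      goto unlocked;                // recursive exit
  //   if (!CAS(obj->mark: lock bits 0b00 -> 0b01))  goto push_and_slow_path;
  //   goto unlocked;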
 729   { // Lightweight Unlock
 730 
 731     // Load top.
 732     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 733 
 734     if (!UseObjectMonitorTable) {
 735       // Prefetch mark.
 736       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 737     }
 738 
 739     // Check if obj is top of lock-stack.
 740     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 741     // Top of lock stack was not obj. Must be monitor.
 742     jcc(Assembler::notEqual, inflated_check_lock_stack);
 743 
 744     // Pop lock-stack.
 745     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 746     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 747 
 748     // Check if recursive.
 749     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 750     jcc(Assembler::equal, unlocked);
 751 
 752     // We elide the monitor check, let the CAS fail instead.
 753 
 754     if (UseObjectMonitorTable) {
 755       // Load mark.
 756       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 757     }
 758 
 759     // Try to unlock. Transition lock bits 0b00 => 0b01
 760     movptr(reg_rax, mark);
 761     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 762     orptr(mark, markWord::unlocked_value);
 763     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 764     jcc(Assembler::notEqual, push_and_slow_path);
 765     jmp(unlocked);
 766   }
 767 
 768 
 769   { // Handle inflated monitor.
 770     bind(inflated_check_lock_stack);
 771 #ifdef ASSERT
 772     Label check_done;
 773     subl(top, oopSize);
 774     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 775     jcc(Assembler::below, check_done);
 776     cmpptr(obj, Address(thread, top));
 777     jccb(Assembler::notEqual, inflated_check_lock_stack);
 778     stop("Fast Unlock lock on stack");
 779     bind(check_done);
 780     if (UseObjectMonitorTable) {
 781       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 782     }
 783     testptr(mark, markWord::monitor_value);
 784     jccb(Assembler::notZero, inflated);
 785     stop("Fast Unlock not monitor");
 786 #endif
 787 
 788     bind(inflated);
 789 
 790     if (!UseObjectMonitorTable) {
 791       assert(mark == monitor, "should be the same here");
 792     } else {
 793       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 794       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 795       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 796       cmpptr(monitor, alignof(ObjectMonitor*));
 797       jcc(Assembler::below, slow_path);
 798     }
 799     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 800     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 801     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 802     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 803     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 804     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 805 
 806     Label recursive;
 807 
 808     // Check if recursive.
 809     cmpptr(recursions_address, 0);
 810     jccb(Assembler::notZero, recursive);
 811 
 812     // Set owner to null.
 813     // Release to satisfy the JMM
 814     movptr(owner_address, NULL_WORD);
 815     // We need a full fence after clearing owner to avoid stranding.
 816     // StoreLoad achieves this.
 817     membar(StoreLoad);
 818 
 819     // Check if the entry lists are empty.
 820     movptr(reg_rax, cxq_address);
 821     orptr(reg_rax, EntryList_address);
 822     jccb(Assembler::zero, unlocked);    // If so we are done.
 823 
 824     // Check if there is a successor.
 825     cmpptr(succ_address, NULL_WORD);
 826     jccb(Assembler::notZero, unlocked); // If so we are done.
 827 
 828     // Save the monitor pointer in the current thread, so we can try to
 829     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 830     if (!UseObjectMonitorTable) {
 831       andptr(monitor, ~(int32_t)markWord::monitor_value);
 832     }
 833     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 834 
 835     testl(monitor, monitor);            // Fast Unlock ZF = 0
 836     jmpb(slow_path);
 837 
 838     // Recursive unlock.
 839     bind(recursive);
 840     decrement(recursions_address);
 841   }
 842 
 843   bind(unlocked);
 844   xorl(t1, t1); // Fast Unlock ZF = 1
 845 
 846 #ifdef ASSERT
 847   // Check that unlocked label is reached with ZF set.
 848   Label zf_correct;
 849   jcc(Assembler::zero, zf_correct);
 850   stop("Fast Unlock ZF != 1");
 851 #endif
 852 
 853   bind(slow_path);
 854   if (stub != nullptr) {
 855     bind(stub->slow_path_continuation());
 856   }
 857 #ifdef ASSERT
 858   // Check that stub->continuation() label is reached with ZF not set.
 859   jccb(Assembler::notZero, zf_correct);
 860   stop("Fast Unlock ZF != 0");
 861   bind(zf_correct);
 862 #endif
 863   // C2 uses the value of ZF to determine the continuation.
 864 }
 865 
 866 //-------------------------------------------------------------------------------------------
 867 // Generic instructions support for use in .ad files C2 code generation
 868 
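// Vector abs/neg for floating point lanes: abs clears the sign bit by ANDing with a
// sign mask, negate flips it by XORing with a sign-flip mask (see StubRoutines::x86).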
 869 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 870   if (dst != src) {
 871     movdqu(dst, src);
 872   }
 873   if (opcode == Op_AbsVD) {
 874     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 875   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 877     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 878   }
 879 }
 880 
 881 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 882   if (opcode == Op_AbsVD) {
 883     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 884   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 886     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 887   }
 888 }
 889 
 890 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 891   if (dst != src) {
 892     movdqu(dst, src);
 893   }
 894   if (opcode == Op_AbsVF) {
 895     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 896   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 898     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 899   }
 900 }
 901 
 902 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 903   if (opcode == Op_AbsVF) {
 904     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 905   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 907     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 908   }
 909 }
 910 
 911 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 912   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 913   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 914 
 915   if (opcode == Op_MinV) {
 916     if (elem_bt == T_BYTE) {
 917       pminsb(dst, src);
 918     } else if (elem_bt == T_SHORT) {
 919       pminsw(dst, src);
 920     } else if (elem_bt == T_INT) {
 921       pminsd(dst, src);
 922     } else {
 923       assert(elem_bt == T_LONG, "required");
 924       assert(tmp == xmm0, "required");
 925       assert_different_registers(dst, src, tmp);
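      // No pminsq before AVX-512; select per lane with a compare mask in xmm0 instead:
      //   xmm0 = (dst > src);  dst = xmm0 ? src : dst;   // i.e. min(dst, src)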
 926       movdqu(xmm0, dst);
 927       pcmpgtq(xmm0, src);
 928       blendvpd(dst, src);  // xmm0 as mask
 929     }
 930   } else { // opcode == Op_MaxV
 931     if (elem_bt == T_BYTE) {
 932       pmaxsb(dst, src);
 933     } else if (elem_bt == T_SHORT) {
 934       pmaxsw(dst, src);
 935     } else if (elem_bt == T_INT) {
 936       pmaxsd(dst, src);
 937     } else {
 938       assert(elem_bt == T_LONG, "required");
 939       assert(tmp == xmm0, "required");
 940       assert_different_registers(dst, src, tmp);
 941       movdqu(xmm0, src);
 942       pcmpgtq(xmm0, dst);
 943       blendvpd(dst, src);  // xmm0 as mask
 944     }
 945   }
 946 }
 947 
 948 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 949                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 950                                  int vlen_enc) {
 951   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 952 
 953   if (opcode == Op_MinV) {
 954     if (elem_bt == T_BYTE) {
 955       vpminsb(dst, src1, src2, vlen_enc);
 956     } else if (elem_bt == T_SHORT) {
 957       vpminsw(dst, src1, src2, vlen_enc);
 958     } else if (elem_bt == T_INT) {
 959       vpminsd(dst, src1, src2, vlen_enc);
 960     } else {
 961       assert(elem_bt == T_LONG, "required");
 962       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 963         vpminsq(dst, src1, src2, vlen_enc);
 964       } else {
 965         assert_different_registers(dst, src1, src2);
 966         vpcmpgtq(dst, src1, src2, vlen_enc);
 967         vblendvpd(dst, src1, src2, dst, vlen_enc);
 968       }
 969     }
 970   } else { // opcode == Op_MaxV
 971     if (elem_bt == T_BYTE) {
 972       vpmaxsb(dst, src1, src2, vlen_enc);
 973     } else if (elem_bt == T_SHORT) {
 974       vpmaxsw(dst, src1, src2, vlen_enc);
 975     } else if (elem_bt == T_INT) {
 976       vpmaxsd(dst, src1, src2, vlen_enc);
 977     } else {
 978       assert(elem_bt == T_LONG, "required");
 979       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 980         vpmaxsq(dst, src1, src2, vlen_enc);
 981       } else {
 982         assert_different_registers(dst, src1, src2);
 983         vpcmpgtq(dst, src1, src2, vlen_enc);
 984         vblendvpd(dst, src2, src1, dst, vlen_enc);
 985       }
 986     }
 987   }
 988 }
 989 
 990 // Float/Double min max
 991 
 992 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 993                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 994                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 995                                    int vlen_enc) {
 996   assert(UseAVX > 0, "required");
 997   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 998          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 999   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1000   assert_different_registers(a, tmp, atmp, btmp);
1001   assert_different_registers(b, tmp, atmp, btmp);
1002 
1003   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1004   bool is_double_word = is_double_word_type(elem_bt);
1005 
1006   /* Note on 'non-obvious' assembly sequence:
1007    *
1008    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1009    * and Java on how they handle floats:
1010    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1012    *
1013    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1014    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1015    *                (only useful when signs differ, noop otherwise)
1016    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
1018    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1019    *   btmp = (b < +0.0) ? a : b
1020    *   atmp = (b < +0.0) ? b : a
1021    *   Tmp  = Max_Float(atmp , btmp)
1022    *   Res  = (atmp == NaN) ? atmp : Tmp
1023    */
1024 
1025   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1026   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1027   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1028   XMMRegister mask;
1029 
1030   if (!is_double_word && is_min) {
1031     mask = a;
1032     vblend = &MacroAssembler::vblendvps;
1033     vmaxmin = &MacroAssembler::vminps;
1034     vcmp = &MacroAssembler::vcmpps;
1035   } else if (!is_double_word && !is_min) {
1036     mask = b;
1037     vblend = &MacroAssembler::vblendvps;
1038     vmaxmin = &MacroAssembler::vmaxps;
1039     vcmp = &MacroAssembler::vcmpps;
1040   } else if (is_double_word && is_min) {
1041     mask = a;
1042     vblend = &MacroAssembler::vblendvpd;
1043     vmaxmin = &MacroAssembler::vminpd;
1044     vcmp = &MacroAssembler::vcmppd;
1045   } else {
1046     assert(is_double_word && !is_min, "sanity");
1047     mask = b;
1048     vblend = &MacroAssembler::vblendvpd;
1049     vmaxmin = &MacroAssembler::vmaxpd;
1050     vcmp = &MacroAssembler::vcmppd;
1051   }
1052 
  // Pick the temporaries so that the EnableX86ECoreOpts sequence still works when dst overlaps btmp
1054   XMMRegister maxmin, scratch;
1055   if (dst == btmp) {
1056     maxmin = btmp;
1057     scratch = tmp;
1058   } else {
1059     maxmin = tmp;
1060     scratch = btmp;
1061   }
1062 
1063   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1064   if (precompute_mask && !is_double_word) {
1065     vpsrad(tmp, mask, 32, vlen_enc);
1066     mask = tmp;
1067   } else if (precompute_mask && is_double_word) {
1068     vpxor(tmp, tmp, tmp, vlen_enc);
1069     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1070     mask = tmp;
1071   }
1072 
1073   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1074   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1075   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1076   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1077   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1078 }
1079 
1080 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1081                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1082                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1083                                     int vlen_enc) {
1084   assert(UseAVX > 2, "required");
1085   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1086          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1087   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1088   assert_different_registers(dst, a, atmp, btmp);
1089   assert_different_registers(dst, b, atmp, btmp);
1090 
1091   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1092   bool is_double_word = is_double_word_type(elem_bt);
1093   bool merge = true;
1094 
1095   if (!is_double_word && is_min) {
1096     evpmovd2m(ktmp, a, vlen_enc);
1097     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1098     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1099     vminps(dst, atmp, btmp, vlen_enc);
1100     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1101     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1102   } else if (!is_double_word && !is_min) {
1103     evpmovd2m(ktmp, b, vlen_enc);
1104     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1105     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1106     vmaxps(dst, atmp, btmp, vlen_enc);
1107     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1108     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1109   } else if (is_double_word && is_min) {
1110     evpmovq2m(ktmp, a, vlen_enc);
1111     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1112     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1113     vminpd(dst, atmp, btmp, vlen_enc);
1114     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1115     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1116   } else {
1117     assert(is_double_word && !is_min, "sanity");
1118     evpmovq2m(ktmp, b, vlen_enc);
1119     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1120     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1121     vmaxpd(dst, atmp, btmp, vlen_enc);
1122     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1123     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1124   }
1125 }
1126 
1127 // Float/Double signum
1128 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1129   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1130 
1131   Label DONE_LABEL;
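  // Sketch of the sequence below: if dst is +/-0.0 or NaN, leave it unchanged; otherwise
  // load 1.0 and flip its sign bit when dst < 0 (the ucomis* flags survive the load).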
1132 
1133   if (opcode == Op_SignumF) {
1134     assert(UseSSE > 0, "required");
1135     ucomiss(dst, zero);
1136     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1137     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1138     movflt(dst, one);
1139     jcc(Assembler::above, DONE_LABEL);
1140     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1141   } else if (opcode == Op_SignumD) {
1142     assert(UseSSE > 1, "required");
1143     ucomisd(dst, zero);
1144     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1145     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1146     movdbl(dst, one);
1147     jcc(Assembler::above, DONE_LABEL);
1148     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1149   }
1150 
1151   bind(DONE_LABEL);
1152 }
1153 
1154 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1155   if (sign) {
1156     pmovsxbw(dst, src);
1157   } else {
1158     pmovzxbw(dst, src);
1159   }
1160 }
1161 
1162 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1163   if (sign) {
1164     vpmovsxbw(dst, src, vector_len);
1165   } else {
1166     vpmovzxbw(dst, src, vector_len);
1167   }
1168 }
1169 
1170 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1171   if (sign) {
1172     vpmovsxbd(dst, src, vector_len);
1173   } else {
1174     vpmovzxbd(dst, src, vector_len);
1175   }
1176 }
1177 
1178 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1179   if (sign) {
1180     vpmovsxwd(dst, src, vector_len);
1181   } else {
1182     vpmovzxwd(dst, src, vector_len);
1183   }
1184 }
1185 
1186 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1187                                      int shift, int vector_len) {
1188   if (opcode == Op_RotateLeftV) {
1189     if (etype == T_INT) {
1190       evprold(dst, src, shift, vector_len);
1191     } else {
1192       assert(etype == T_LONG, "expected type T_LONG");
1193       evprolq(dst, src, shift, vector_len);
1194     }
1195   } else {
1196     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1197     if (etype == T_INT) {
1198       evprord(dst, src, shift, vector_len);
1199     } else {
1200       assert(etype == T_LONG, "expected type T_LONG");
1201       evprorq(dst, src, shift, vector_len);
1202     }
1203   }
1204 }
1205 
1206 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1207                                      XMMRegister shift, int vector_len) {
1208   if (opcode == Op_RotateLeftV) {
1209     if (etype == T_INT) {
1210       evprolvd(dst, src, shift, vector_len);
1211     } else {
1212       assert(etype == T_LONG, "expected type T_LONG");
1213       evprolvq(dst, src, shift, vector_len);
1214     }
1215   } else {
1216     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1217     if (etype == T_INT) {
1218       evprorvd(dst, src, shift, vector_len);
1219     } else {
1220       assert(etype == T_LONG, "expected type T_LONG");
1221       evprorvq(dst, src, shift, vector_len);
1222     }
1223   }
1224 }
1225 
1226 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1227   if (opcode == Op_RShiftVI) {
1228     psrad(dst, shift);
1229   } else if (opcode == Op_LShiftVI) {
1230     pslld(dst, shift);
1231   } else {
1232     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1233     psrld(dst, shift);
1234   }
1235 }
1236 
1237 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1238   switch (opcode) {
1239     case Op_RShiftVI:  psrad(dst, shift); break;
1240     case Op_LShiftVI:  pslld(dst, shift); break;
1241     case Op_URShiftVI: psrld(dst, shift); break;
1242 
1243     default: assert(false, "%s", NodeClassNames[opcode]);
1244   }
1245 }
1246 
1247 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1248   if (opcode == Op_RShiftVI) {
1249     vpsrad(dst, nds, shift, vector_len);
1250   } else if (opcode == Op_LShiftVI) {
1251     vpslld(dst, nds, shift, vector_len);
1252   } else {
1253     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1254     vpsrld(dst, nds, shift, vector_len);
1255   }
1256 }
1257 
1258 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1259   switch (opcode) {
1260     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1261     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1262     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1263 
1264     default: assert(false, "%s", NodeClassNames[opcode]);
1265   }
1266 }
1267 
1268 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1269   switch (opcode) {
1270     case Op_RShiftVB:  // fall-through
1271     case Op_RShiftVS:  psraw(dst, shift); break;
1272 
1273     case Op_LShiftVB:  // fall-through
1274     case Op_LShiftVS:  psllw(dst, shift);   break;
1275 
1276     case Op_URShiftVS: // fall-through
1277     case Op_URShiftVB: psrlw(dst, shift);  break;
1278 
1279     default: assert(false, "%s", NodeClassNames[opcode]);
1280   }
1281 }
1282 
1283 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1284   switch (opcode) {
1285     case Op_RShiftVB:  // fall-through
1286     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1287 
1288     case Op_LShiftVB:  // fall-through
1289     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1290 
1291     case Op_URShiftVS: // fall-through
1292     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1293 
1294     default: assert(false, "%s", NodeClassNames[opcode]);
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1299   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1301     case Op_LShiftVL:  psllq(dst, shift); break;
1302     case Op_URShiftVL: psrlq(dst, shift); break;
1303 
1304     default: assert(false, "%s", NodeClassNames[opcode]);
1305   }
1306 }
1307 
1308 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1309   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1311   } else if (opcode == Op_LShiftVL) {
1312     psllq(dst, shift);
1313   } else {
1314     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1315     psrlq(dst, shift);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1320   switch (opcode) {
1321     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1322     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1323     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1324 
1325     default: assert(false, "%s", NodeClassNames[opcode]);
1326   }
1327 }
1328 
1329 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1330   if (opcode == Op_RShiftVL) {
1331     evpsraq(dst, nds, shift, vector_len);
1332   } else if (opcode == Op_LShiftVL) {
1333     vpsllq(dst, nds, shift, vector_len);
1334   } else {
1335     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1336     vpsrlq(dst, nds, shift, vector_len);
1337   }
1338 }
1339 
1340 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1341   switch (opcode) {
1342     case Op_RShiftVB:  // fall-through
1343     case Op_RShiftVS:  // fall-through
1344     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1345 
1346     case Op_LShiftVB:  // fall-through
1347     case Op_LShiftVS:  // fall-through
1348     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1349 
1350     case Op_URShiftVB: // fall-through
1351     case Op_URShiftVS: // fall-through
1352     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1353 
1354     default: assert(false, "%s", NodeClassNames[opcode]);
1355   }
1356 }
1357 
1358 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1359   switch (opcode) {
1360     case Op_RShiftVB:  // fall-through
1361     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1362 
1363     case Op_LShiftVB:  // fall-through
1364     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1365 
1366     case Op_URShiftVB: // fall-through
1367     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1368 
1369     default: assert(false, "%s", NodeClassNames[opcode]);
1370   }
1371 }
1372 
1373 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1374   assert(UseAVX >= 2, "required");
1375   switch (opcode) {
1376     case Op_RShiftVL: {
1377       if (UseAVX > 2) {
1378         assert(tmp == xnoreg, "not used");
1379         if (!VM_Version::supports_avx512vl()) {
1380           vlen_enc = Assembler::AVX_512bit;
1381         }
1382         evpsravq(dst, src, shift, vlen_enc);
1383       } else {
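             // No AVX-512 VPSRAVQ available: emulate the 64-bit arithmetic right shift
             // with logical shifts. With m = (0x8000000000000000 >>> s) per lane,
             // sra(x, s) == ((x >>> s) ^ m) - m: the xor flips the shifted-down sign
             // bit and the subtract propagates it through the vacated upper bits.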
1384         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1385         vpsrlvq(dst, src, shift, vlen_enc);
1386         vpsrlvq(tmp, tmp, shift, vlen_enc);
1387         vpxor(dst, dst, tmp, vlen_enc);
1388         vpsubq(dst, dst, tmp, vlen_enc);
1389       }
1390       break;
1391     }
1392     case Op_LShiftVL: {
1393       assert(tmp == xnoreg, "not used");
1394       vpsllvq(dst, src, shift, vlen_enc);
1395       break;
1396     }
1397     case Op_URShiftVL: {
1398       assert(tmp == xnoreg, "not used");
1399       vpsrlvq(dst, src, shift, vlen_enc);
1400       break;
1401     }
1402     default: assert(false, "%s", NodeClassNames[opcode]);
1403   }
1404 }
1405 
1406 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1407 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1408   assert(opcode == Op_LShiftVB ||
1409          opcode == Op_RShiftVB ||
1410          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1411   bool sign = (opcode != Op_URShiftVB);
1412   assert(vector_len == 0, "required");
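       // Widen the byte operands and the shift counts to dwords (256-bit), perform a
       // variable dword shift, mask each result back to 8 bits, and pack down so the
       // byte-sized results land in the word lanes of dst.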
1413   vextendbd(sign, dst, src, 1);
1414   vpmovzxbd(vtmp, shift, 1);
1415   varshiftd(opcode, dst, dst, vtmp, 1);
1416   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1417   vextracti128_high(vtmp, dst);
1418   vpackusdw(dst, dst, vtmp, 0);
1419 }
1420 
1421 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1422 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1423   assert(opcode == Op_LShiftVB ||
1424          opcode == Op_RShiftVB ||
1425          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1426   bool sign = (opcode != Op_URShiftVB);
1427   int ext_vector_len = vector_len + 1;
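       // Widening bytes to words doubles the vector size, hence the larger vector-length
       // encoding for the shift. vpackuswb packs within 128-bit lanes, so the wider case
       // below needs the extra vpermq to restore lane order after the per-lane pack.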
1428   vextendbw(sign, dst, src, ext_vector_len);
1429   vpmovzxbw(vtmp, shift, ext_vector_len);
1430   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1431   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1432   if (vector_len == 0) {
1433     vextracti128_high(vtmp, dst);
1434     vpackuswb(dst, dst, vtmp, vector_len);
1435   } else {
1436     vextracti64x4_high(vtmp, dst);
1437     vpackuswb(dst, dst, vtmp, vector_len);
1438     vpermq(dst, dst, 0xD8, vector_len);
1439   }
1440 }
1441 
1442 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1443   switch(typ) {
1444     case T_BYTE:
1445       pinsrb(dst, val, idx);
1446       break;
1447     case T_SHORT:
1448       pinsrw(dst, val, idx);
1449       break;
1450     case T_INT:
1451       pinsrd(dst, val, idx);
1452       break;
1453     case T_LONG:
1454       pinsrq(dst, val, idx);
1455       break;
1456     default:
1457       assert(false,"Should not reach here.");
1458       break;
1459   }
1460 }
1461 
1462 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1463   switch(typ) {
1464     case T_BYTE:
1465       vpinsrb(dst, src, val, idx);
1466       break;
1467     case T_SHORT:
1468       vpinsrw(dst, src, val, idx);
1469       break;
1470     case T_INT:
1471       vpinsrd(dst, src, val, idx);
1472       break;
1473     case T_LONG:
1474       vpinsrq(dst, src, val, idx);
1475       break;
1476     default:
1477       assert(false,"Should not reach here.");
1478       break;
1479   }
1480 }
1481 
1482 #ifdef _LP64
1483 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1484                                                 XMMRegister dst, Register base,
1485                                                 Register idx_base,
1486                                                 Register offset, Register mask,
1487                                                 Register mask_idx, Register rtmp,
1488                                                 int vlen_enc) {
1489   vpxor(dst, dst, dst, vlen_enc);
1490   if (elem_bt == T_SHORT) {
1491     for (int i = 0; i < 4; i++) {
1492       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1493       Label skip_load;
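           // BT copies bit mask_idx of the mask into CF; a clear bit means the lane is
           // masked off, so skip the load and keep the zero written by vpxor above.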
1494       btq(mask, mask_idx);
1495       jccb(Assembler::carryClear, skip_load);
1496       movl(rtmp, Address(idx_base, i * 4));
1497       if (offset != noreg) {
1498         addl(rtmp, offset);
1499       }
1500       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1501       bind(skip_load);
1502       incq(mask_idx);
1503     }
1504   } else {
1505     assert(elem_bt == T_BYTE, "");
1506     for (int i = 0; i < 8; i++) {
1507       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1508       Label skip_load;
1509       btq(mask, mask_idx);
1510       jccb(Assembler::carryClear, skip_load);
1511       movl(rtmp, Address(idx_base, i * 4));
1512       if (offset != noreg) {
1513         addl(rtmp, offset);
1514       }
1515       pinsrb(dst, Address(base, rtmp), i);
1516       bind(skip_load);
1517       incq(mask_idx);
1518     }
1519   }
1520 }
1521 #endif // _LP64
1522 
1523 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1524                                          Register base, Register idx_base,
1525                                          Register offset, Register rtmp,
1526                                          int vlen_enc) {
1527   vpxor(dst, dst, dst, vlen_enc);
1528   if (elem_bt == T_SHORT) {
1529     for (int i = 0; i < 4; i++) {
1530       // dst[i] = src[offset + idx_base[i]]
1531       movl(rtmp, Address(idx_base, i * 4));
1532       if (offset != noreg) {
1533         addl(rtmp, offset);
1534       }
1535       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1536     }
1537   } else {
1538     assert(elem_bt == T_BYTE, "");
1539     for (int i = 0; i < 8; i++) {
1540       // dst[i] = src[offset + idx_base[i]]
1541       movl(rtmp, Address(idx_base, i * 4));
1542       if (offset != noreg) {
1543         addl(rtmp, offset);
1544       }
1545       pinsrb(dst, Address(base, rtmp), i);
1546     }
1547   }
1548 }
1549 
1550 /*
1551  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1552  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1553  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1554  * permutation that places the slice into the appropriate vector lane
1555  * locations of the destination vector. The following pseudo code describes
1556  * the algorithm in detail:
1557  *
1558  * DST_VEC = ZERO_VEC
1559  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1560  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1561  * FOREACH_ITER:
1562  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1563  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1564  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1565  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1566  *
1567  * With each iteration, the doubleword permute indices (0, 1) corresponding
1568  * to the gathered quadword are shifted right by two lane positions.
1569  *
1570  */
1571 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1572                                         Register base, Register idx_base,
1573                                         Register offset, Register mask,
1574                                         XMMRegister xtmp1, XMMRegister xtmp2,
1575                                         XMMRegister temp_dst, Register rtmp,
1576                                         Register mask_idx, Register length,
1577                                         int vector_len, int vlen_enc) {
1578   Label GATHER8_LOOP;
1579   assert(is_subword_type(elem_ty), "");
1580   movl(length, vector_len);
1581   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1582   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1583   vallones(xtmp2, vlen_enc);
1584   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1585   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1586   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1587 
1588   bind(GATHER8_LOOP);
1589     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1590     if (mask == noreg) {
1591       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1592     } else {
1593       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1594     }
1595     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
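         // vpermd crosses 128-bit lanes and has no 128-bit form, so use at least a
         // 256-bit encoding even when the destination vector is narrower.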
1596     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1597     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1598     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1599     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1600     vpor(dst, dst, temp_dst, vlen_enc);
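         // Advance past the int indices consumed this iteration: 8 ints (32 bytes) for
         // byte elements, 4 ints (16 bytes) for shorts, and decrement the remaining
         // element count by the same 8 or 4 elements.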
1601     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1602     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1603     jcc(Assembler::notEqual, GATHER8_LOOP);
1604 }
1605 
1606 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1607   switch(typ) {
1608     case T_INT:
1609       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1610       break;
1611     case T_FLOAT:
1612       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1613       break;
1614     case T_LONG:
1615       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1616       break;
1617     case T_DOUBLE:
1618       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1619       break;
1620     default:
1621       assert(false,"Should not reach here.");
1622       break;
1623   }
1624 }
1625 
1626 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1627   switch(typ) {
1628     case T_INT:
1629       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1630       break;
1631     case T_FLOAT:
1632       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1633       break;
1634     case T_LONG:
1635       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1636       break;
1637     case T_DOUBLE:
1638       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1639       break;
1640     default:
1641       assert(false,"Should not reach here.");
1642       break;
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1647   switch(typ) {
1648     case T_INT:
1649       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1650       break;
1651     case T_FLOAT:
1652       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1653       break;
1654     case T_LONG:
1655       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1656       break;
1657     case T_DOUBLE:
1658       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1659       break;
1660     default:
1661       assert(false,"Should not reach here.");
1662       break;
1663   }
1664 }
1665 
1666 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
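       // src is a vector of boolean bytes (0 or 1). Computing 0 - src turns each byte
       // into 0x00 or 0xFF; sign-extending to the element width then gives a mask lane
       // that is all zeros or all ones.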
1667   if (vlen_in_bytes <= 16) {
1668     pxor (dst, dst);
1669     psubb(dst, src);
1670     switch (elem_bt) {
1671       case T_BYTE:   /* nothing to do */ break;
1672       case T_SHORT:  pmovsxbw(dst, dst); break;
1673       case T_INT:    pmovsxbd(dst, dst); break;
1674       case T_FLOAT:  pmovsxbd(dst, dst); break;
1675       case T_LONG:   pmovsxbq(dst, dst); break;
1676       case T_DOUBLE: pmovsxbq(dst, dst); break;
1677 
1678       default: assert(false, "%s", type2name(elem_bt));
1679     }
1680   } else {
1681     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1682     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1683 
1684     vpxor (dst, dst, dst, vlen_enc);
1685     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1686 
1687     switch (elem_bt) {
1688       case T_BYTE:   /* nothing to do */            break;
1689       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1690       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1691       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1692       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1693       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1694 
1695       default: assert(false, "%s", type2name(elem_bt));
1696     }
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
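       // Without AVX512VL/BW/DQ (novlbwdq), widen the 0/1 mask bytes to dwords and
       // compare them against the mask-bit pattern to set the k-bits; otherwise negate
       // to 0x00/0xFF bytes and let evpmovb2m collect the sign bits directly.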
1701   if (novlbwdq) {
1702     vpmovsxbd(xtmp, src, vlen_enc);
1703     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1704             Assembler::eq, true, vlen_enc, noreg);
1705   } else {
1706     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1707     vpsubb(xtmp, xtmp, src, vlen_enc);
1708     evpmovb2m(dst, xtmp, vlen_enc);
1709   }
1710 }
1711 
1712 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1713   switch (vlen_in_bytes) {
1714     case 4:  movdl(dst, src);   break;
1715     case 8:  movq(dst, src);    break;
1716     case 16: movdqu(dst, src);  break;
1717     case 32: vmovdqu(dst, src); break;
1718     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1719     default: ShouldNotReachHere();
1720   }
1721 }
1722 
1723 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1724   assert(rscratch != noreg || always_reachable(src), "missing");
1725 
1726   if (reachable(src)) {
1727     load_vector(dst, as_Address(src), vlen_in_bytes);
1728   } else {
1729     lea(rscratch, src);
1730     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1731   }
1732 }
1733 
1734 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1735   int vlen_enc = vector_length_encoding(vlen);
1736   if (VM_Version::supports_avx()) {
1737     if (bt == T_LONG) {
1738       if (VM_Version::supports_avx2()) {
1739         vpbroadcastq(dst, src, vlen_enc);
1740       } else {
1741         vmovddup(dst, src, vlen_enc);
1742       }
1743     } else if (bt == T_DOUBLE) {
1744       if (vlen_enc != Assembler::AVX_128bit) {
1745         vbroadcastsd(dst, src, vlen_enc, noreg);
1746       } else {
1747         vmovddup(dst, src, vlen_enc);
1748       }
1749     } else {
1750       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1751         vpbroadcastd(dst, src, vlen_enc);
1752       } else {
1753         vbroadcastss(dst, src, vlen_enc);
1754       }
1755     }
1756   } else if (VM_Version::supports_sse3()) {
1757     movddup(dst, src);
1758   } else {
1759     movq(dst, src);
1760     if (vlen == 16) {
1761       punpcklqdq(dst, dst);
1762     }
1763   }
1764 }
1765 
1766 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1767   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1768   int offset = exact_log2(type2aelembytes(bt)) << 6;
1769   if (is_floating_point_type(bt)) {
1770     offset += 128;
1771   }
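       // e.g. T_FLOAT: exact_log2(4) << 6 = 128, plus 128 for floating point = 256,
       // i.e. the fifth 64-byte table (B=0, S=64, I=128, L=192, F=256, D=320).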
1772   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1773   load_vector(dst, addr, vlen_in_bytes);
1774 }
1775 
1776 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1777 
1778 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1779   int vector_len = Assembler::AVX_128bit;
1780 
1781   switch (opcode) {
1782     case Op_AndReductionV:  pand(dst, src); break;
1783     case Op_OrReductionV:   por (dst, src); break;
1784     case Op_XorReductionV:  pxor(dst, src); break;
1785     case Op_MinReductionV:
1786       switch (typ) {
1787         case T_BYTE:        pminsb(dst, src); break;
1788         case T_SHORT:       pminsw(dst, src); break;
1789         case T_INT:         pminsd(dst, src); break;
1790         case T_LONG:        assert(UseAVX > 2, "required");
1791                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1792         default:            assert(false, "wrong type");
1793       }
1794       break;
1795     case Op_MaxReductionV:
1796       switch (typ) {
1797         case T_BYTE:        pmaxsb(dst, src); break;
1798         case T_SHORT:       pmaxsw(dst, src); break;
1799         case T_INT:         pmaxsd(dst, src); break;
1800         case T_LONG:        assert(UseAVX > 2, "required");
1801                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1802         default:            assert(false, "wrong type");
1803       }
1804       break;
1805     case Op_AddReductionVF: addss(dst, src); break;
1806     case Op_AddReductionVD: addsd(dst, src); break;
1807     case Op_AddReductionVI:
1808       switch (typ) {
1809         case T_BYTE:        paddb(dst, src); break;
1810         case T_SHORT:       paddw(dst, src); break;
1811         case T_INT:         paddd(dst, src); break;
1812         default:            assert(false, "wrong type");
1813       }
1814       break;
1815     case Op_AddReductionVL: paddq(dst, src); break;
1816     case Op_MulReductionVF: mulss(dst, src); break;
1817     case Op_MulReductionVD: mulsd(dst, src); break;
1818     case Op_MulReductionVI:
1819       switch (typ) {
1820         case T_SHORT:       pmullw(dst, src); break;
1821         case T_INT:         pmulld(dst, src); break;
1822         default:            assert(false, "wrong type");
1823       }
1824       break;
1825     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1826                             evpmullq(dst, dst, src, vector_len); break;
1827     default:                assert(false, "wrong opcode");
1828   }
1829 }
1830 
1831 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1832   switch (opcode) {
1833     case Op_AddReductionVF: addps(dst, src); break;
1834     case Op_AddReductionVD: addpd(dst, src); break;
1835     case Op_MulReductionVF: mulps(dst, src); break;
1836     case Op_MulReductionVD: mulpd(dst, src); break;
1837     default:                assert(false, "%s", NodeClassNames[opcode]);
1838   }
1839 }
1840 
1841 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1842   int vector_len = Assembler::AVX_256bit;
1843 
1844   switch (opcode) {
1845     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1846     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1847     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1848     case Op_MinReductionV:
1849       switch (typ) {
1850         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1851         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1852         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1853         case T_LONG:        assert(UseAVX > 2, "required");
1854                             vpminsq(dst, src1, src2, vector_len); break;
1855         default:            assert(false, "wrong type");
1856       }
1857       break;
1858     case Op_MaxReductionV:
1859       switch (typ) {
1860         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1861         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1862         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1863         case T_LONG:        assert(UseAVX > 2, "required");
1864                             vpmaxsq(dst, src1, src2, vector_len); break;
1865         default:            assert(false, "wrong type");
1866       }
1867       break;
1868     case Op_AddReductionVI:
1869       switch (typ) {
1870         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1871         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1872         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1873         default:            assert(false, "wrong type");
1874       }
1875       break;
1876     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1877     case Op_MulReductionVI:
1878       switch (typ) {
1879         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1880         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1881         default:            assert(false, "wrong type");
1882       }
1883       break;
1884     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1885     default:                assert(false, "wrong opcode");
1886   }
1887 }
1888 
1889 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1890   int vector_len = Assembler::AVX_256bit;
1891 
1892   switch (opcode) {
1893     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1894     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1895     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1896     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1897     default:                assert(false, "%s", NodeClassNames[opcode]);
1898   }
1899 }
1900 
1901 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1902                                   XMMRegister dst, XMMRegister src,
1903                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1904   switch (opcode) {
1905     case Op_AddReductionVF:
1906     case Op_MulReductionVF:
1907       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1908       break;
1909 
1910     case Op_AddReductionVD:
1911     case Op_MulReductionVD:
1912       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1913       break;
1914 
1915     default: assert(false, "wrong opcode");
1916   }
1917 }
1918 
1919 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1920                                             XMMRegister dst, XMMRegister src,
1921                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1922   switch (opcode) {
1923     case Op_AddReductionVF:
1924     case Op_MulReductionVF:
1925       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1926       break;
1927 
1928     case Op_AddReductionVD:
1929     case Op_MulReductionVD:
1930       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1931       break;
1932 
1933     default: assert(false, "%s", NodeClassNames[opcode]);
1934   }
1935 }
1936 
1937 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1938                              Register dst, Register src1, XMMRegister src2,
1939                              XMMRegister vtmp1, XMMRegister vtmp2) {
1940   switch (vlen) {
1941     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1942     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1943     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1944     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1945 
1946     default: assert(false, "wrong vector length");
1947   }
1948 }
1949 
1950 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1951                              Register dst, Register src1, XMMRegister src2,
1952                              XMMRegister vtmp1, XMMRegister vtmp2) {
1953   switch (vlen) {
1954     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1955     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1956     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958 
1959     default: assert(false, "wrong vector length");
1960   }
1961 }
1962 
1963 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1964                              Register dst, Register src1, XMMRegister src2,
1965                              XMMRegister vtmp1, XMMRegister vtmp2) {
1966   switch (vlen) {
1967     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1968     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1969     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1970     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1971 
1972     default: assert(false, "wrong vector length");
1973   }
1974 }
1975 
1976 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1977                              Register dst, Register src1, XMMRegister src2,
1978                              XMMRegister vtmp1, XMMRegister vtmp2) {
1979   switch (vlen) {
1980     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1981     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1982     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1983     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1984 
1985     default: assert(false, "wrong vector length");
1986   }
1987 }
1988 
1989 #ifdef _LP64
1990 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1991                              Register dst, Register src1, XMMRegister src2,
1992                              XMMRegister vtmp1, XMMRegister vtmp2) {
1993   switch (vlen) {
1994     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1995     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1996     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1997 
1998     default: assert(false, "wrong vector length");
1999   }
2000 }
2001 #endif // _LP64
2002 
2003 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2004   switch (vlen) {
2005     case 2:
2006       assert(vtmp2 == xnoreg, "");
2007       reduce2F(opcode, dst, src, vtmp1);
2008       break;
2009     case 4:
2010       assert(vtmp2 == xnoreg, "");
2011       reduce4F(opcode, dst, src, vtmp1);
2012       break;
2013     case 8:
2014       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2015       break;
2016     case 16:
2017       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2018       break;
2019     default: assert(false, "wrong vector length");
2020   }
2021 }
2022 
2023 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2024   switch (vlen) {
2025     case 2:
2026       assert(vtmp2 == xnoreg, "");
2027       reduce2D(opcode, dst, src, vtmp1);
2028       break;
2029     case 4:
2030       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2031       break;
2032     case 8:
2033       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2034       break;
2035     default: assert(false, "wrong vector length");
2036   }
2037 }
2038 
2039 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2040   switch (vlen) {
2041     case 2:
2042       assert(vtmp1 == xnoreg, "");
2043       assert(vtmp2 == xnoreg, "");
2044       unorderedReduce2F(opcode, dst, src);
2045       break;
2046     case 4:
2047       assert(vtmp2 == xnoreg, "");
2048       unorderedReduce4F(opcode, dst, src, vtmp1);
2049       break;
2050     case 8:
2051       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2052       break;
2053     case 16:
2054       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2055       break;
2056     default: assert(false, "wrong vector length");
2057   }
2058 }
2059 
2060 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2061   switch (vlen) {
2062     case 2:
2063       assert(vtmp1 == xnoreg, "");
2064       assert(vtmp2 == xnoreg, "");
2065       unorderedReduce2D(opcode, dst, src);
2066       break;
2067     case 4:
2068       assert(vtmp2 == xnoreg, "");
2069       unorderedReduce4D(opcode, dst, src, vtmp1);
2070       break;
2071     case 8:
2072       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2073       break;
2074     default: assert(false, "wrong vector length");
2075   }
2076 }
2077 
2078 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
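       // For Op_AddReductionVI fold the two lanes with a horizontal add; otherwise
       // shuffle lane 1 down and combine pairwise. The scalar input src1 is folded in
       // last, and the result is moved to the GPR dst.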
2079   if (opcode == Op_AddReductionVI) {
2080     if (vtmp1 != src2) {
2081       movdqu(vtmp1, src2);
2082     }
2083     phaddd(vtmp1, vtmp1);
2084   } else {
2085     pshufd(vtmp1, src2, 0x1);
2086     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2087   }
2088   movdl(vtmp2, src1);
2089   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2090   movdl(dst, vtmp1);
2091 }
2092 
2093 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2094   if (opcode == Op_AddReductionVI) {
2095     if (vtmp1 != src2) {
2096       movdqu(vtmp1, src2);
2097     }
2098     phaddd(vtmp1, src2);
2099     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2100   } else {
2101     pshufd(vtmp2, src2, 0xE);
2102     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2103     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2104   }
2105 }
2106 
2107 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2108   if (opcode == Op_AddReductionVI) {
2109     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2110     vextracti128_high(vtmp2, vtmp1);
2111     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2112     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2113   } else {
2114     vextracti128_high(vtmp1, src2);
2115     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2116     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2117   }
2118 }
2119 
2120 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2121   vextracti64x4_high(vtmp2, src2);
2122   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2123   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2124 }
2125 
2126 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2127   pshufd(vtmp2, src2, 0x1);
2128   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2129   movdqu(vtmp1, vtmp2);
2130   psrldq(vtmp1, 2);
2131   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2132   movdqu(vtmp2, vtmp1);
2133   psrldq(vtmp2, 1);
2134   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2135   movdl(vtmp2, src1);
2136   pmovsxbd(vtmp1, vtmp1);
2137   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2138   pextrb(dst, vtmp1, 0x0);
2139   movsbl(dst, dst);
2140 }
2141 
2142 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2143   pshufd(vtmp1, src2, 0xE);
2144   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2145   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2146 }
2147 
2148 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2149   vextracti128_high(vtmp2, src2);
2150   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2151   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2152 }
2153 
2154 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2155   vextracti64x4_high(vtmp1, src2);
2156   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2157   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2158 }
2159 
2160 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2161   pmovsxbw(vtmp2, src2);
2162   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2163 }
2164 
2165 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   if (UseAVX > 1) {
2167     int vector_len = Assembler::AVX_256bit;
2168     vpmovsxbw(vtmp1, src2, vector_len);
2169     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2170   } else {
2171     pmovsxbw(vtmp2, src2);
2172     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2173     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 down
2174     pmovsxbw(vtmp2, vtmp2);     // and sign-extend them to words
2175     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2176   }
2177 }
2178 
2179 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2180   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2181     int vector_len = Assembler::AVX_512bit;
2182     vpmovsxbw(vtmp1, src2, vector_len);
2183     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2184   } else {
2185     assert(UseAVX >= 2,"Should not reach here.");
2186     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2187     vextracti128_high(vtmp2, src2);
2188     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2189   }
2190 }
2191 
2192 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2193   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2194   vextracti64x4_high(vtmp2, src2);
2195   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2196 }
2197 
2198 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2199   if (opcode == Op_AddReductionVI) {
2200     if (vtmp1 != src2) {
2201       movdqu(vtmp1, src2);
2202     }
2203     phaddw(vtmp1, vtmp1);
2204     phaddw(vtmp1, vtmp1);
2205   } else {
2206     pshufd(vtmp2, src2, 0x1);
2207     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2208     movdqu(vtmp1, vtmp2);
2209     psrldq(vtmp1, 2);
2210     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2211   }
2212   movdl(vtmp2, src1);
2213   pmovsxwd(vtmp1, vtmp1);
2214   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2215   pextrw(dst, vtmp1, 0x0);
2216   movswl(dst, dst);
2217 }
2218 
2219 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2220   if (opcode == Op_AddReductionVI) {
2221     if (vtmp1 != src2) {
2222       movdqu(vtmp1, src2);
2223     }
2224     phaddw(vtmp1, src2);
2225   } else {
2226     pshufd(vtmp1, src2, 0xE);
2227     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2228   }
2229   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2230 }
2231 
2232 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2233   if (opcode == Op_AddReductionVI) {
2234     int vector_len = Assembler::AVX_256bit;
2235     vphaddw(vtmp2, src2, src2, vector_len);
2236     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2237   } else {
2238     vextracti128_high(vtmp2, src2);
2239     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2240   }
2241   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2242 }
2243 
2244 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2245   int vector_len = Assembler::AVX_256bit;
2246   vextracti64x4_high(vtmp1, src2);
2247   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2248   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2249 }
2250 
2251 #ifdef _LP64
2252 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2253   pshufd(vtmp2, src2, 0xE);
2254   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2255   movdq(vtmp1, src1);
2256   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2257   movdq(dst, vtmp1);
2258 }
2259 
2260 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2261   vextracti128_high(vtmp1, src2);
2262   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2263   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2264 }
2265 
2266 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2267   vextracti64x4_high(vtmp2, src2);
2268   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2269   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2270 }
2271 
2272 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
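       // Build a mask with the low 'len' bits set: start from all ones, zero the bits
       // at positions >= len (BZHI), then move the result into the k-register.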
2273   mov64(temp, -1L);
2274   bzhiq(temp, temp, len);
2275   kmovql(dst, temp);
2276 }
2277 #endif // _LP64
2278 
2279 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2280   reduce_operation_128(T_FLOAT, opcode, dst, src);
2281   pshufd(vtmp, src, 0x1);
2282   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2283 }
2284 
2285 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2286   reduce2F(opcode, dst, src, vtmp);
2287   pshufd(vtmp, src, 0x2);
2288   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2289   pshufd(vtmp, src, 0x3);
2290   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2291 }
2292 
2293 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2294   reduce4F(opcode, dst, src, vtmp2);
2295   vextractf128_high(vtmp2, src);
2296   reduce4F(opcode, dst, vtmp2, vtmp1);
2297 }
2298 
2299 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2300   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2301   vextracti64x4_high(vtmp1, src);
2302   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2303 }
2304 
2305 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2306   pshufd(dst, src, 0x1);
2307   reduce_operation_128(T_FLOAT, opcode, dst, src);
2308 }
2309 
2310 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2311   pshufd(vtmp, src, 0xE);
2312   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2313   unorderedReduce2F(opcode, dst, vtmp);
2314 }
2315 
2316 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2317   vextractf128_high(vtmp1, src);
2318   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2319   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2320 }
2321 
2322 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2323   vextractf64x4_high(vtmp2, src);
2324   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2325   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2326 }
2327 
2328 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2329   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2330   pshufd(vtmp, src, 0xE);
2331   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2332 }
2333 
2334 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2335   reduce2D(opcode, dst, src, vtmp2);
2336   vextractf128_high(vtmp2, src);
2337   reduce2D(opcode, dst, vtmp2, vtmp1);
2338 }
2339 
2340 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2341   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2342   vextracti64x4_high(vtmp1, src);
2343   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2344 }
2345 
2346 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2347   pshufd(dst, src, 0xE);
2348   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2349 }
2350 
2351 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2352   vextractf128_high(vtmp, src);
2353   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2354   unorderedReduce2D(opcode, dst, vtmp);
2355 }
2356 
2357 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2358   vextractf64x4_high(vtmp2, src);
2359   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2360   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2361 }
2362 
2363 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2364   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2365 }
2366 
2367 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2368   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2369 }
2370 
2371 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2372                                  int vec_enc) {
2373   switch(elem_bt) {
2374     case T_INT:
2375     case T_FLOAT:
2376       vmaskmovps(dst, src, mask, vec_enc);
2377       break;
2378     case T_LONG:
2379     case T_DOUBLE:
2380       vmaskmovpd(dst, src, mask, vec_enc);
2381       break;
2382     default:
2383       fatal("Unsupported type %s", type2name(elem_bt));
2384       break;
2385   }
2386 }
2387 
2388 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2389                                  int vec_enc) {
2390   switch(elem_bt) {
2391     case T_INT:
2392     case T_FLOAT:
2393       vmaskmovps(dst, src, mask, vec_enc);
2394       break;
2395     case T_LONG:
2396     case T_DOUBLE:
2397       vmaskmovpd(dst, src, mask, vec_enc);
2398       break;
2399     default:
2400       fatal("Unsupported type %s", type2name(elem_bt));
2401       break;
2402   }
2403 }
2404 
2405 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2406                                           XMMRegister dst, XMMRegister src,
2407                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2408                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2409   const int permconst[] = {1, 14};
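       // vpermilps immediates for the in-lane steps: permconst[1] = 14 selects
       // {2, 3, 0, 0}, pulling the upper float pair down; permconst[0] = 1 selects
       // {1, 0, 0, 0} for the final two-element step.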
2410   XMMRegister wsrc = src;
2411   XMMRegister wdst = xmm_0;
2412   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2413 
2414   int vlen_enc = Assembler::AVX_128bit;
2415   if (vlen == 16) {
2416     vlen_enc = Assembler::AVX_256bit;
2417   }
2418 
2419   for (int i = log2(vlen) - 1; i >=0; i--) {
2420     if (i == 0 && !is_dst_valid) {
2421       wdst = dst;
2422     }
2423     if (i == 3) {
2424       vextracti64x4_high(wtmp, wsrc);
2425     } else if (i == 2) {
2426       vextracti128_high(wtmp, wsrc);
2427     } else { // i = [0,1]
2428       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2429     }
2430     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2431     wsrc = wdst;
2432     vlen_enc = Assembler::AVX_128bit;
2433   }
2434   if (is_dst_valid) {
2435     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2436   }
2437 }
2438 
2439 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2440                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2441                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2442   XMMRegister wsrc = src;
2443   XMMRegister wdst = xmm_0;
2444   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2445   int vlen_enc = Assembler::AVX_128bit;
2446   if (vlen == 8) {
2447     vlen_enc = Assembler::AVX_256bit;
2448   }
2449   for (int i = log2(vlen) - 1; i >=0; i--) {
2450     if (i == 0 && !is_dst_valid) {
2451       wdst = dst;
2452     }
2453     if (i == 1) {
2454       vextracti128_high(wtmp, wsrc);
2455     } else if (i == 2) {
2456       vextracti64x4_high(wtmp, wsrc);
2457     } else {
2458       assert(i == 0, "%d", i);
2459       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2460     }
2461     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2462     wsrc = wdst;
2463     vlen_enc = Assembler::AVX_128bit;
2464   }
2465   if (is_dst_valid) {
2466     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2467   }
2468 }
2469 
2470 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2471   switch (bt) {
2472     case T_BYTE:  pextrb(dst, src, idx); break;
2473     case T_SHORT: pextrw(dst, src, idx); break;
2474     case T_INT:   pextrd(dst, src, idx); break;
2475     case T_LONG:  pextrq(dst, src, idx); break;
2476 
2477     default:
2478       assert(false,"Should not reach here.");
2479       break;
2480   }
2481 }
2482 
2483 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2484   int esize =  type2aelembytes(typ);
2485   int elem_per_lane = 16/esize;
2486   int lane = elemindex / elem_per_lane;
2487   int eindex = elemindex % elem_per_lane;
2488 
2489   if (lane >= 2) {
2490     assert(UseAVX > 2, "required");
2491     vextractf32x4(dst, src, lane & 3);
2492     return dst;
2493   } else if (lane > 0) {
2494     assert(UseAVX > 0, "required");
2495     vextractf128(dst, src, lane);
2496     return dst;
2497   } else {
2498     return src;
2499   }
2500 }
2501 
2502 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2503   if (typ == T_BYTE) {
2504     movsbl(dst, dst);
2505   } else if (typ == T_SHORT) {
2506     movswl(dst, dst);
2507   }
2508 }
2509 
2510 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2511   int esize =  type2aelembytes(typ);
2512   int elem_per_lane = 16/esize;
2513   int eindex = elemindex % elem_per_lane;
2514   assert(is_integral_type(typ),"required");
2515 
2516   if (eindex == 0) {
2517     if (typ == T_LONG) {
2518       movq(dst, src);
2519     } else {
2520       movdl(dst, src);
2521       movsxl(typ, dst);
2522     }
2523   } else {
2524     extract(typ, dst, src, eindex);
2525     movsxl(typ, dst);
2526   }
2527 }
2528 
2529 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2530   int esize =  type2aelembytes(typ);
2531   int elem_per_lane = 16/esize;
2532   int eindex = elemindex % elem_per_lane;
2533   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2534 
2535   if (eindex == 0) {
2536     movq(dst, src);
2537   } else {
2538     if (typ == T_FLOAT) {
2539       if (UseAVX == 0) {
2540         movdqu(dst, src);
2541         shufps(dst, dst, eindex);
2542       } else {
2543         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2544       }
2545     } else {
2546       if (UseAVX == 0) {
2547         movdqu(dst, src);
2548         psrldq(dst, eindex*esize);
2549       } else {
2550         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2551       }
2552       movq(dst, dst);
2553     }
2554   }
2555   // Zero upper bits
2556   if (typ == T_FLOAT) {
2557     if (UseAVX == 0) {
2558       assert(vtmp != xnoreg, "required.");
2559       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2560       pand(dst, vtmp);
2561     } else {
2562       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2563     }
2564   }
2565 }
2566 
2567 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2568   switch(typ) {
2569     case T_BYTE:
2570     case T_BOOLEAN:
2571       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2572       break;
2573     case T_SHORT:
2574     case T_CHAR:
2575       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2576       break;
2577     case T_INT:
2578     case T_FLOAT:
2579       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2580       break;
2581     case T_LONG:
2582     case T_DOUBLE:
2583       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2584       break;
2585     default:
2586       assert(false,"Should not reach here.");
2587       break;
2588   }
2589 }
2590 
2591 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2592   assert(rscratch != noreg || always_reachable(src2), "missing");
2593 
2594   switch(typ) {
2595     case T_BOOLEAN:
2596     case T_BYTE:
2597       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2598       break;
2599     case T_CHAR:
2600     case T_SHORT:
2601       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2602       break;
2603     case T_INT:
2604     case T_FLOAT:
2605       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2606       break;
2607     case T_LONG:
2608     case T_DOUBLE:
2609       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2610       break;
2611     default:
2612       assert(false,"Should not reach here.");
2613       break;
2614   }
2615 }
2616 
2617 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2618   switch(typ) {
2619     case T_BYTE:
2620       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2621       break;
2622     case T_SHORT:
2623       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2624       break;
2625     case T_INT:
2626     case T_FLOAT:
2627       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2628       break;
2629     case T_LONG:
2630     case T_DOUBLE:
2631       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2632       break;
2633     default:
2634       assert(false,"Should not reach here.");
2635       break;
2636   }
2637 }
2638 
2639 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2640   assert(vlen_in_bytes <= 32, "");
2641   int esize = type2aelembytes(bt);
2642   if (vlen_in_bytes == 32) {
2643     assert(vtmp == xnoreg, "required.");
2644     if (esize >= 4) {
2645       vtestps(src1, src2, AVX_256bit);
2646     } else {
2647       vptest(src1, src2, AVX_256bit);
2648     }
2649     return;
2650   }
2651   if (vlen_in_bytes < 16) {
2652     // Duplicate the lower part to fill the whole register;
2653     // there is no need to do so for src2.
2654     assert(vtmp != xnoreg, "required");
2655     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
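         // 0x00 broadcasts dword 0 across the register; 0x04 selects {0, 1, 0, 0}, so
         // the bits beyond vlen_in_bytes come from valid lower lanes before the test.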
2656     pshufd(vtmp, src1, shuffle_imm);
2657   } else {
2658     assert(vtmp == xnoreg, "required");
2659     vtmp = src1;
2660   }
2661   if (esize >= 4 && VM_Version::supports_avx()) {
2662     vtestps(vtmp, src2, AVX_128bit);
2663   } else {
2664     ptest(vtmp, src2);
2665   }
2666 }
2667 
2668 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2669   assert(UseAVX >= 2, "required");
2670 #ifdef ASSERT
2671   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2672   bool is_bw_supported = VM_Version::supports_avx512bw();
2673   if (is_bw && !is_bw_supported) {
2674     assert(vlen_enc != Assembler::AVX_512bit, "required");
2675     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2676            "XMM register should be 0-15");
2677   }
2678 #endif // ASSERT
2679   switch (elem_bt) {
2680     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2681     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2682     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2683     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2684     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2685     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2686     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2687   }
2688 }
2689 
2690 #ifdef _LP64
2691 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2692   assert(UseAVX >= 2, "required");
2693   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2694   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2695   if ((UseAVX > 2) &&
2696       (!is_bw || VM_Version::supports_avx512bw()) &&
2697       (!is_vl || VM_Version::supports_avx512vl())) {
2698     switch (elem_bt) {
2699       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2700       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2701       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2702       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2703       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2704     }
2705   } else {
2706     assert(vlen_enc != Assembler::AVX_512bit, "required");
2707     assert((dst->encoding() < 16),"XMM register should be 0-15");
2708     switch (elem_bt) {
2709       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2710       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2711       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2712       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2713       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2714       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2715       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2716     }
2717   }
2718 }
2719 #endif
2720 
2721 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2722   switch (to_elem_bt) {
2723     case T_SHORT:
2724       vpmovsxbw(dst, src, vlen_enc);
2725       break;
2726     case T_INT:
2727       vpmovsxbd(dst, src, vlen_enc);
2728       break;
2729     case T_FLOAT:
2730       vpmovsxbd(dst, src, vlen_enc);
2731       vcvtdq2ps(dst, dst, vlen_enc);
2732       break;
2733     case T_LONG:
2734       vpmovsxbq(dst, src, vlen_enc);
2735       break;
2736     case T_DOUBLE: {
2737       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2738       vpmovsxbd(dst, src, mid_vlen_enc);
2739       vcvtdq2pd(dst, dst, vlen_enc);
2740       break;
2741     }
2742     default:
2743       fatal("Unsupported type %s", type2name(to_elem_bt));
2744       break;
2745   }
2746 }
2747 
2748 //-------------------------------------------------------------------------------------------
2749 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
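// Roughly equivalent Java-level logic (illustrative sketch only; the generated code
// scans the string in 16-byte chunks with pcmpestri instead of element by element):
//   static int indexOfC8(char[] str, int strLen, char[] sub, int subLen) {
//     for (int i = 0; i + subLen <= strLen; i++) {
//       int j = 0;
//       while (j < subLen && str[i + j] == sub[j]) j++;
//       if (j == subLen) return i;
//     }
//     return -1;
//   }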
2752 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2753                                          Register cnt1, Register cnt2,
2754                                          int int_cnt2,  Register result,
2755                                          XMMRegister vec, Register tmp,
2756                                          int ae) {
2757   ShortBranchVerifier sbv(this);
2758   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2759   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2760 
2761   // This method uses the pcmpestri instruction with bound registers
2762   //   inputs:
2763   //     xmm - substring
2764   //     rax - substring length (elements count)
2765   //     mem - scanned string
2766   //     rdx - string length (elements count)
2767   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2768   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2769   //   outputs:
2770   //     rcx - matched index in string
2771   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2772   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2773   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2774   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2775   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2776 
2777   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2778         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2779         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2780 
2781   // Note, inline_string_indexOf() generates checks:
2782   // if (substr.count > string.count) return -1;
2783   // if (substr.count == 0) return 0;
2784   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2785 
2786   // Load substring.
2787   if (ae == StrIntrinsicNode::UL) {
2788     pmovzxbw(vec, Address(str2, 0));
2789   } else {
2790     movdqu(vec, Address(str2, 0));
2791   }
2792   movl(cnt2, int_cnt2);
2793   movptr(result, str1); // string addr
2794 
2795   if (int_cnt2 > stride) {
2796     jmpb(SCAN_TO_SUBSTR);
2797 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars)
2800     bind(RELOAD_SUBSTR);
2801     if (ae == StrIntrinsicNode::UL) {
2802       pmovzxbw(vec, Address(str2, 0));
2803     } else {
2804       movdqu(vec, Address(str2, 0));
2805     }
2806     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2807 
2808     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2812 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2815     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2816     subl(cnt1, cnt2);
2817     addl(cnt1, int_cnt2);
2818     movl(cnt2, int_cnt2); // Now restore cnt2
2819 
2820     decrementl(cnt1);     // Shift to next element
2821     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2823 
2824     addptr(result, (1<<scale1));
2825 
2826   } // (int_cnt2 > 8)
2827 
2828   // Scan string for start of substr in 16-byte vectors
2829   bind(SCAN_TO_SUBSTR);
2830   pcmpestri(vec, Address(result, 0), mode);
2831   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2832   subl(cnt1, stride);
2833   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2834   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2836   addptr(result, 16);
2837   jmpb(SCAN_TO_SUBSTR);
2838 
2839   // Found a potential substr
2840   bind(FOUND_CANDIDATE);
2841   // Matched whole vector if first element matched (tmp(rcx) == 0).
2842   if (int_cnt2 == stride) {
2843     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2844   } else { // int_cnt2 > 8
2845     jccb(Assembler::overflow, FOUND_SUBSTR);
2846   }
2847   // After pcmpestri tmp(rcx) contains matched element index
2848   // Compute start addr of substr
2849   lea(result, Address(result, tmp, scale1));
2850 
2851   // Make sure string is still long enough
2852   subl(cnt1, tmp);
2853   cmpl(cnt1, cnt2);
2854   if (int_cnt2 == stride) {
2855     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2856   } else { // int_cnt2 > 8
2857     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2858   }
  // Fewer elements left than the substring.
2860 
2861   bind(RET_NOT_FOUND);
2862   movl(result, -1);
2863   jmp(EXIT);
2864 
2865   if (int_cnt2 > stride) {
2866     // This code is optimized for the case when whole substring
2867     // is matched if its head is matched.
2868     bind(MATCH_SUBSTR_HEAD);
2869     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2871     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2872 
2873     Label CONT_SCAN_SUBSTR;
2874     // Compare the rest of substring (> 8 chars).
2875     bind(FOUND_SUBSTR);
2876     // First 8 chars are already matched.
2877     negptr(cnt2);
2878     addptr(cnt2, stride);
2879 
2880     bind(SCAN_SUBSTR);
2881     subl(cnt1, stride);
2882     cmpl(cnt2, -stride); // Do not read beyond substring
2883     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back up the strings to avoid reading beyond the substring:
    // cnt1 = cnt1 - cnt2 + 8
2886     addl(cnt1, cnt2); // cnt2 is negative
2887     addl(cnt1, stride);
2888     movl(cnt2, stride); negptr(cnt2);
2889     bind(CONT_SCAN_SUBSTR);
2890     if (int_cnt2 < (int)G) {
2891       int tail_off1 = int_cnt2<<scale1;
2892       int tail_off2 = int_cnt2<<scale2;
2893       if (ae == StrIntrinsicNode::UL) {
2894         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2895       } else {
2896         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2897       }
2898       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2899     } else {
2900       // calculate index in register to avoid integer overflow (int_cnt2*2)
2901       movl(tmp, int_cnt2);
2902       addptr(tmp, cnt2);
2903       if (ae == StrIntrinsicNode::UL) {
2904         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2905       } else {
2906         movdqu(vec, Address(str2, tmp, scale2, 0));
2907       }
2908       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2909     }
    // Need to reload string pointers if the whole vector did not match
2911     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2912     addptr(cnt2, stride);
2913     jcc(Assembler::negative, SCAN_SUBSTR);
2914     // Fall through if found full substring
2915 
2916   } // (int_cnt2 > 8)
2917 
2918   bind(RET_FOUND);
2919   // Found result if we matched full small substring.
2920   // Compute substr offset
2921   subptr(result, str1);
2922   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2923     shrl(result, 1); // index
2924   }
2925   bind(EXIT);
2926 
2927 } // string_indexofC8
2928 
// Small strings are loaded through the stack if they cross a page boundary.
2930 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2931                                        Register cnt1, Register cnt2,
2932                                        int int_cnt2,  Register result,
2933                                        XMMRegister vec, Register tmp,
2934                                        int ae) {
2935   ShortBranchVerifier sbv(this);
2936   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2937   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2938 
2939   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or -1 for a non-constant substring, in which case its length
  // is in the cnt2 register.
2943   //
2944   // Note, inline_string_indexOf() generates checks:
2945   // if (substr.count > string.count) return -1;
2946   // if (substr.count == 0) return 0;
2947   //
2948   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2949   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2950   // This method uses the pcmpestri instruction with bound registers
2951   //   inputs:
2952   //     xmm - substring
2953   //     rax - substring length (elements count)
2954   //     mem - scanned string
2955   //     rdx - string length (elements count)
2956   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2957   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2958   //   outputs:
2959   //     rcx - matched index in string
2960   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2961   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2962   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2963   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2964 
2965   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2966         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2967         FOUND_CANDIDATE;
2968 
2969   { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2972     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2973 
2974     movptr(tmp, rsp); // save old SP
2975 
2976     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2977       if (int_cnt2 == (1>>scale2)) { // One byte
2978         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2979         load_unsigned_byte(result, Address(str2, 0));
2980         movdl(vec, result); // move 32 bits
2981       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2982         // Not enough header space in 32-bit VM: 12+3 = 15.
2983         movl(result, Address(str2, -1));
2984         shrl(result, 8);
2985         movdl(vec, result); // move 32 bits
2986       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2987         load_unsigned_short(result, Address(str2, 0));
2988         movdl(vec, result); // move 32 bits
2989       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2990         movdl(vec, Address(str2, 0)); // move 32 bits
2991       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2992         movq(vec, Address(str2, 0));  // move 64 bits
2993       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2994         // Array header size is 12 bytes in 32-bit VM
2995         // + 6 bytes for 3 chars == 18 bytes,
2996         // enough space to load vec and shift.
2997         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2998         if (ae == StrIntrinsicNode::UL) {
2999           int tail_off = int_cnt2-8;
3000           pmovzxbw(vec, Address(str2, tail_off));
3001           psrldq(vec, -2*tail_off);
3002         }
3003         else {
3004           int tail_off = int_cnt2*(1<<scale2);
3005           movdqu(vec, Address(str2, tail_off-16));
3006           psrldq(vec, 16-tail_off);
3007         }
3008       }
3009     } else { // not constant substring
3010       cmpl(cnt2, stride);
3011       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3012 
      // We can read beyond the string if str2+16 does not cross a page boundary
      // since heaps are aligned and mapped by pages.
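      // Illustrative form of the check below: with a power-of-two page size,
      //   (str2 & (page_size - 1)) <= page_size - 16
      // guarantees that a 16-byte load starting at str2 stays within the page.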
3015       assert(os::vm_page_size() < (int)G, "default page should be small");
3016       movl(result, str2); // We need only low 32 bits
3017       andl(result, ((int)os::vm_page_size()-1));
3018       cmpl(result, ((int)os::vm_page_size()-16));
3019       jccb(Assembler::belowEqual, CHECK_STR);
3020 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3022       subptr(rsp, 16);
3023       int stk_offset = wordSize-(1<<scale2);
3024       push(cnt2);
3025 
3026       bind(COPY_SUBSTR);
3027       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3028         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3029         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3030       } else if (ae == StrIntrinsicNode::UU) {
3031         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3032         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3033       }
3034       decrement(cnt2);
3035       jccb(Assembler::notZero, COPY_SUBSTR);
3036 
3037       pop(cnt2);
3038       movptr(str2, rsp);  // New substring address
3039     } // non constant
3040 
3041     bind(CHECK_STR);
3042     cmpl(cnt1, stride);
3043     jccb(Assembler::aboveEqual, BIG_STRINGS);
3044 
3045     // Check cross page boundary.
3046     movl(result, str1); // We need only low 32 bits
3047     andl(result, ((int)os::vm_page_size()-1));
3048     cmpl(result, ((int)os::vm_page_size()-16));
3049     jccb(Assembler::belowEqual, BIG_STRINGS);
3050 
3051     subptr(rsp, 16);
3052     int stk_offset = -(1<<scale1);
3053     if (int_cnt2 < 0) { // not constant
3054       push(cnt2);
3055       stk_offset += wordSize;
3056     }
3057     movl(cnt2, cnt1);
3058 
3059     bind(COPY_STR);
3060     if (ae == StrIntrinsicNode::LL) {
3061       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3062       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3063     } else {
3064       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3065       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3066     }
3067     decrement(cnt2);
3068     jccb(Assembler::notZero, COPY_STR);
3069 
3070     if (int_cnt2 < 0) { // not constant
3071       pop(cnt2);
3072     }
3073     movptr(str1, rsp);  // New string address
3074 
3075     bind(BIG_STRINGS);
3076     // Load substring.
3077     if (int_cnt2 < 0) { // -1
3078       if (ae == StrIntrinsicNode::UL) {
3079         pmovzxbw(vec, Address(str2, 0));
3080       } else {
3081         movdqu(vec, Address(str2, 0));
3082       }
3083       push(cnt2);       // substr count
3084       push(str2);       // substr addr
3085       push(str1);       // string addr
3086     } else {
3087       // Small (< 8 chars) constant substrings are loaded already.
3088       movl(cnt2, int_cnt2);
3089     }
3090     push(tmp);  // original SP
3091 
3092   } // Finished loading
3093 
3094   //========================================================
3095   // Start search
3096   //
3097 
3098   movptr(result, str1); // string addr
3099 
3100   if (int_cnt2  < 0) {  // Only for non constant substring
3101     jmpb(SCAN_TO_SUBSTR);
3102 
3103     // SP saved at sp+0
3104     // String saved at sp+1*wordSize
3105     // Substr saved at sp+2*wordSize
3106     // Substr count saved at sp+3*wordSize
3107 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars)
3110     bind(RELOAD_SUBSTR);
3111     movptr(str2, Address(rsp, 2*wordSize));
3112     movl(cnt2, Address(rsp, 3*wordSize));
3113     if (ae == StrIntrinsicNode::UL) {
3114       pmovzxbw(vec, Address(str2, 0));
3115     } else {
3116       movdqu(vec, Address(str2, 0));
3117     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3121     subptr(str1, result); // Restore counter
3122     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3123       shrl(str1, 1);
3124     }
3125     addl(cnt1, str1);
3126     decrementl(cnt1);   // Shift to next element
3127     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3129 
3130     addptr(result, (1<<scale1));
3131   } // non constant
3132 
3133   // Scan string for start of substr in 16-byte vectors
3134   bind(SCAN_TO_SUBSTR);
3135   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3136   pcmpestri(vec, Address(result, 0), mode);
3137   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3138   subl(cnt1, stride);
3139   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3140   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3142   addptr(result, 16);
3143 
3144   bind(ADJUST_STR);
3145   cmpl(cnt1, stride); // Do not read beyond string
3146   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back up the string pointer to avoid reading beyond the string.
3148   lea(result, Address(result, cnt1, scale1, -16));
3149   movl(cnt1, stride);
3150   jmpb(SCAN_TO_SUBSTR);
3151 
3152   // Found a potential substr
3153   bind(FOUND_CANDIDATE);
3154   // After pcmpestri tmp(rcx) contains matched element index
3155 
3156   // Make sure string is still long enough
3157   subl(cnt1, tmp);
3158   cmpl(cnt1, cnt2);
3159   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Fewer elements left than the substring.
3161 
3162   bind(RET_NOT_FOUND);
3163   movl(result, -1);
3164   jmp(CLEANUP);
3165 
3166   bind(FOUND_SUBSTR);
3167   // Compute start addr of substr
3168   lea(result, Address(result, tmp, scale1));
3169   if (int_cnt2 > 0) { // Constant substring
3170     // Repeat search for small substring (< 8 chars)
3171     // from new point without reloading substring.
3172     // Have to check that we don't read beyond string.
3173     cmpl(tmp, stride-int_cnt2);
3174     jccb(Assembler::greater, ADJUST_STR);
3175     // Fall through if matched whole substring.
3176   } else { // non constant
3177     assert(int_cnt2 == -1, "should be != 0");
3178 
3179     addl(tmp, cnt2);
3180     // Found result if we matched whole substring.
3181     cmpl(tmp, stride);
3182     jcc(Assembler::lessEqual, RET_FOUND);
3183 
3184     // Repeat search for small substring (<= 8 chars)
3185     // from new point 'str1' without reloading substring.
3186     cmpl(cnt2, stride);
3187     // Have to check that we don't read beyond string.
3188     jccb(Assembler::lessEqual, ADJUST_STR);
3189 
3190     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3191     // Compare the rest of substring (> 8 chars).
3192     movptr(str1, result);
3193 
3194     cmpl(tmp, cnt2);
3195     // First 8 chars are already matched.
3196     jccb(Assembler::equal, CHECK_NEXT);
3197 
3198     bind(SCAN_SUBSTR);
3199     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if the whole vector did not match
3201     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3202 
3203     bind(CHECK_NEXT);
3204     subl(cnt2, stride);
3205     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3206     addptr(str1, 16);
3207     if (ae == StrIntrinsicNode::UL) {
3208       addptr(str2, 8);
3209     } else {
3210       addptr(str2, 16);
3211     }
3212     subl(cnt1, stride);
3213     cmpl(cnt2, stride); // Do not read beyond substring
3214     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back up the strings to avoid reading beyond the substring.
3216 
3217     if (ae == StrIntrinsicNode::UL) {
3218       lea(str2, Address(str2, cnt2, scale2, -8));
3219       lea(str1, Address(str1, cnt2, scale1, -16));
3220     } else {
3221       lea(str2, Address(str2, cnt2, scale2, -16));
3222       lea(str1, Address(str1, cnt2, scale1, -16));
3223     }
3224     subl(cnt1, cnt2);
3225     movl(cnt2, stride);
3226     addl(cnt1, stride);
3227     bind(CONT_SCAN_SUBSTR);
3228     if (ae == StrIntrinsicNode::UL) {
3229       pmovzxbw(vec, Address(str2, 0));
3230     } else {
3231       movdqu(vec, Address(str2, 0));
3232     }
3233     jmp(SCAN_SUBSTR);
3234 
3235     bind(RET_FOUND_LONG);
3236     movptr(str1, Address(rsp, wordSize));
3237   } // non constant
3238 
3239   bind(RET_FOUND);
3240   // Compute substr offset
3241   subptr(result, str1);
3242   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3243     shrl(result, 1); // index
3244   }
3245   bind(CLEANUP);
3246   pop(rsp); // restore SP
3247 
3248 } // string_indexof
3249 
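// Find the first occurrence of a (UTF-16) char in a char sequence and return its index,
// or -1 if not found. Roughly (illustrative sketch only; the generated code scans 16 or
// 8 chars at a time with AVX2/SSE before falling back to a scalar tail loop):
//   static int indexOfChar(char[] str, int len, int ch) {
//     for (int i = 0; i < len; i++) {
//       if (str[i] == ch) return i;
//     }
//     return -1;
//   }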
3250 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3251                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3252   ShortBranchVerifier sbv(this);
3253   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3254 
3255   int stride = 8;
3256 
3257   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3258         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3259         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3260         FOUND_SEQ_CHAR, DONE_LABEL;
3261 
3262   movptr(result, str1);
3263   if (UseAVX >= 2) {
3264     cmpl(cnt1, stride);
3265     jcc(Assembler::less, SCAN_TO_CHAR);
3266     cmpl(cnt1, 2*stride);
3267     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3268     movdl(vec1, ch);
3269     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3270     vpxor(vec2, vec2);
3271     movl(tmp, cnt1);
3272     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3273     andl(cnt1,0x0000000F);  //tail count (in chars)
3274 
3275     bind(SCAN_TO_16_CHAR_LOOP);
3276     vmovdqu(vec3, Address(result, 0));
3277     vpcmpeqw(vec3, vec3, vec1, 1);
3278     vptest(vec2, vec3);
3279     jcc(Assembler::carryClear, FOUND_CHAR);
3280     addptr(result, 32);
3281     subl(tmp, 2*stride);
3282     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3283     jmp(SCAN_TO_8_CHAR);
3284     bind(SCAN_TO_8_CHAR_INIT);
3285     movdl(vec1, ch);
3286     pshuflw(vec1, vec1, 0x00);
3287     pshufd(vec1, vec1, 0);
3288     pxor(vec2, vec2);
3289   }
3290   bind(SCAN_TO_8_CHAR);
3291   cmpl(cnt1, stride);
3292   jcc(Assembler::less, SCAN_TO_CHAR);
3293   if (UseAVX < 2) {
3294     movdl(vec1, ch);
3295     pshuflw(vec1, vec1, 0x00);
3296     pshufd(vec1, vec1, 0);
3297     pxor(vec2, vec2);
3298   }
3299   movl(tmp, cnt1);
3300   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3301   andl(cnt1,0x00000007);  //tail count (in chars)
3302 
3303   bind(SCAN_TO_8_CHAR_LOOP);
3304   movdqu(vec3, Address(result, 0));
3305   pcmpeqw(vec3, vec1);
3306   ptest(vec2, vec3);
3307   jcc(Assembler::carryClear, FOUND_CHAR);
3308   addptr(result, 16);
3309   subl(tmp, stride);
3310   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3311   bind(SCAN_TO_CHAR);
3312   testl(cnt1, cnt1);
3313   jcc(Assembler::zero, RET_NOT_FOUND);
3314   bind(SCAN_TO_CHAR_LOOP);
3315   load_unsigned_short(tmp, Address(result, 0));
3316   cmpl(ch, tmp);
3317   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3318   addptr(result, 2);
3319   subl(cnt1, 1);
3320   jccb(Assembler::zero, RET_NOT_FOUND);
3321   jmp(SCAN_TO_CHAR_LOOP);
3322 
3323   bind(RET_NOT_FOUND);
3324   movl(result, -1);
3325   jmpb(DONE_LABEL);
3326 
3327   bind(FOUND_CHAR);
3328   if (UseAVX >= 2) {
3329     vpmovmskb(tmp, vec3);
3330   } else {
3331     pmovmskb(tmp, vec3);
3332   }
3333   bsfl(ch, tmp);
3334   addptr(result, ch);
3335 
3336   bind(FOUND_SEQ_CHAR);
3337   subptr(result, str1);
3338   shrl(result, 1);
3339 
3340   bind(DONE_LABEL);
3341 } // string_indexof_char
3342 
3343 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3344                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3345   ShortBranchVerifier sbv(this);
3346   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3347 
3348   int stride = 16;
3349 
3350   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3351         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3352         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3353         FOUND_SEQ_CHAR, DONE_LABEL;
3354 
3355   movptr(result, str1);
3356   if (UseAVX >= 2) {
3357     cmpl(cnt1, stride);
3358     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3359     cmpl(cnt1, stride*2);
3360     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3361     movdl(vec1, ch);
3362     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3363     vpxor(vec2, vec2);
3364     movl(tmp, cnt1);
3365     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3366     andl(cnt1,0x0000001F);  //tail count (in chars)
3367 
3368     bind(SCAN_TO_32_CHAR_LOOP);
3369     vmovdqu(vec3, Address(result, 0));
3370     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3371     vptest(vec2, vec3);
3372     jcc(Assembler::carryClear, FOUND_CHAR);
3373     addptr(result, 32);
3374     subl(tmp, stride*2);
3375     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3376     jmp(SCAN_TO_16_CHAR);
3377 
3378     bind(SCAN_TO_16_CHAR_INIT);
3379     movdl(vec1, ch);
3380     pxor(vec2, vec2);
3381     pshufb(vec1, vec2);
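    // pshufb with an all-zero index vector (vec2) replicates byte 0 of vec1 into every
    // byte lane, i.e. a cheap byte broadcast of the search character.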
3382   }
3383 
3384   bind(SCAN_TO_16_CHAR);
3385   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3387   if (UseAVX < 2) {
3388     movdl(vec1, ch);
3389     pxor(vec2, vec2);
3390     pshufb(vec1, vec2);
3391   }
3392   movl(tmp, cnt1);
3393   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3394   andl(cnt1,0x0000000F);  //tail count (in bytes)
3395 
3396   bind(SCAN_TO_16_CHAR_LOOP);
3397   movdqu(vec3, Address(result, 0));
3398   pcmpeqb(vec3, vec1);
3399   ptest(vec2, vec3);
3400   jcc(Assembler::carryClear, FOUND_CHAR);
3401   addptr(result, 16);
3402   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3404 
3405   bind(SCAN_TO_CHAR_INIT);
3406   testl(cnt1, cnt1);
3407   jcc(Assembler::zero, RET_NOT_FOUND);
3408   bind(SCAN_TO_CHAR_LOOP);
3409   load_unsigned_byte(tmp, Address(result, 0));
3410   cmpl(ch, tmp);
3411   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3412   addptr(result, 1);
3413   subl(cnt1, 1);
3414   jccb(Assembler::zero, RET_NOT_FOUND);
3415   jmp(SCAN_TO_CHAR_LOOP);
3416 
3417   bind(RET_NOT_FOUND);
3418   movl(result, -1);
3419   jmpb(DONE_LABEL);
3420 
3421   bind(FOUND_CHAR);
3422   if (UseAVX >= 2) {
3423     vpmovmskb(tmp, vec3);
3424   } else {
3425     pmovmskb(tmp, vec3);
3426   }
3427   bsfl(ch, tmp);
3428   addptr(result, ch);
3429 
3430   bind(FOUND_SEQ_CHAR);
3431   subptr(result, str1);
3432 
3433   bind(DONE_LABEL);
3434 } // stringL_indexof_char
3435 
3436 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3437   switch (eltype) {
3438   case T_BOOLEAN: return sizeof(jboolean);
3439   case T_BYTE:  return sizeof(jbyte);
3440   case T_SHORT: return sizeof(jshort);
3441   case T_CHAR:  return sizeof(jchar);
3442   case T_INT:   return sizeof(jint);
3443   default:
3444     ShouldNotReachHere();
3445     return -1;
3446   }
3447 }
3448 
3449 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3450   switch (eltype) {
3451   // T_BOOLEAN used as surrogate for unsigned byte
3452   case T_BOOLEAN: movzbl(dst, src);   break;
3453   case T_BYTE:    movsbl(dst, src);   break;
3454   case T_SHORT:   movswl(dst, src);   break;
3455   case T_CHAR:    movzwl(dst, src);   break;
3456   case T_INT:     movl(dst, src);     break;
3457   default:
3458     ShouldNotReachHere();
3459   }
3460 }
3461 
3462 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3463   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3464 }
3465 
3466 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3467   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3468 }
3469 
3470 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3471   const int vlen = Assembler::AVX_256bit;
3472   switch (eltype) {
3473   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3474   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3475   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3476   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3477   case T_INT:
3478     // do nothing
3479     break;
3480   default:
3481     ShouldNotReachHere();
3482   }
3483 }
3484 
3485 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3486                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3487                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3488                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3489                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3490                                         BasicType eltype) {
3491   ShortBranchVerifier sbv(this);
3492   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3493   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3494   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3495 
3496   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3497         SHORT_UNROLLED_LOOP_EXIT,
3498         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3499         UNROLLED_VECTOR_LOOP_BEGIN,
3500         END;
3501   switch (eltype) {
3502   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3503   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3504   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3505   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3506   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3507   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3508   }
3509 
  // Register array aliases ("renaming") for readability of the code
3511   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3512                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3513                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3514 
3515   const int elsize = arrays_hashcode_elsize(eltype);
3516 
3517   /*
3518     if (cnt1 >= 2) {
3519       if (cnt1 >= 32) {
3520         UNROLLED VECTOR LOOP
3521       }
3522       UNROLLED SCALAR LOOP
3523     }
3524     SINGLE SCALAR
3525    */
3526 
3527   cmpl(cnt1, 32);
3528   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3529 
3530   // cnt1 >= 32 && generate_vectorized_loop
3531   xorl(index, index);
3532 
3533   // vresult = IntVector.zero(I256);
3534   for (int idx = 0; idx < 4; idx++) {
3535     vpxor(vresult[idx], vresult[idx]);
3536   }
3537   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3538   Register bound = tmp2;
3539   Register next = tmp3;
3540   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3541   movl(next, Address(tmp2, 0));
3542   movdl(vnext, next);
3543   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3544 
3545   // index = 0;
3546   // bound = cnt1 & ~(32 - 1);
3547   movl(bound, cnt1);
3548   andl(bound, ~(32 - 1));
3549   // for (; index < bound; index += 32) {
3550   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3551   // result *= next;
3552   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OoO execution
  // can then hopefully do a better job of prefetching
3555   for (int idx = 0; idx < 4; idx++) {
3556     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3557   }
3558   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3559   for (int idx = 0; idx < 4; idx++) {
3560     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3561     arrays_hashcode_elvcast(vtmp[idx], eltype);
3562     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3563   }
3564   // index += 32;
3565   addl(index, 32);
3566   // index < bound;
3567   cmpl(index, bound);
3568   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3569   // }
3570 
3571   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3572   subl(cnt1, bound);
3573   // release bound
3574 
3575   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3576   for (int idx = 0; idx < 4; idx++) {
3577     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3578     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3579     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3580   }
3581   // result += vresult.reduceLanes(ADD);
3582   for (int idx = 0; idx < 4; idx++) {
3583     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3584   }
3585 
3586   // } else if (cnt1 < 32) {
3587 
3588   bind(SHORT_UNROLLED_BEGIN);
3589   // int i = 1;
3590   movl(index, 1);
3591   cmpl(index, cnt1);
3592   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3593 
3594   // for (; i < cnt1 ; i += 2) {
3595   bind(SHORT_UNROLLED_LOOP_BEGIN);
3596   movl(tmp3, 961);
3597   imull(result, tmp3);
3598   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3599   movl(tmp3, tmp2);
3600   shll(tmp3, 5);
3601   subl(tmp3, tmp2);
3602   addl(result, tmp3);
3603   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3604   addl(result, tmp3);
3605   addl(index, 2);
3606   cmpl(index, cnt1);
3607   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3608 
3609   // }
3610   // if (i >= cnt1) {
3611   bind(SHORT_UNROLLED_LOOP_EXIT);
3612   jccb(Assembler::greater, END);
3613   movl(tmp2, result);
3614   shll(result, 5);
3615   subl(result, tmp2);
3616   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3617   addl(result, tmp3);
3618   // }
3619   bind(END);
3620 
3621   BLOCK_COMMENT("} // arrays_hashcode");
3622 
3623 } // arrays_hashcode
3624 
3625 // helper function for string_compare
3626 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3627                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3628                                            Address::ScaleFactor scale2, Register index, int ae) {
3629   if (ae == StrIntrinsicNode::LL) {
3630     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3631     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3632   } else if (ae == StrIntrinsicNode::UU) {
3633     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3634     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3635   } else {
3636     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3637     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3638   }
3639 }
3640 
3641 // Compare strings, used for char[] and byte[].
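// Returns a negative value, zero, or a positive value as str1 is less than, equal to,
// or greater than str2 over the compared range. Roughly (illustrative Java-level sketch
// only; the generated code compares wide vectors before a scalar tail loop):
//   static int compare(char[] str1, int cnt1, char[] str2, int cnt2) {
//     int min = Math.min(cnt1, cnt2);
//     for (int i = 0; i < min; i++) {
//       if (str1[i] != str2[i]) return str1[i] - str2[i];
//     }
//     return cnt1 - cnt2;
//   }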
3642 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3643                                        Register cnt1, Register cnt2, Register result,
3644                                        XMMRegister vec1, int ae, KRegister mask) {
3645   ShortBranchVerifier sbv(this);
3646   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3647   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3648   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3649   int stride2x2 = 0x40;
3650   Address::ScaleFactor scale = Address::no_scale;
3651   Address::ScaleFactor scale1 = Address::no_scale;
3652   Address::ScaleFactor scale2 = Address::no_scale;
3653 
3654   if (ae != StrIntrinsicNode::LL) {
3655     stride2x2 = 0x20;
3656   }
3657 
3658   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3659     shrl(cnt2, 1);
3660   }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (kept on the stack).
  // Use a conditional move to compute the minimum.
3664   movl(result, cnt1);
3665   subl(cnt1, cnt2);
3666   push(cnt1);
3667   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3668 
3669   // Is the minimum length zero?
3670   testl(cnt2, cnt2);
3671   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3672   if (ae == StrIntrinsicNode::LL) {
3673     // Load first bytes
3674     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3675     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3676   } else if (ae == StrIntrinsicNode::UU) {
3677     // Load first characters
3678     load_unsigned_short(result, Address(str1, 0));
3679     load_unsigned_short(cnt1, Address(str2, 0));
3680   } else {
3681     load_unsigned_byte(result, Address(str1, 0));
3682     load_unsigned_short(cnt1, Address(str2, 0));
3683   }
3684   subl(result, cnt1);
3685   jcc(Assembler::notZero,  POP_LABEL);
3686 
3687   if (ae == StrIntrinsicNode::UU) {
3688     // Divide length by 2 to get number of chars
3689     shrl(cnt2, 1);
3690   }
3691   cmpl(cnt2, 1);
3692   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3693 
3694   // Check if the strings start at the same location and setup scale and stride
3695   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3696     cmpptr(str1, str2);
3697     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3698     if (ae == StrIntrinsicNode::LL) {
3699       scale = Address::times_1;
3700       stride = 16;
3701     } else {
3702       scale = Address::times_2;
3703       stride = 8;
3704     }
3705   } else {
3706     scale1 = Address::times_1;
3707     scale2 = Address::times_2;
3708     // scale not used
3709     stride = 8;
3710   }
3711 
3712   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3713     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3714     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3715     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3716     Label COMPARE_TAIL_LONG;
3717     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3718 
3719     int pcmpmask = 0x19;
3720     if (ae == StrIntrinsicNode::LL) {
3721       pcmpmask &= ~0x01;
3722     }
3723 
    // Set up to compare 16-char (32-byte) vectors,
    // starting from the first character again because its address is aligned.
3726     if (ae == StrIntrinsicNode::LL) {
3727       stride2 = 32;
3728     } else {
3729       stride2 = 16;
3730     }
3731     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3732       adr_stride = stride << scale;
3733     } else {
3734       adr_stride1 = 8;  //stride << scale1;
3735       adr_stride2 = 16; //stride << scale2;
3736     }
3737 
3738     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3739     // rax and rdx are used by pcmpestri as elements counters
3740     movl(result, cnt2);
3741     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3742     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3743 
3744     // fast path : compare first 2 8-char vectors.
3745     bind(COMPARE_16_CHARS);
3746     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3747       movdqu(vec1, Address(str1, 0));
3748     } else {
3749       pmovzxbw(vec1, Address(str1, 0));
3750     }
3751     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3752     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3753 
3754     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3755       movdqu(vec1, Address(str1, adr_stride));
3756       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3757     } else {
3758       pmovzxbw(vec1, Address(str1, adr_stride1));
3759       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3760     }
3761     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3762     addl(cnt1, stride);
3763 
3764     // Compare the characters at index in cnt1
3765     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3766     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3767     subl(result, cnt2);
3768     jmp(POP_LABEL);
3769 
3770     // Setup the registers to start vector comparison loop
3771     bind(COMPARE_WIDE_VECTORS);
3772     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3773       lea(str1, Address(str1, result, scale));
3774       lea(str2, Address(str2, result, scale));
3775     } else {
3776       lea(str1, Address(str1, result, scale1));
3777       lea(str2, Address(str2, result, scale2));
3778     }
3779     subl(result, stride2);
3780     subl(cnt2, stride2);
3781     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3782     negptr(result);
3783 
3784     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3785     bind(COMPARE_WIDE_VECTORS_LOOP);
3786 
3787 #ifdef _LP64
3788     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3789       cmpl(cnt2, stride2x2);
3790       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3791       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3792       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3793 
3794       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3795       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3796         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3797         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3798       } else {
3799         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3800         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3801       }
3802       kortestql(mask, mask);
3803       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3804       addptr(result, stride2x2);  // update since we already compared at this addr
3805       subl(cnt2, stride2x2);      // and sub the size too
3806       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3807 
3808       vpxor(vec1, vec1);
3809       jmpb(COMPARE_WIDE_TAIL);
3810     }//if (VM_Version::supports_avx512vlbw())
3811 #endif // _LP64
3812 
3813 
3814     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3815     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816       vmovdqu(vec1, Address(str1, result, scale));
3817       vpxor(vec1, Address(str2, result, scale));
3818     } else {
3819       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3820       vpxor(vec1, Address(str2, result, scale2));
3821     }
3822     vptest(vec1, vec1);
3823     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3824     addptr(result, stride2);
3825     subl(cnt2, stride2);
3826     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3827     // clean upper bits of YMM registers
3828     vpxor(vec1, vec1);
3829 
3830     // compare wide vectors tail
3831     bind(COMPARE_WIDE_TAIL);
3832     testptr(result, result);
3833     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3834 
3835     movl(result, stride2);
3836     movl(cnt2, result);
3837     negptr(result);
3838     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3839 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3841     bind(VECTOR_NOT_EQUAL);
3842     // clean upper bits of YMM registers
3843     vpxor(vec1, vec1);
3844     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3845       lea(str1, Address(str1, result, scale));
3846       lea(str2, Address(str2, result, scale));
3847     } else {
3848       lea(str1, Address(str1, result, scale1));
3849       lea(str2, Address(str2, result, scale2));
3850     }
3851     jmp(COMPARE_16_CHARS);
3852 
    // Compare tail chars, length between 1 and 15 chars
3854     bind(COMPARE_TAIL_LONG);
3855     movl(cnt2, result);
3856     cmpl(cnt2, stride);
3857     jcc(Assembler::less, COMPARE_SMALL_STR);
3858 
3859     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3860       movdqu(vec1, Address(str1, 0));
3861     } else {
3862       pmovzxbw(vec1, Address(str1, 0));
3863     }
3864     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3865     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3866     subptr(cnt2, stride);
3867     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3868     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3869       lea(str1, Address(str1, result, scale));
3870       lea(str2, Address(str2, result, scale));
3871     } else {
3872       lea(str1, Address(str1, result, scale1));
3873       lea(str2, Address(str2, result, scale2));
3874     }
3875     negptr(cnt2);
3876     jmpb(WHILE_HEAD_LABEL);
3877 
3878     bind(COMPARE_SMALL_STR);
3879   } else if (UseSSE42Intrinsics) {
3880     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3881     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // starting from the first character again because its address is aligned.
3884     movl(result, cnt2);
3885     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3886     if (ae == StrIntrinsicNode::LL) {
3887       pcmpmask &= ~0x01;
3888     }
3889     jcc(Assembler::zero, COMPARE_TAIL);
3890     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3891       lea(str1, Address(str1, result, scale));
3892       lea(str2, Address(str2, result, scale));
3893     } else {
3894       lea(str1, Address(str1, result, scale1));
3895       lea(str2, Address(str2, result, scale2));
3896     }
3897     negptr(result);
3898 
3899     // pcmpestri
3900     //   inputs:
3901     //     vec1- substring
3902     //     rax - negative string length (elements count)
3903     //     mem - scanned string
3904     //     rdx - string length (elements count)
3905     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3906     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3907     //   outputs:
3908     //     rcx - first mismatched element index
3909     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3910 
3911     bind(COMPARE_WIDE_VECTORS);
3912     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3913       movdqu(vec1, Address(str1, result, scale));
3914       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3915     } else {
3916       pmovzxbw(vec1, Address(str1, result, scale1));
3917       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3918     }
3919     // After pcmpestri cnt1(rcx) contains mismatched element index
3920 
3921     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3922     addptr(result, stride);
3923     subptr(cnt2, stride);
3924     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3925 
3926     // compare wide vectors tail
3927     testptr(result, result);
3928     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3929 
3930     movl(cnt2, stride);
3931     movl(result, stride);
3932     negptr(result);
3933     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3934       movdqu(vec1, Address(str1, result, scale));
3935       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3936     } else {
3937       pmovzxbw(vec1, Address(str1, result, scale1));
3938       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3939     }
3940     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3941 
3942     // Mismatched characters in the vectors
3943     bind(VECTOR_NOT_EQUAL);
3944     addptr(cnt1, result);
3945     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3946     subl(result, cnt2);
3947     jmpb(POP_LABEL);
3948 
3949     bind(COMPARE_TAIL); // limit is zero
3950     movl(cnt2, result);
3951     // Fallthru to tail compare
3952   }
3953   // Shift str2 and str1 to the end of the arrays, negate min
3954   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3955     lea(str1, Address(str1, cnt2, scale));
3956     lea(str2, Address(str2, cnt2, scale));
3957   } else {
3958     lea(str1, Address(str1, cnt2, scale1));
3959     lea(str2, Address(str2, cnt2, scale2));
3960   }
3961   decrementl(cnt2);  // first character was compared already
3962   negptr(cnt2);
3963 
3964   // Compare the rest of the elements
3965   bind(WHILE_HEAD_LABEL);
3966   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3967   subl(result, cnt1);
3968   jccb(Assembler::notZero, POP_LABEL);
3969   increment(cnt2);
3970   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3971 
3972   // Strings are equal up to min length.  Return the length difference.
3973   bind(LENGTH_DIFF_LABEL);
3974   pop(result);
3975   if (ae == StrIntrinsicNode::UU) {
3976     // Divide diff by 2 to get number of chars
3977     sarl(result, 1);
3978   }
3979   jmpb(DONE_LABEL);
3980 
3981 #ifdef _LP64
3982   if (VM_Version::supports_avx512vlbw()) {
3983 
3984     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3985 
3986     kmovql(cnt1, mask);
3987     notq(cnt1);
3988     bsfq(cnt2, cnt1);
3989     if (ae != StrIntrinsicNode::LL) {
3990       // Divide diff by 2 to get number of chars
3991       sarl(cnt2, 1);
3992     }
3993     addq(result, cnt2);
3994     if (ae == StrIntrinsicNode::LL) {
3995       load_unsigned_byte(cnt1, Address(str2, result));
3996       load_unsigned_byte(result, Address(str1, result));
3997     } else if (ae == StrIntrinsicNode::UU) {
3998       load_unsigned_short(cnt1, Address(str2, result, scale));
3999       load_unsigned_short(result, Address(str1, result, scale));
4000     } else {
4001       load_unsigned_short(cnt1, Address(str2, result, scale2));
4002       load_unsigned_byte(result, Address(str1, result, scale1));
4003     }
4004     subl(result, cnt1);
4005     jmpb(POP_LABEL);
4006   }//if (VM_Version::supports_avx512vlbw())
4007 #endif // _LP64
4008 
4009   // Discard the stored length difference
4010   bind(POP_LABEL);
4011   pop(cnt1);
4012 
4013   // That's it
4014   bind(DONE_LABEL);
4015   if(ae == StrIntrinsicNode::UL) {
4016     negl(result);
4017   }
4018 
4019 }
4020 
4021 // Search for Non-ASCII character (Negative byte value) in a byte array,
4022 // return the index of the first such character, otherwise the length
4023 // of the array segment searched.
4024 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4025 //   @IntrinsicCandidate
4026 //   public static int countPositives(byte[] ba, int off, int len) {
4027 //     for (int i = off; i < off + len; i++) {
4028 //       if (ba[i] < 0) {
4029 //         return i - off;
4030 //       }
4031 //     }
4032 //     return len;
4033 //   }
4034 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4035   Register result, Register tmp1,
4036   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4037   // rsi: byte array
4038   // rcx: len
4039   // rax: result
4040   ShortBranchVerifier sbv(this);
4041   assert_different_registers(ary1, len, result, tmp1);
4042   assert_different_registers(vec1, vec2);
4043   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4044 
4045   movl(result, len); // copy
4046   // len == 0
4047   testl(len, len);
4048   jcc(Assembler::zero, DONE);
4049 
4050   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4051     VM_Version::supports_avx512vlbw() &&
4052     VM_Version::supports_bmi2()) {
4053 
4054     Label test_64_loop, test_tail, BREAK_LOOP;
4055     movl(tmp1, len);
4056     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4057 
4058     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4059     andl(len,  0xffffffc0); // vector count (in chars)
4060     jccb(Assembler::zero, test_tail);
4061 
4062     lea(ary1, Address(ary1, len, Address::times_1));
4063     negptr(len);
4064 
4065     bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
4067     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4068     kortestql(mask1, mask1);
4069     jcc(Assembler::notZero, BREAK_LOOP);
4070 
4071     addptr(len, 64);
4072     jccb(Assembler::notZero, test_64_loop);
4073 
4074     bind(test_tail);
4075     // bail out when there is nothing to be done
4076     testl(tmp1, -1);
4077     jcc(Assembler::zero, DONE);
4078 
4079 
    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
4082 #ifdef _LP64
4083     {
4084       Register tmp3_aliased = len;
4085       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4086       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4087       notq(tmp3_aliased);
4088       kmovql(mask2, tmp3_aliased);
4089     }
4090 #else
4091     Label k_init;
4092     jmp(k_init);
4093 
    // We cannot read 64 bits from a general-purpose register, thus we move the
    // data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which later on is
    // used as a compare target with the tail count contained in the tmp1 register.
    // The result is a k register having tmp1 consecutive 1's counting from the
    // least significant bit.
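    // e.g. with tmp1 == 3, broadcasting 3 and comparing it greater-than against the
    // byte sequence 0,1,2,3,... sets exactly the low 3 mask bits (illustrative example).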
4100     address tmp = pc();
4101     emit_int64(0x0706050403020100);
4102     emit_int64(0x0F0E0D0C0B0A0908);
4103     emit_int64(0x1716151413121110);
4104     emit_int64(0x1F1E1D1C1B1A1918);
4105     emit_int64(0x2726252423222120);
4106     emit_int64(0x2F2E2D2C2B2A2928);
4107     emit_int64(0x3736353433323130);
4108     emit_int64(0x3F3E3D3C3B3A3938);
4109 
4110     bind(k_init);
4111     lea(len, InternalAddress(tmp));
4112     // create mask to test for negative byte inside a vector
4113     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4114     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4115 
4116 #endif
4117     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4118     ktestq(mask1, mask2);
4119     jcc(Assembler::zero, DONE);
4120 
    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4123                      // ary1 already pointing to the right place
4124     jmpb(TAIL_START);
4125 
4126     bind(BREAK_LOOP);
    // At least one byte in the last 64-byte block was negative.
4128     // Set up to look at the last 64 bytes as if they were a tail
4129     lea(ary1, Address(ary1, len, Address::times_1));
4130     addptr(result, len);
4131     // Ignore the very last byte: if all others are positive,
4132     // it must be negative, so we can skip right to the 2+1 byte
4133     // end comparison at this point
4134     orl(result, 63);
4135     movl(len, 63);
4136     // Fallthru to tail compare
4137   } else {
4138 
4139     if (UseAVX >= 2 && UseSSE >= 2) {
4140       // With AVX2, use 32-byte vector compare
4141       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4142 
4143       // Compare 32-byte vectors
4144       testl(len, 0xffffffe0);   // vector count (in bytes)
4145       jccb(Assembler::zero, TAIL_START);
4146 
4147       andl(len, 0xffffffe0);
4148       lea(ary1, Address(ary1, len, Address::times_1));
4149       negptr(len);
4150 
4151       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4152       movdl(vec2, tmp1);
4153       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4154 
4155       bind(COMPARE_WIDE_VECTORS);
4156       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4157       vptest(vec1, vec2);
4158       jccb(Assembler::notZero, BREAK_LOOP);
4159       addptr(len, 32);
4160       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4161 
4162       testl(result, 0x0000001f);   // any bytes remaining?
4163       jcc(Assembler::zero, DONE);
4164 
4165       // Quick test using the already prepared vector mask
4166       movl(len, result);
4167       andl(len, 0x0000001f);
4168       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4169       vptest(vec1, vec2);
4170       jcc(Assembler::zero, DONE);
4171       // There are zeros, jump to the tail to determine exactly where
4172       jmpb(TAIL_START);
4173 
4174       bind(BREAK_LOOP);
4175       // At least one byte in the last 32-byte vector is negative.
4176       // Set up to look at the last 32 bytes as if they were a tail
4177       lea(ary1, Address(ary1, len, Address::times_1));
4178       addptr(result, len);
4179       // Ignore the very last byte: if all others are positive,
4180       // it must be negative, so we can skip right to the 2+1 byte
4181       // end comparison at this point
4182       orl(result, 31);
4183       movl(len, 31);
4184       // Fallthru to tail compare
4185     } else if (UseSSE42Intrinsics) {
4186       // With SSE4.2, use double quad vector compare
4187       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4188 
4189       // Compare 16-byte vectors
4190       testl(len, 0xfffffff0);   // vector count (in bytes)
4191       jcc(Assembler::zero, TAIL_START);
4192 
4193       andl(len, 0xfffffff0);
4194       lea(ary1, Address(ary1, len, Address::times_1));
4195       negptr(len);
4196 
4197       movl(tmp1, 0x80808080);
4198       movdl(vec2, tmp1);
4199       pshufd(vec2, vec2, 0);
4200 
4201       bind(COMPARE_WIDE_VECTORS);
4202       movdqu(vec1, Address(ary1, len, Address::times_1));
4203       ptest(vec1, vec2);
4204       jccb(Assembler::notZero, BREAK_LOOP);
4205       addptr(len, 16);
4206       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4207 
4208       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4209       jcc(Assembler::zero, DONE);
4210 
4211       // Quick test using the already prepared vector mask
4212       movl(len, result);
4213       andl(len, 0x0000000f);   // tail count (in bytes)
4214       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4215       ptest(vec1, vec2);
4216       jcc(Assembler::zero, DONE);
4217       jmpb(TAIL_START);
4218 
4219       bind(BREAK_LOOP);
4220       // At least one byte in the last 16-byte vector is negative.
4221       // Set up and look at the last 16 bytes as if they were a tail
4222       lea(ary1, Address(ary1, len, Address::times_1));
4223       addptr(result, len);
4224       // Ignore the very last byte: if all others are positive,
4225       // it must be negative, so we can skip right to the 2+1 byte
4226       // end comparison at this point
4227       orl(result, 15);
4228       movl(len, 15);
4229       // Fallthru to tail compare
4230     }
4231   }
4232 
4233   bind(TAIL_START);
4234   // Compare 4-byte vectors
4235   andl(len, 0xfffffffc); // vector count (in bytes)
4236   jccb(Assembler::zero, COMPARE_CHAR);
4237 
4238   lea(ary1, Address(ary1, len, Address::times_1));
4239   negptr(len);
4240 
4241   bind(COMPARE_VECTORS);
4242   movl(tmp1, Address(ary1, len, Address::times_1));
4243   andl(tmp1, 0x80808080);
4244   jccb(Assembler::notZero, TAIL_ADJUST);
4245   addptr(len, 4);
4246   jccb(Assembler::notZero, COMPARE_VECTORS);
4247 
4248   // Compare trailing char (final 2-3 bytes), if any
4249   bind(COMPARE_CHAR);
4250 
4251   testl(result, 0x2);   // tail  char
4252   jccb(Assembler::zero, COMPARE_BYTE);
4253   load_unsigned_short(tmp1, Address(ary1, 0));
4254   andl(tmp1, 0x00008080);
4255   jccb(Assembler::notZero, CHAR_ADJUST);
4256   lea(ary1, Address(ary1, 2));
4257 
4258   bind(COMPARE_BYTE);
4259   testl(result, 0x1);   // tail  byte
4260   jccb(Assembler::zero, DONE);
4261   load_unsigned_byte(tmp1, Address(ary1, 0));
4262   testl(tmp1, 0x00000080);
4263   jccb(Assembler::zero, DONE);
4264   subptr(result, 1);
4265   jmpb(DONE);
4266 
4267   bind(TAIL_ADJUST);
  // there are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes
4270   addptr(result, len);
4271   orl(result, 3);
4272   lea(ary1, Address(ary1, len, Address::times_1));
4273   jmpb(COMPARE_CHAR);
4274 
4275   bind(CHAR_ADJUST);
4276   // We are looking at a char + optional byte tail, and found that one
4277   // of the bytes in the char is negative. Adjust the result, check the
4278   // first byte and readjust if needed.
4279   andl(result, 0xfffffffc);
4280   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4281   jccb(Assembler::notZero, DONE);
4282   addptr(result, 1);
4283 
4284   // That's it
4285   bind(DONE);
4286   if (UseAVX >= 2 && UseSSE >= 2) {
4287     // clean upper bits of YMM registers
4288     vpxor(vec1, vec1);
4289     vpxor(vec2, vec2);
4290   }
4291 }
4292 
4293 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4294 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4295                                       Register limit, Register result, Register chr,
4296                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4297                                       KRegister mask, bool expand_ary2) {
4298   // for expand_ary2, limit is the (smaller) size of the second array.
4299   ShortBranchVerifier sbv(this);
4300   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4301 
4302   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4303          "Expansion only implemented for AVX2");
4304 
4305   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4306   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4307 
4308   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4309   int scaleIncr = expand_ary2 ? 8 : 16;
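  // With expand_ary2, ary1 holds 2-byte elements while ary2 holds 1-byte
  // elements: each ary2 byte is zero-extended to 16 bits (vpmovzxbw below)
  // before comparing, so ary1 is addressed with times_2 and the per-iteration
  // increment (scaleIncr) for limit, which counts ary2 bytes, is halved.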
4310 
4311   if (is_array_equ) {
4312     // Check the input args
4313     cmpoop(ary1, ary2);
4314     jcc(Assembler::equal, TRUE_LABEL);
4315 
4316     // Need additional checks for arrays_equals.
4317     testptr(ary1, ary1);
4318     jcc(Assembler::zero, FALSE_LABEL);
4319     testptr(ary2, ary2);
4320     jcc(Assembler::zero, FALSE_LABEL);
4321 
4322     // Check the lengths
4323     movl(limit, Address(ary1, length_offset));
4324     cmpl(limit, Address(ary2, length_offset));
4325     jcc(Assembler::notEqual, FALSE_LABEL);
4326   }
4327 
4328   // count == 0
4329   testl(limit, limit);
4330   jcc(Assembler::zero, TRUE_LABEL);
4331 
4332   if (is_array_equ) {
4333     // Load array address
4334     lea(ary1, Address(ary1, base_offset));
4335     lea(ary2, Address(ary2, base_offset));
4336   }
4337 
4338   if (is_array_equ && is_char) {
4339     // arrays_equals when used for char[].
4340     shll(limit, 1);      // byte count != 0
4341   }
4342   movl(result, limit); // copy
4343 
4344   if (UseAVX >= 2) {
4345     // With AVX2, use 32-byte vector compare
4346     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4347 
4348     // Compare 32-byte vectors
4349     if (expand_ary2) {
4350       andl(result, 0x0000000f);  //   tail count (in bytes)
4351       andl(limit, 0xfffffff0);   // vector count (in bytes)
4352       jcc(Assembler::zero, COMPARE_TAIL);
4353     } else {
4354       andl(result, 0x0000001f);  //   tail count (in bytes)
4355       andl(limit, 0xffffffe0);   // vector count (in bytes)
4356       jcc(Assembler::zero, COMPARE_TAIL_16);
4357     }
4358 
4359     lea(ary1, Address(ary1, limit, scaleFactor));
4360     lea(ary2, Address(ary2, limit, Address::times_1));
4361     negptr(limit);
4362 
4363 #ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try a 64-byte fast loop
4365       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4366 
4367       cmpl(limit, -64);
4368       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4369 
4370       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4371 
4372       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4373       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4374       kortestql(mask, mask);
4375       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4376       addptr(limit, 64);  // update since we already compared at this addr
4377       cmpl(limit, -64);
4378       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4379 
4380       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4382       //  cmpl(limit, 0);
4383       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4384       // But since we stopped at the points ary{1,2}+limit which are
4385       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4386       // (|limit| <= 32 and result < 32),
4387       // we may just compare the last 64 bytes.
4388       //
      addptr(result, -64);   // it is safe, because we just came from this area
4390       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4391       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4392       kortestql(mask, mask);
4393       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4394 
4395       jmp(TRUE_LABEL);
4396 
4397       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4398 
4399     }//if (VM_Version::supports_avx512vlbw())
4400 #endif //_LP64
4401     bind(COMPARE_WIDE_VECTORS);
4402     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4403     if (expand_ary2) {
4404       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4405     } else {
4406       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4407     }
4408     vpxor(vec1, vec2);
4409 
4410     vptest(vec1, vec1);
4411     jcc(Assembler::notZero, FALSE_LABEL);
4412     addptr(limit, scaleIncr * 2);
4413     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4414 
4415     testl(result, result);
4416     jcc(Assembler::zero, TRUE_LABEL);
4417 
4418     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4419     if (expand_ary2) {
4420       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4421     } else {
4422       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4423     }
4424     vpxor(vec1, vec2);
4425 
4426     vptest(vec1, vec1);
4427     jcc(Assembler::notZero, FALSE_LABEL);
4428     jmp(TRUE_LABEL);
4429 
4430     bind(COMPARE_TAIL_16); // limit is zero
4431     movl(limit, result);
4432 
4433     // Compare 16-byte chunks
4434     andl(result, 0x0000000f);  //   tail count (in bytes)
4435     andl(limit, 0xfffffff0);   // vector count (in bytes)
4436     jcc(Assembler::zero, COMPARE_TAIL);
4437 
4438     lea(ary1, Address(ary1, limit, scaleFactor));
4439     lea(ary2, Address(ary2, limit, Address::times_1));
4440     negptr(limit);
4441 
4442     bind(COMPARE_WIDE_VECTORS_16);
4443     movdqu(vec1, Address(ary1, limit, scaleFactor));
4444     if (expand_ary2) {
4445       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4446     } else {
4447       movdqu(vec2, Address(ary2, limit, Address::times_1));
4448     }
4449     pxor(vec1, vec2);
4450 
4451     ptest(vec1, vec1);
4452     jcc(Assembler::notZero, FALSE_LABEL);
4453     addptr(limit, scaleIncr);
4454     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4455 
4456     bind(COMPARE_TAIL); // limit is zero
4457     movl(limit, result);
4458     // Fallthru to tail compare
4459   } else if (UseSSE42Intrinsics) {
4460     // With SSE4.2, use double quad vector compare
4461     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4462 
4463     // Compare 16-byte vectors
4464     andl(result, 0x0000000f);  //   tail count (in bytes)
4465     andl(limit, 0xfffffff0);   // vector count (in bytes)
4466     jcc(Assembler::zero, COMPARE_TAIL);
4467 
4468     lea(ary1, Address(ary1, limit, Address::times_1));
4469     lea(ary2, Address(ary2, limit, Address::times_1));
4470     negptr(limit);
4471 
4472     bind(COMPARE_WIDE_VECTORS);
4473     movdqu(vec1, Address(ary1, limit, Address::times_1));
4474     movdqu(vec2, Address(ary2, limit, Address::times_1));
4475     pxor(vec1, vec2);
4476 
4477     ptest(vec1, vec1);
4478     jcc(Assembler::notZero, FALSE_LABEL);
4479     addptr(limit, 16);
4480     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4481 
4482     testl(result, result);
4483     jcc(Assembler::zero, TRUE_LABEL);
4484 
4485     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4486     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4487     pxor(vec1, vec2);
4488 
4489     ptest(vec1, vec1);
4490     jccb(Assembler::notZero, FALSE_LABEL);
4491     jmpb(TRUE_LABEL);
4492 
4493     bind(COMPARE_TAIL); // limit is zero
4494     movl(limit, result);
4495     // Fallthru to tail compare
4496   }
4497 
4498   // Compare 4-byte vectors
4499   if (expand_ary2) {
4500     testl(result, result);
4501     jccb(Assembler::zero, TRUE_LABEL);
4502   } else {
4503     andl(limit, 0xfffffffc); // vector count (in bytes)
4504     jccb(Assembler::zero, COMPARE_CHAR);
4505   }
4506 
4507   lea(ary1, Address(ary1, limit, scaleFactor));
4508   lea(ary2, Address(ary2, limit, Address::times_1));
4509   negptr(limit);
4510 
4511   bind(COMPARE_VECTORS);
4512   if (expand_ary2) {
    // There is no vector compare between byte and short elements, so compare element by element
4514     movzbl(chr, Address(ary2, limit, Address::times_1));
4515     cmpw(Address(ary1, limit, Address::times_2), chr);
4516     jccb(Assembler::notEqual, FALSE_LABEL);
4517     addptr(limit, 1);
4518     jcc(Assembler::notZero, COMPARE_VECTORS);
4519     jmp(TRUE_LABEL);
4520   } else {
4521     movl(chr, Address(ary1, limit, Address::times_1));
4522     cmpl(chr, Address(ary2, limit, Address::times_1));
4523     jccb(Assembler::notEqual, FALSE_LABEL);
4524     addptr(limit, 4);
4525     jcc(Assembler::notZero, COMPARE_VECTORS);
4526   }
4527 
4528   // Compare trailing char (final 2 bytes), if any
4529   bind(COMPARE_CHAR);
4530   testl(result, 0x2);   // tail  char
4531   jccb(Assembler::zero, COMPARE_BYTE);
4532   load_unsigned_short(chr, Address(ary1, 0));
4533   load_unsigned_short(limit, Address(ary2, 0));
4534   cmpl(chr, limit);
4535   jccb(Assembler::notEqual, FALSE_LABEL);
4536 
4537   if (is_array_equ && is_char) {
4538     bind(COMPARE_BYTE);
4539   } else {
4540     lea(ary1, Address(ary1, 2));
4541     lea(ary2, Address(ary2, 2));
4542 
4543     bind(COMPARE_BYTE);
4544     testl(result, 0x1);   // tail  byte
4545     jccb(Assembler::zero, TRUE_LABEL);
4546     load_unsigned_byte(chr, Address(ary1, 0));
4547     load_unsigned_byte(limit, Address(ary2, 0));
4548     cmpl(chr, limit);
4549     jccb(Assembler::notEqual, FALSE_LABEL);
4550   }
4551   bind(TRUE_LABEL);
4552   movl(result, 1);   // return true
4553   jmpb(DONE);
4554 
4555   bind(FALSE_LABEL);
4556   xorl(result, result); // return false
4557 
4558   // That's it
4559   bind(DONE);
4560   if (UseAVX >= 2) {
4561     // clean upper bits of YMM registers
4562     vpxor(vec1, vec1);
4563     vpxor(vec2, vec2);
4564   }
4565 }
4566 
4567 #ifdef _LP64
4568 
4569 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4570 #define __ masm.
4571   Register dst = stub.data<0>();
4572   XMMRegister src = stub.data<1>();
4573   address target = stub.data<2>();
4574   __ bind(stub.entry());
4575   __ subptr(rsp, 8);
4576   __ movdbl(Address(rsp), src);
4577   __ call(RuntimeAddress(target));
4578   __ pop(dst);
4579   __ jmp(stub.continuation());
4580 #undef __
4581 }
4582 
4583 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4584   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4585   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4586 
4587   address slowpath_target;
4588   if (dst_bt == T_INT) {
4589     if (src_bt == T_FLOAT) {
4590       cvttss2sil(dst, src);
4591       cmpl(dst, 0x80000000);
4592       slowpath_target = StubRoutines::x86::f2i_fixup();
4593     } else {
4594       cvttsd2sil(dst, src);
4595       cmpl(dst, 0x80000000);
4596       slowpath_target = StubRoutines::x86::d2i_fixup();
4597     }
4598   } else {
4599     if (src_bt == T_FLOAT) {
4600       cvttss2siq(dst, src);
4601       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4602       slowpath_target = StubRoutines::x86::f2l_fixup();
4603     } else {
4604       cvttsd2siq(dst, src);
4605       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4606       slowpath_target = StubRoutines::x86::d2l_fixup();
4607     }
4608   }
4609 
4610   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4611   jcc(Assembler::equal, stub->entry());
4612   bind(stub->continuation());
4613 }
4614 
4615 #endif // _LP64
4616 
4617 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4618                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4619   switch(ideal_opc) {
4620     case Op_LShiftVS:
4621       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4622     case Op_LShiftVI:
4623       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4624     case Op_LShiftVL:
4625       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4626     case Op_RShiftVS:
4627       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4628     case Op_RShiftVI:
4629       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4630     case Op_RShiftVL:
4631       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4632     case Op_URShiftVS:
4633       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4634     case Op_URShiftVI:
4635       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4636     case Op_URShiftVL:
4637       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4638     case Op_RotateRightV:
4639       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4640     case Op_RotateLeftV:
4641       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4642     default:
4643       fatal("Unsupported masked operation"); break;
4644   }
4645 }
4646 
4647 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4648                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4649                                     bool is_varshift) {
4650   switch (ideal_opc) {
4651     case Op_AddVB:
4652       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4653     case Op_AddVS:
4654       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4655     case Op_AddVI:
4656       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4657     case Op_AddVL:
4658       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4659     case Op_AddVF:
4660       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4661     case Op_AddVD:
4662       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4663     case Op_SubVB:
4664       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4665     case Op_SubVS:
4666       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4667     case Op_SubVI:
4668       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4669     case Op_SubVL:
4670       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4671     case Op_SubVF:
4672       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4673     case Op_SubVD:
4674       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4675     case Op_MulVS:
4676       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4677     case Op_MulVI:
4678       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4679     case Op_MulVL:
4680       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4681     case Op_MulVF:
4682       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4683     case Op_MulVD:
4684       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4685     case Op_DivVF:
4686       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4687     case Op_DivVD:
4688       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4689     case Op_SqrtVF:
4690       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4691     case Op_SqrtVD:
4692       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4693     case Op_AbsVB:
4694       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4695     case Op_AbsVS:
4696       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4697     case Op_AbsVI:
4698       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4699     case Op_AbsVL:
4700       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4701     case Op_FmaVF:
4702       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4703     case Op_FmaVD:
4704       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4705     case Op_VectorRearrange:
4706       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4707     case Op_LShiftVS:
4708       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4709     case Op_LShiftVI:
4710       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4711     case Op_LShiftVL:
4712       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4713     case Op_RShiftVS:
4714       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4715     case Op_RShiftVI:
4716       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4717     case Op_RShiftVL:
4718       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4719     case Op_URShiftVS:
4720       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4721     case Op_URShiftVI:
4722       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4723     case Op_URShiftVL:
4724       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4725     case Op_RotateLeftV:
4726       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4727     case Op_RotateRightV:
4728       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4729     case Op_MaxV:
4730       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4731     case Op_MinV:
4732       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4733     case Op_XorV:
4734       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4735     case Op_OrV:
4736       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4737     case Op_AndV:
4738       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4739     default:
4740       fatal("Unsupported masked operation"); break;
4741   }
4742 }
4743 
4744 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4745                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4746   switch (ideal_opc) {
4747     case Op_AddVB:
4748       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_AddVS:
4750       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_AddVI:
4752       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_AddVL:
4754       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_AddVF:
4756       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_AddVD:
4758       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_SubVB:
4760       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_SubVS:
4762       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_SubVI:
4764       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_SubVL:
4766       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_SubVF:
4768       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_SubVD:
4770       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_MulVS:
4772       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_MulVI:
4774       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_MulVL:
4776       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_MulVF:
4778       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_MulVD:
4780       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4781     case Op_DivVF:
4782       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4783     case Op_DivVD:
4784       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4785     case Op_FmaVF:
4786       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_FmaVD:
4788       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_MaxV:
4790       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_MinV:
4792       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_XorV:
4794       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_OrV:
4796       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_AndV:
4798       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4799     default:
4800       fatal("Unsupported masked operation"); break;
4801   }
4802 }
4803 
4804 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4805                                   KRegister src1, KRegister src2) {
4806   BasicType etype = T_ILLEGAL;
4807   switch(mask_len) {
4808     case 2:
4809     case 4:
4810     case 8:  etype = T_BYTE; break;
4811     case 16: etype = T_SHORT; break;
4812     case 32: etype = T_INT; break;
4813     case 64: etype = T_LONG; break;
4814     default: fatal("Unsupported type"); break;
4815   }
4816   assert(etype != T_ILLEGAL, "");
4817   switch(ideal_opc) {
4818     case Op_AndVMask:
4819       kand(etype, dst, src1, src2); break;
4820     case Op_OrVMask:
4821       kor(etype, dst, src1, src2); break;
4822     case Op_XorVMask:
4823       kxor(etype, dst, src1, src2); break;
4824     default:
4825       fatal("Unsupported masked operation"); break;
4826   }
4827 }
4828 
4829 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4831  * If src is NaN, the result is 0.
4832  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4833  * the result is equal to the value of Integer.MIN_VALUE.
4834  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4835  * the result is equal to the value of Integer.MAX_VALUE.
4836  */
4837 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4838                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4839                                                                    Register rscratch, AddressLiteral float_sign_flip,
4840                                                                    int vec_enc) {
4841   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4842   Label done;
4843   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4844   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4845   vptest(xtmp2, xtmp2, vec_enc);
4846   jccb(Assembler::equal, done);
4847 
4848   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4849   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4850 
4851   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4852   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4853   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4854 
4855   // Recompute the mask for remaining special value.
4856   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4857   // Extract SRC values corresponding to TRUE mask lanes.
4858   vpand(xtmp4, xtmp2, src, vec_enc);
4859   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4860   // values are set.
4861   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4862 
4863   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4864   bind(done);
4865 }
4866 
4867 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4868                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4869                                                                     Register rscratch, AddressLiteral float_sign_flip,
4870                                                                     int vec_enc) {
4871   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4872   Label done;
4873   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4874   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4875   kortestwl(ktmp1, ktmp1);
4876   jccb(Assembler::equal, done);
4877 
4878   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4879   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4880   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4881 
4882   kxorwl(ktmp1, ktmp1, ktmp2);
4883   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4884   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4885   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4886   bind(done);
4887 }
4888 
4889 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4890                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4891                                                                      Register rscratch, AddressLiteral double_sign_flip,
4892                                                                      int vec_enc) {
4893   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4894 
4895   Label done;
4896   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4897   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4898   kortestwl(ktmp1, ktmp1);
4899   jccb(Assembler::equal, done);
4900 
4901   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4902   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4903   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4904 
4905   kxorwl(ktmp1, ktmp1, ktmp2);
4906   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4907   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4908   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4909   bind(done);
4910 }
4911 
4912 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4913                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4914                                                                      Register rscratch, AddressLiteral float_sign_flip,
4915                                                                      int vec_enc) {
4916   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4917   Label done;
4918   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4919   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4920   kortestwl(ktmp1, ktmp1);
4921   jccb(Assembler::equal, done);
4922 
4923   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4924   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4925   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4926 
4927   kxorwl(ktmp1, ktmp1, ktmp2);
4928   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4929   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4930   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4931   bind(done);
4932 }
4933 
4934 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4936  * If src is NaN, the result is 0.
4937  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4938  * the result is equal to the value of Long.MIN_VALUE.
4939  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4940  * the result is equal to the value of Long.MAX_VALUE.
4941  */
4942 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4943                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4944                                                                       Register rscratch, AddressLiteral double_sign_flip,
4945                                                                       int vec_enc) {
4946   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4947 
4948   Label done;
4949   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4950   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4951   kortestwl(ktmp1, ktmp1);
4952   jccb(Assembler::equal, done);
4953 
4954   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4955   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4956   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4957 
4958   kxorwl(ktmp1, ktmp1, ktmp2);
4959   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4960   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4961   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4962   bind(done);
4963 }
4964 
4965 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4966                                                              XMMRegister xtmp, int index, int vec_enc) {
4967    assert(vec_enc < Assembler::AVX_512bit, "");
4968    if (vec_enc == Assembler::AVX_256bit) {
4969      vextractf128_high(xtmp, src);
4970      vshufps(dst, src, xtmp, index, vec_enc);
4971    } else {
4972      vshufps(dst, src, zero, index, vec_enc);
4973    }
4974 }
4975 
4976 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4977                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4978                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4979   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4980 
4981   Label done;
4982   // Compare the destination lanes with float_sign_flip
4983   // value to get mask for all special values.
4984   movdqu(xtmp1, float_sign_flip, rscratch);
4985   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4986   ptest(xtmp2, xtmp2);
4987   jccb(Assembler::equal, done);
4988 
4989   // Flip float_sign_flip to get max integer value.
4990   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4991   pxor(xtmp1, xtmp4);
4992 
  // Set destination lanes corresponding to unordered source lanes to zero.
4994   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4995   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4996 
  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
4998   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4999   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5000 
5001   // Recompute the mask for remaining special value.
5002   pxor(xtmp2, xtmp3);
5003   // Extract mask corresponding to non-negative source lanes.
5004   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5005 
5006   // Shuffle mask vector and pack lower doubles word from each quadword lane.
5007   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5008   pand(xtmp3, xtmp2);
5009 
5010   // Replace destination lanes holding special value(0x80000000) with max int
5011   // if corresponding source lane holds a +ve value.
5012   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5013   bind(done);
5014 }
5015 
5016 
5017 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5018                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5019   switch(to_elem_bt) {
5020     case T_SHORT:
5021       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5022       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5023       vpackusdw(dst, dst, zero, vec_enc);
5024       if (vec_enc == Assembler::AVX_256bit) {
5025         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5026       }
5027       break;
5028     case  T_BYTE:
5029       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5030       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5031       vpackusdw(dst, dst, zero, vec_enc);
5032       if (vec_enc == Assembler::AVX_256bit) {
5033         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5034       }
5035       vpackuswb(dst, dst, zero, vec_enc);
5036       break;
5037     default: assert(false, "%s", type2name(to_elem_bt));
5038   }
5039 }
5040 
5041 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value
 *    0x80000000, which would signify that the source value could be one of the
 *    special floating point values (NaN, -Inf, Inf, Max, -Min).
5047  * c) Set destination to zero if source is NaN value.
5048  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5049  */
5050 
5051 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5052                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5053                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5054   int to_elem_sz = type2aelembytes(to_elem_bt);
5055   assert(to_elem_sz <= 4, "");
5056   vcvttps2dq(dst, src, vec_enc);
5057   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5058   if (to_elem_sz < 4) {
5059     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5060     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5061   }
5062 }
5063 
5064 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5065                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5066                                             Register rscratch, int vec_enc) {
5067   int to_elem_sz = type2aelembytes(to_elem_bt);
5068   assert(to_elem_sz <= 4, "");
5069   vcvttps2dq(dst, src, vec_enc);
5070   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5071   switch(to_elem_bt) {
5072     case T_INT:
5073       break;
5074     case T_SHORT:
5075       evpmovdw(dst, dst, vec_enc);
5076       break;
5077     case T_BYTE:
5078       evpmovdb(dst, dst, vec_enc);
5079       break;
5080     default: assert(false, "%s", type2name(to_elem_bt));
5081   }
5082 }
5083 
5084 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5085                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5086                                             Register rscratch, int vec_enc) {
5087   evcvttps2qq(dst, src, vec_enc);
5088   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5089 }
5090 
5091 // Handling for downcasting from double to integer or sub-word types on AVX2.
5092 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5093                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5094                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5095   int to_elem_sz = type2aelembytes(to_elem_bt);
5096   assert(to_elem_sz < 8, "");
5097   vcvttpd2dq(dst, src, vec_enc);
5098   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5099                                               float_sign_flip, vec_enc);
5100   if (to_elem_sz < 4) {
5101     // xtmp4 holds all zero lanes.
5102     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5103   }
5104 }
5105 
5106 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5107                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5108                                             KRegister ktmp2, AddressLiteral sign_flip,
5109                                             Register rscratch, int vec_enc) {
5110   if (VM_Version::supports_avx512dq()) {
5111     evcvttpd2qq(dst, src, vec_enc);
5112     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5113     switch(to_elem_bt) {
5114       case T_LONG:
5115         break;
5116       case T_INT:
5117         evpmovsqd(dst, dst, vec_enc);
5118         break;
5119       case T_SHORT:
5120         evpmovsqd(dst, dst, vec_enc);
5121         evpmovdw(dst, dst, vec_enc);
5122         break;
5123       case T_BYTE:
5124         evpmovsqd(dst, dst, vec_enc);
5125         evpmovdb(dst, dst, vec_enc);
5126         break;
5127       default: assert(false, "%s", type2name(to_elem_bt));
5128     }
5129   } else {
5130     assert(type2aelembytes(to_elem_bt) <= 4, "");
5131     vcvttpd2dq(dst, src, vec_enc);
5132     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5133     switch(to_elem_bt) {
5134       case T_INT:
5135         break;
5136       case T_SHORT:
5137         evpmovdw(dst, dst, vec_enc);
5138         break;
5139       case T_BYTE:
5140         evpmovdb(dst, dst, vec_enc);
5141         break;
5142       default: assert(false, "%s", type2name(to_elem_bt));
5143     }
5144   }
5145 }
5146 
5147 #ifdef _LP64
5148 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5149                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5150                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round
  // towards -inf, and restore the original MXCSR.RC mode afterwards.
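  // For example, 2.5 + 0.5 = 3.0 and -2.5 + 0.5 = -2.0; converting these under
  // round-towards -inf (i.e. floor) yields 3 and -2, matching Math.round's
  // floor(x + 0.5) semantics.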
5153   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5154 
5155   mov64(tmp, julong_cast(0.5L));
5156   evpbroadcastq(xtmp1, tmp, vec_enc);
5157   vaddpd(xtmp1, src , xtmp1, vec_enc);
5158   evcvtpd2qq(dst, xtmp1, vec_enc);
5159   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5161 
5162   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5163 }
5164 
5165 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5166                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5167                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round
  // towards -inf, and restore the original MXCSR.RC mode afterwards.
5170   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5171 
5172   movl(tmp, jint_cast(0.5));
5173   movq(xtmp1, tmp);
5174   vbroadcastss(xtmp1, xtmp1, vec_enc);
5175   vaddps(xtmp1, src , xtmp1, vec_enc);
5176   vcvtps2dq(dst, xtmp1, vec_enc);
5177   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5178                                               float_sign_flip, vec_enc);
5179 
5180   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5181 }
5182 
5183 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5184                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5185                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round
  // towards -inf, and restore the original MXCSR.RC mode afterwards.
5188   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5189 
5190   movl(tmp, jint_cast(0.5));
5191   movq(xtmp1, tmp);
5192   vbroadcastss(xtmp1, xtmp1, vec_enc);
5193   vaddps(xtmp1, src , xtmp1, vec_enc);
5194   vcvtps2dq(dst, xtmp1, vec_enc);
5195   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5196 
5197   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5198 }
5199 #endif // _LP64
5200 
5201 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5202                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5203   switch (from_elem_bt) {
5204     case T_BYTE:
5205       switch (to_elem_bt) {
5206         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5207         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5208         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5209         default: ShouldNotReachHere();
5210       }
5211       break;
5212     case T_SHORT:
5213       switch (to_elem_bt) {
5214         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5215         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5216         default: ShouldNotReachHere();
5217       }
5218       break;
5219     case T_INT:
5220       assert(to_elem_bt == T_LONG, "");
5221       vpmovzxdq(dst, src, vlen_enc);
5222       break;
5223     default:
5224       ShouldNotReachHere();
5225   }
5226 }
5227 
5228 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5229                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5230   switch (from_elem_bt) {
5231     case T_BYTE:
5232       switch (to_elem_bt) {
5233         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5234         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5235         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5236         default: ShouldNotReachHere();
5237       }
5238       break;
5239     case T_SHORT:
5240       switch (to_elem_bt) {
5241         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5242         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5243         default: ShouldNotReachHere();
5244       }
5245       break;
5246     case T_INT:
5247       assert(to_elem_bt == T_LONG, "");
5248       vpmovsxdq(dst, src, vlen_enc);
5249       break;
5250     default:
5251       ShouldNotReachHere();
5252   }
5253 }
5254 
5255 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5256                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5257   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5258   assert(vlen_enc != AVX_512bit, "");
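  // Mask lanes are either all-zeros or all-ones, so the signed saturating packs
  // and sign-extending moves used below preserve the 0/-1 lane pattern while
  // changing the lane width.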
5259 
5260   int dst_bt_size = type2aelembytes(dst_bt);
5261   int src_bt_size = type2aelembytes(src_bt);
5262   if (dst_bt_size > src_bt_size) {
5263     switch (dst_bt_size / src_bt_size) {
5264       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5265       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5266       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5267       default: ShouldNotReachHere();
5268     }
5269   } else {
5270     assert(dst_bt_size < src_bt_size, "");
5271     switch (src_bt_size / dst_bt_size) {
5272       case 2: {
5273         if (vlen_enc == AVX_128bit) {
5274           vpacksswb(dst, src, src, vlen_enc);
5275         } else {
5276           vpacksswb(dst, src, src, vlen_enc);
5277           vpermq(dst, dst, 0x08, vlen_enc);
5278         }
5279         break;
5280       }
5281       case 4: {
5282         if (vlen_enc == AVX_128bit) {
5283           vpackssdw(dst, src, src, vlen_enc);
5284           vpacksswb(dst, dst, dst, vlen_enc);
5285         } else {
5286           vpackssdw(dst, src, src, vlen_enc);
5287           vpermq(dst, dst, 0x08, vlen_enc);
5288           vpacksswb(dst, dst, dst, AVX_128bit);
5289         }
5290         break;
5291       }
5292       case 8: {
5293         if (vlen_enc == AVX_128bit) {
5294           vpshufd(dst, src, 0x08, vlen_enc);
5295           vpackssdw(dst, dst, dst, vlen_enc);
5296           vpacksswb(dst, dst, dst, vlen_enc);
5297         } else {
5298           vpshufd(dst, src, 0x08, vlen_enc);
5299           vpermq(dst, dst, 0x08, vlen_enc);
5300           vpackssdw(dst, dst, dst, AVX_128bit);
5301           vpacksswb(dst, dst, dst, AVX_128bit);
5302         }
5303         break;
5304       }
5305       default: ShouldNotReachHere();
5306     }
5307   }
5308 }
5309 
5310 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5311                                    bool merge, BasicType bt, int vlen_enc) {
5312   if (bt == T_INT) {
5313     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5314   } else {
5315     assert(bt == T_LONG, "");
5316     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5317   }
5318 }
5319 
5320 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5321                                    bool merge, BasicType bt, int vlen_enc) {
5322   if (bt == T_INT) {
5323     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5324   } else {
5325     assert(bt == T_LONG, "");
5326     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5327   }
5328 }
5329 
5330 #ifdef _LP64
5331 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5332                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5333                                                int vec_enc) {
5334   int index = 0;
5335   int vindex = 0;
5336   mov64(rtmp1, 0x0101010101010101L);
5337   pdepq(rtmp1, src, rtmp1);
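  // Illustration: with mask bits 0b1011 in src, pdepq deposits one source bit
  // into the lowest bit of each byte lane of the 0x0101... pattern, yielding
  // the byte sequence 01 01 00 01 ... from the least significant byte, i.e.
  // one 0/1 byte per mask bit.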
5338   if (mask_len > 8) {
5339     movq(rtmp2, src);
5340     vpxor(xtmp, xtmp, xtmp, vec_enc);
5341     movq(xtmp, rtmp1);
5342   }
5343   movq(dst, rtmp1);
5344 
5345   mask_len -= 8;
5346   while (mask_len > 0) {
5347     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5348     index++;
5349     if ((index % 2) == 0) {
5350       pxor(xtmp, xtmp);
5351     }
5352     mov64(rtmp1, 0x0101010101010101L);
5353     shrq(rtmp2, 8);
5354     pdepq(rtmp1, rtmp2, rtmp1);
5355     pinsrq(xtmp, rtmp1, index % 2);
5356     vindex = index / 2;
5357     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes have been updated, to save redundant instructions.
5360       if (index % 2) {
5361         vinsertf128(dst, dst, xtmp, vindex);
5362       }
5363     } else {
5364       vmovdqu(dst, xtmp);
5365     }
5366     mask_len -= 8;
5367   }
5368 }
5369 
5370 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5371   switch(opc) {
5372     case Op_VectorMaskTrueCount:
5373       popcntq(dst, tmp);
5374       break;
5375     case Op_VectorMaskLastTrue:
5376       if (VM_Version::supports_lzcnt()) {
5377         lzcntq(tmp, tmp);
5378         movl(dst, 63);
5379         subl(dst, tmp);
5380       } else {
5381         movl(dst, -1);
5382         bsrq(tmp, tmp);
5383         cmov32(Assembler::notZero, dst, tmp);
5384       }
5385       break;
5386     case Op_VectorMaskFirstTrue:
5387       if (VM_Version::supports_bmi1()) {
5388         if (masklen < 32) {
5389           orl(tmp, 1 << masklen);
5390           tzcntl(dst, tmp);
5391         } else if (masklen == 32) {
5392           tzcntl(dst, tmp);
5393         } else {
5394           assert(masklen == 64, "");
5395           tzcntq(dst, tmp);
5396         }
5397       } else {
5398         if (masklen < 32) {
5399           orl(tmp, 1 << masklen);
5400           bsfl(dst, tmp);
5401         } else {
5402           assert(masklen == 32 || masklen == 64, "");
5403           movl(dst, masklen);
5404           if (masklen == 32)  {
5405             bsfl(tmp, tmp);
5406           } else {
5407             bsfq(tmp, tmp);
5408           }
5409           cmov32(Assembler::notZero, dst, tmp);
5410         }
5411       }
5412       break;
5413     case Op_VectorMaskToLong:
5414       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5415       break;
5416     default: assert(false, "Unhandled mask operation");
5417   }
5418 }
5419 
5420 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5421                                               int masklen, int masksize, int vec_enc) {
5422   assert(VM_Version::supports_popcnt(), "");
5423 
5424   if(VM_Version::supports_avx512bw()) {
5425     kmovql(tmp, mask);
5426   } else {
5427     assert(masklen <= 16, "");
5428     kmovwl(tmp, mask);
5429   }
5430 
  // Masks generated by partial vector comparison/replicate/mask-manipulation
  // operations need to be clipped.
5433   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5434     andq(tmp, (1 << masklen) - 1);
5435   }
5436 
5437   vector_mask_operation_helper(opc, dst, tmp, masklen);
5438 }
5439 
5440 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5441                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5442   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5443          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5444   assert(VM_Version::supports_popcnt(), "");
5445 
5446   bool need_clip = false;
5447   switch(bt) {
5448     case T_BOOLEAN:
      // While masks of other types contain 0 or -1 per lane, boolean masks contain lane values of 0 or 1
5450       vpxor(xtmp, xtmp, xtmp, vec_enc);
5451       vpsubb(xtmp, xtmp, mask, vec_enc);
5452       vpmovmskb(tmp, xtmp, vec_enc);
5453       need_clip = masklen < 16;
5454       break;
5455     case T_BYTE:
5456       vpmovmskb(tmp, mask, vec_enc);
5457       need_clip = masklen < 16;
5458       break;
5459     case T_SHORT:
5460       vpacksswb(xtmp, mask, mask, vec_enc);
5461       if (masklen >= 16) {
5462         vpermpd(xtmp, xtmp, 8, vec_enc);
5463       }
5464       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5465       need_clip = masklen < 16;
5466       break;
5467     case T_INT:
5468     case T_FLOAT:
5469       vmovmskps(tmp, mask, vec_enc);
5470       need_clip = masklen < 4;
5471       break;
5472     case T_LONG:
5473     case T_DOUBLE:
5474       vmovmskpd(tmp, mask, vec_enc);
5475       need_clip = masklen < 2;
5476       break;
5477     default: assert(false, "Unhandled type, %s", type2name(bt));
5478   }
5479 
5480   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5481   // operations needs to be clipped.
5482   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5483     // need_clip implies masklen < 32
5484     andq(tmp, (1 << masklen) - 1);
5485   }
5486 
5487   vector_mask_operation_helper(opc, dst, tmp, masklen);
5488 }
5489 
5490 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5491                                              Register rtmp2, int mask_len) {
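  // A mask "compress" packs the set lanes towards lane 0. Extracting an all-ones source
  // through the (clipped) mask bits with pext yields a value whose low popcount(mask)
  // bits are set, which is exactly the compressed mask.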
5492   kmov(rtmp1, src);
5493   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5494   mov64(rtmp2, -1L);
5495   pextq(rtmp2, rtmp2, rtmp1);
5496   kmov(dst, rtmp2);
5497 }
5498 
5499 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5500                                                     XMMRegister mask, Register rtmp, Register rscratch,
5501                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5502                                                     int vec_enc) {
5503   assert(type2aelembytes(bt) >= 4, "");
5504   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5505   address compress_perm_table = nullptr;
5506   address expand_perm_table = nullptr;
5507   if (type2aelembytes(bt) == 8) {
5508     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5509     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5510     vmovmskpd(rtmp, mask, vec_enc);
5511   } else {
5512     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5513     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5514     vmovmskps(rtmp, mask, vec_enc);
5515   }
5516   shlq(rtmp, 5); // for 32 byte permute row.
5517   if (opcode == Op_CompressV) {
5518     lea(rscratch, ExternalAddress(compress_perm_table));
5519   } else {
5520     lea(rscratch, ExternalAddress(expand_perm_table));
5521   }
5522   addptr(rtmp, rscratch);
5523   vmovdqu(permv, Address(rtmp));
5524   vpermps(dst, permv, src, Assembler::AVX_256bit);
5525   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask. Each column entry
  // in a permute table row contains either a valid permute index or -1 (the default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
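  // For example (illustrative only): with eight 32 bit lanes and a mask of 0b00100110,
  // the compress table row would hold [1, 2, 5, -1, -1, -1, -1, -1], gathering lanes 1, 2
  // and 5 to the front while the -1 columns select zero through the blend below.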
5530   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5531 }
5532 
5533 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5534                                                bool merge, BasicType bt, int vec_enc) {
5535   if (opcode == Op_CompressV) {
5536     switch(bt) {
5537     case T_BYTE:
5538       evpcompressb(dst, mask, src, merge, vec_enc);
5539       break;
5540     case T_CHAR:
5541     case T_SHORT:
5542       evpcompressw(dst, mask, src, merge, vec_enc);
5543       break;
5544     case T_INT:
5545       evpcompressd(dst, mask, src, merge, vec_enc);
5546       break;
5547     case T_FLOAT:
5548       evcompressps(dst, mask, src, merge, vec_enc);
5549       break;
5550     case T_LONG:
5551       evpcompressq(dst, mask, src, merge, vec_enc);
5552       break;
5553     case T_DOUBLE:
5554       evcompresspd(dst, mask, src, merge, vec_enc);
5555       break;
5556     default:
5557       fatal("Unsupported type %s", type2name(bt));
5558       break;
5559     }
5560   } else {
5561     assert(opcode == Op_ExpandV, "");
5562     switch(bt) {
5563     case T_BYTE:
5564       evpexpandb(dst, mask, src, merge, vec_enc);
5565       break;
5566     case T_CHAR:
5567     case T_SHORT:
5568       evpexpandw(dst, mask, src, merge, vec_enc);
5569       break;
5570     case T_INT:
5571       evpexpandd(dst, mask, src, merge, vec_enc);
5572       break;
5573     case T_FLOAT:
5574       evexpandps(dst, mask, src, merge, vec_enc);
5575       break;
5576     case T_LONG:
5577       evpexpandq(dst, mask, src, merge, vec_enc);
5578       break;
5579     case T_DOUBLE:
5580       evexpandpd(dst, mask, src, merge, vec_enc);
5581       break;
5582     default:
5583       fatal("Unsupported type %s", type2name(bt));
5584       break;
5585     }
5586   }
5587 }
5588 #endif
5589 
5590 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5591                                            KRegister ktmp1, int vec_enc) {
5592   if (opcode == Op_SignumVD) {
5593     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5595     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5596     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5598     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5599     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5600   } else {
5601     assert(opcode == Op_SignumVF, "");
5602     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5604     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5605     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5607     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5608     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5609   }
5610 }
5611 
5612 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5613                                           XMMRegister xtmp1, int vec_enc) {
5614   if (opcode == Op_SignumVD) {
5615     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5617     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5619     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5620     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5621   } else {
5622     assert(opcode == Op_SignumVF, "");
5623     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5625     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5627     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5628     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5629   }
5630 }
5631 
5632 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5633   if (VM_Version::supports_avx512bw()) {
5634     if (mask_len > 32) {
5635       kmovql(dst, src);
5636     } else {
5637       kmovdl(dst, src);
5638       if (mask_len != 32) {
5639         kshiftrdl(dst, dst, 32 - mask_len);
5640       }
5641     }
5642   } else {
5643     assert(mask_len <= 16, "");
5644     kmovwl(dst, src);
5645     if (mask_len != 16) {
5646       kshiftrwl(dst, dst, 16 - mask_len);
5647     }
5648   }
5649 }
5650 
5651 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5652   int lane_size = type2aelembytes(bt);
5653   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5654   if ((is_LP64 || lane_size < 8) &&
5655       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5656        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5657     movptr(rtmp, imm32);
5658     switch(lane_size) {
5659       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5660       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5661       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5662       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5665     }
5666   } else {
5667     movptr(rtmp, imm32);
5668     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5669     switch(lane_size) {
5670       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5671       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5672       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5673       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5676     }
5677   }
5678 }
5679 
5680 //
5681 // Following is lookup table based popcount computation algorithm:-
5682 //       Index   Bit set count
5683 //     [ 0000 ->   0,
5684 //       0001 ->   1,
5685 //       0010 ->   1,
5686 //       0011 ->   2,
5687 //       0100 ->   1,
5688 //       0101 ->   2,
5689 //       0110 ->   2,
5690 //       0111 ->   3,
5691 //       1000 ->   1,
5692 //       1001 ->   2,
//       1010 ->   2,
5694 //       1011 ->   3,
5695 //       1100 ->   2,
5696 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5698 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5699 //     shuffle indices for lookup table access.
5700 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5702 //     shuffle indices for lookup table access.
5703 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5704 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5705 //     count of all the bytes of a quadword.
5706 //  f. Perform step e. for upper 128bit vector lane.
5707 //  g. Pack the bitset count of quadwords back to double word.
5708 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
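//
// As an illustrative example, for the byte value 0xB5 (1011 0101) the lower nibble 0101
// looks up a count of 2, the upper nibble 1011 looks up a count of 3, and step d. adds
// them to give popcount(0xB5) = 5.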
5709 
5710 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5711                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5712   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5713   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5714   vpsrlw(dst, src, 4, vec_enc);
5715   vpand(dst, dst, xtmp1, vec_enc);
5716   vpand(xtmp1, src, xtmp1, vec_enc);
5717   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5718   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5719   vpshufb(dst, xtmp2, dst, vec_enc);
5720   vpaddb(dst, dst, xtmp1, vec_enc);
5721 }
5722 
5723 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5724                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5725   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5726   // Following code is as per steps e,f,g and h of above algorithm.
5727   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5728   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5729   vpsadbw(dst, dst, xtmp2, vec_enc);
5730   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5731   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5732   vpackuswb(dst, xtmp1, dst, vec_enc);
5733 }
5734 
5735 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5736                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5737   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5738   // Add the popcount of upper and lower bytes of word.
5739   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5740   vpsrlw(dst, xtmp1, 8, vec_enc);
5741   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5742   vpaddw(dst, dst, xtmp1, vec_enc);
5743 }
5744 
5745 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5746                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5747   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5748   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5749   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5750 }
5751 
5752 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5753                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5754   switch(bt) {
5755     case T_LONG:
5756       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5757       break;
5758     case T_INT:
5759       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5760       break;
5761     case T_CHAR:
5762     case T_SHORT:
5763       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5764       break;
5765     case T_BYTE:
5766     case T_BOOLEAN:
5767       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5768       break;
5769     default:
5770       fatal("Unsupported type %s", type2name(bt));
5771       break;
5772   }
5773 }
5774 
5775 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5776                                                       KRegister mask, bool merge, int vec_enc) {
5777   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5778   switch(bt) {
5779     case T_LONG:
5780       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5781       evpopcntq(dst, mask, src, merge, vec_enc);
5782       break;
5783     case T_INT:
5784       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5785       evpopcntd(dst, mask, src, merge, vec_enc);
5786       break;
5787     case T_CHAR:
5788     case T_SHORT:
5789       assert(VM_Version::supports_avx512_bitalg(), "");
5790       evpopcntw(dst, mask, src, merge, vec_enc);
5791       break;
5792     case T_BYTE:
5793     case T_BOOLEAN:
5794       assert(VM_Version::supports_avx512_bitalg(), "");
5795       evpopcntb(dst, mask, src, merge, vec_enc);
5796       break;
5797     default:
5798       fatal("Unsupported type %s", type2name(bt));
5799       break;
5800   }
5801 }
5802 
5803 #ifndef _LP64
5804 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5805   assert(VM_Version::supports_avx512bw(), "");
5806   kmovdl(tmp, src);
5807   kunpckdql(dst, tmp, tmp);
5808 }
5809 #endif
5810 
5811 // Bit reversal algorithm first reverses the bits of each byte followed by
5812 // a byte level reversal for multi-byte primitive types (short/int/long).
5813 // Algorithm performs a lookup table access to get reverse bit sequence
5814 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5815 // is obtained by swapping the reverse bit sequences of upper and lower
5816 // nibble of a byte.
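// As an illustrative example, for the byte 0xB1 (1011 0001) the lookup table maps the
// lower nibble 0001 to 1000 and the upper nibble 1011 to 1101; placing the reversed
// lower nibble in the upper half and the reversed upper nibble in the lower half yields
// 1000 1101, i.e. reverse(0xB1) = 0x8D.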
5817 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5818                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5819   if (VM_Version::supports_avx512vlbw()) {
5820 
5821     // Get the reverse bit sequence of lower nibble of each byte.
5822     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5823     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5824     evpandq(dst, xtmp2, src, vec_enc);
5825     vpshufb(dst, xtmp1, dst, vec_enc);
5826     vpsllq(dst, dst, 4, vec_enc);
5827 
5828     // Get the reverse bit sequence of upper nibble of each byte.
5829     vpandn(xtmp2, xtmp2, src, vec_enc);
5830     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5831     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5832 
5833     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5834     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5835     evporq(xtmp2, dst, xtmp2, vec_enc);
5836     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5837 
5838   } else if(vec_enc == Assembler::AVX_512bit) {
5839     // Shift based bit reversal.
5840     assert(bt == T_LONG || bt == T_INT, "");
5841 
5842     // Swap lower and upper nibble of each byte.
5843     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5844 
5845     // Swap two least and most significant bits of each nibble.
5846     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5847 
5848     // Swap adjacent pair of bits.
5849     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5850     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5851 
5852     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5853     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5854   } else {
5855     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5856     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5857 
5858     // Get the reverse bit sequence of lower nibble of each byte.
5859     vpand(dst, xtmp2, src, vec_enc);
5860     vpshufb(dst, xtmp1, dst, vec_enc);
5861     vpsllq(dst, dst, 4, vec_enc);
5862 
5863     // Get the reverse bit sequence of upper nibble of each byte.
5864     vpandn(xtmp2, xtmp2, src, vec_enc);
5865     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5866     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5867 
5868     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5869     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5870     vpor(xtmp2, dst, xtmp2, vec_enc);
5871     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5872   }
5873 }
5874 
5875 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5876                                                 XMMRegister xtmp, Register rscratch) {
5877   assert(VM_Version::supports_gfni(), "");
5878   assert(rscratch != noreg || always_reachable(mask), "missing");
5879 
5880   // Galois field instruction based bit reversal based on following algorithm.
5881   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5882   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5883   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5884   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5885 }
5886 
5887 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5888                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
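  // dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits), i.e. the bit groups
  // selected by bitmask move up by nbits and the remaining bits move down by nbits,
  // swapping adjacent nbits-wide fields.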
5889   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5890   evpandq(dst, xtmp1, src, vec_enc);
5891   vpsllq(dst, dst, nbits, vec_enc);
5892   vpandn(xtmp1, xtmp1, src, vec_enc);
5893   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5894   evporq(dst, dst, xtmp1, vec_enc);
5895 }
5896 
5897 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5898                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5899   // Shift based bit reversal.
5900   assert(VM_Version::supports_evex(), "");
5901   switch(bt) {
5902     case T_LONG:
5903       // Swap upper and lower double word of each quad word.
5904       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5905       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5906       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5907       break;
5908     case T_INT:
5909       // Swap upper and lower word of each double word.
5910       evprord(xtmp1, k0, src, 16, true, vec_enc);
5911       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5912       break;
5913     case T_CHAR:
5914     case T_SHORT:
5915       // Swap upper and lower byte of each word.
5916       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5917       break;
5918     case T_BYTE:
5919       evmovdquq(dst, k0, src, true, vec_enc);
5920       break;
5921     default:
5922       fatal("Unsupported type %s", type2name(bt));
5923       break;
5924   }
5925 }
5926 
5927 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5928   if (bt == T_BYTE) {
5929     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5930       evmovdquq(dst, k0, src, true, vec_enc);
5931     } else {
5932       vmovdqu(dst, src);
5933     }
5934     return;
5935   }
5936   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5937   // pre-computed shuffle indices.
5938   switch(bt) {
5939     case T_LONG:
5940       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5941       break;
5942     case T_INT:
5943       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5944       break;
5945     case T_CHAR:
5946     case T_SHORT:
5947       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5948       break;
5949     default:
5950       fatal("Unsupported type %s", type2name(bt));
5951       break;
5952   }
5953   vpshufb(dst, src, dst, vec_enc);
5954 }
5955 
5956 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5957                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5958                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5959   assert(is_integral_type(bt), "");
5960   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5961   assert(VM_Version::supports_avx512cd(), "");
5962   switch(bt) {
5963     case T_LONG:
5964       evplzcntq(dst, ktmp, src, merge, vec_enc);
5965       break;
5966     case T_INT:
5967       evplzcntd(dst, ktmp, src, merge, vec_enc);
5968       break;
5969     case T_SHORT:
5970       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5971       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5972       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5973       vpunpckhwd(dst, xtmp1, src, vec_enc);
5974       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5975       vpackusdw(dst, xtmp2, dst, vec_enc);
5976       break;
5977     case T_BYTE:
5978       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5979       // accessing the lookup table.
5980       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5981       // accessing the lookup table.
5982       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5983       assert(VM_Version::supports_avx512bw(), "");
5984       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5985       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5986       vpand(xtmp2, dst, src, vec_enc);
5987       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5988       vpsrlw(xtmp3, src, 4, vec_enc);
5989       vpand(xtmp3, dst, xtmp3, vec_enc);
5990       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5991       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5992       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5993       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5994       break;
5995     default:
5996       fatal("Unsupported type %s", type2name(bt));
5997       break;
5998   }
5999 }
6000 
6001 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6002                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6003   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6004   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6005   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6006   // accessing the lookup table.
6007   vpand(dst, xtmp2, src, vec_enc);
6008   vpshufb(dst, xtmp1, dst, vec_enc);
6009   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6010   // accessing the lookup table.
6011   vpsrlw(xtmp3, src, 4, vec_enc);
6012   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6013   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6014   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6015   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6016   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6017   vpaddb(dst, dst, xtmp2, vec_enc);
6018   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6019 }
6020 
6021 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6022                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6023   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6024   // Add zero counts of lower byte and upper byte of a word if
6025   // upper byte holds a zero value.
6026   vpsrlw(xtmp3, src, 8, vec_enc);
6027   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6028   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6029   vpsllw(xtmp2, dst, 8, vec_enc);
6030   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6031   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6032   vpsrlw(dst, dst, 8, vec_enc);
6033 }
6034 
6035 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6036                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:-
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
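  // As an illustrative example, a source lane holding 8 converts to a float with biased
  // exponent 130, so 130 - 127 = 3, the +1 adjustment below gives 4, and 32 - 4 = 28,
  // which matches Integer.numberOfLeadingZeros(8).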
6042 
6043   // Broadcast 0xFF
6044   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6045   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6046 
6047   // Extract biased exponent.
6048   vcvtdq2ps(dst, src, vec_enc);
6049   vpsrld(dst, dst, 23, vec_enc);
6050   vpand(dst, dst, xtmp1, vec_enc);
6051 
6052   // Broadcast 127.
6053   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6054   // Exponent = biased_exp - 127
6055   vpsubd(dst, dst, xtmp1, vec_enc);
6056 
6057   // Exponent = Exponent  + 1
6058   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6059   vpaddd(dst, dst, xtmp3, vec_enc);
6060 
6061   // Replace -ve exponent with zero, exponent is -ve when src
6062   // lane contains a zero value.
6063   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6064   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6065 
6066   // Rematerialize broadcast 32.
6067   vpslld(xtmp1, xtmp3, 5, vec_enc);
6068   // Exponent is 32 if corresponding source lane contains max_int value.
6069   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6070   // LZCNT = 32 - exponent
6071   vpsubd(dst, xtmp1, dst, vec_enc);
6072 
6073   // Replace LZCNT with a value 1 if corresponding source lane
6074   // contains max_int value.
6075   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6076 
  // Replace the computed LZCNT with 0 if the source lane value is less than zero.
6078   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6079   vblendvps(dst, dst, xtmp2, src, vec_enc);
6080 }
6081 
6082 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6083                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6084   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6085   // Add zero counts of lower word and upper word of a double word if
6086   // upper word holds a zero value.
6087   vpsrld(xtmp3, src, 16, vec_enc);
6088   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6089   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6090   vpslld(xtmp2, dst, 16, vec_enc);
6091   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6092   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6093   vpsrld(dst, dst, 16, vec_enc);
6094   // Add zero counts of lower doubleword and upper doubleword of a
6095   // quadword if upper doubleword holds a zero value.
6096   vpsrlq(xtmp3, src, 32, vec_enc);
6097   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6098   vpsllq(xtmp2, dst, 32, vec_enc);
6099   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6100   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6101   vpsrlq(dst, dst, 32, vec_enc);
6102 }
6103 
6104 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6105                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6106                                                        Register rtmp, int vec_enc) {
6107   assert(is_integral_type(bt), "unexpected type");
6108   assert(vec_enc < Assembler::AVX_512bit, "");
6109   switch(bt) {
6110     case T_LONG:
6111       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6112       break;
6113     case T_INT:
6114       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6115       break;
6116     case T_SHORT:
6117       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6118       break;
6119     case T_BYTE:
6120       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6121       break;
6122     default:
6123       fatal("Unsupported type %s", type2name(bt));
6124       break;
6125   }
6126 }
6127 
6128 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6129   switch(bt) {
6130     case T_BYTE:
6131       vpsubb(dst, src1, src2, vec_enc);
6132       break;
6133     case T_SHORT:
6134       vpsubw(dst, src1, src2, vec_enc);
6135       break;
6136     case T_INT:
6137       vpsubd(dst, src1, src2, vec_enc);
6138       break;
6139     case T_LONG:
6140       vpsubq(dst, src1, src2, vec_enc);
6141       break;
6142     default:
6143       fatal("Unsupported type %s", type2name(bt));
6144       break;
6145   }
6146 }
6147 
6148 // Trailing zero count computation is based on leading zero count operation as per
6149 // following equation. All AVX3 targets support AVX512CD feature which offers
6150 // direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
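// As an illustrative example, for x = 88 (0b1011000) the value (x - 1) & ~x is 0b111; its
// 32 bit leading zero count is 29, so CTZ = 32 - 29 = 3, the trailing zero count of 88.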
6152 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6153                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6154                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6155   assert(is_integral_type(bt), "");
6156   // xtmp = -1
6157   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6158   // xtmp = xtmp + src
6159   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6160   // xtmp = xtmp & ~src
6161   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6162   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6163   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6164   vpsub(bt, dst, xtmp4, dst, vec_enc);
6165 }
6166 
6167 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
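// As an illustrative example, for x = 40 (0b101000) the value x | -x has every bit from
// bit 3 upwards set, so its 32 bit popcount is 29 and CTZ = 32 - 29 = 3.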
6169 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6170                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6171   assert(is_integral_type(bt), "");
6172   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6174   // xtmp = 0 - src
6175   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6176   // xtmp = xtmp | src
6177   vpor(xtmp3, xtmp3, src, vec_enc);
6178   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6179   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6180   vpsub(bt, dst, xtmp1, dst, vec_enc);
6181 }
6182 
6183 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6184   Label done;
6185   Label neg_divisor_fastpath;
6186   cmpl(divisor, 0);
6187   jccb(Assembler::less, neg_divisor_fastpath);
6188   xorl(rdx, rdx);
6189   divl(divisor);
6190   jmpb(done);
6191   bind(neg_divisor_fastpath);
6192   // Fastpath for divisor < 0:
6193   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6194   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
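  // A divisor that is negative as a signed value is >= 2^31 as an unsigned value, so the
  // unsigned quotient can only be 0 or 1, and it is 1 exactly when dividend >= divisor
  // (unsigned), which the expression above derives from the sign bits. For example,
  // 0xC0000000 / 0x80000001 gives (0xC0000000 & ~0x3FFFFFFF) >>> 31 = 1.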
6195   movl(rdx, rax);
6196   subl(rdx, divisor);
6197   if (VM_Version::supports_bmi1()) {
6198     andnl(rax, rdx, rax);
6199   } else {
6200     notl(rdx);
6201     andl(rax, rdx);
6202   }
6203   shrl(rax, 31);
6204   bind(done);
6205 }
6206 
6207 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6208   Label done;
6209   Label neg_divisor_fastpath;
6210   cmpl(divisor, 0);
6211   jccb(Assembler::less, neg_divisor_fastpath);
6212   xorl(rdx, rdx);
6213   divl(divisor);
6214   jmpb(done);
6215   bind(neg_divisor_fastpath);
6216   // Fastpath when divisor < 0:
6217   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6218   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6219   movl(rdx, rax);
6220   subl(rax, divisor);
6221   if (VM_Version::supports_bmi1()) {
6222     andnl(rax, rax, rdx);
6223   } else {
6224     notl(rax);
6225     andl(rax, rdx);
6226   }
6227   sarl(rax, 31);
6228   andl(rax, divisor);
6229   subl(rdx, rax);
6230   bind(done);
6231 }
6232 
6233 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6234   Label done;
6235   Label neg_divisor_fastpath;
6236 
6237   cmpl(divisor, 0);
6238   jccb(Assembler::less, neg_divisor_fastpath);
6239   xorl(rdx, rdx);
6240   divl(divisor);
6241   jmpb(done);
6242   bind(neg_divisor_fastpath);
6243   // Fastpath for divisor < 0:
6244   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6245   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6246   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6247   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6248   movl(rdx, rax);
6249   subl(rax, divisor);
6250   if (VM_Version::supports_bmi1()) {
6251     andnl(rax, rax, rdx);
6252   } else {
6253     notl(rax);
6254     andl(rax, rdx);
6255   }
6256   movl(tmp, rax);
6257   shrl(rax, 31); // quotient
6258   sarl(tmp, 31);
6259   andl(tmp, divisor);
6260   subl(rdx, tmp); // remainder
6261   bind(done);
6262 }
6263 
6264 #ifdef _LP64
6265 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6266                                  XMMRegister xtmp2, Register rtmp) {
6267   if(VM_Version::supports_gfni()) {
6268     // Galois field instruction based bit reversal based on following algorithm.
6269     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6270     mov64(rtmp, 0x8040201008040201L);
6271     movq(xtmp1, src);
6272     movq(xtmp2, rtmp);
6273     gf2p8affineqb(xtmp1, xtmp2, 0);
6274     movq(dst, xtmp1);
6275   } else {
6276     // Swap even and odd numbered bits.
6277     movl(rtmp, src);
6278     andl(rtmp, 0x55555555);
6279     shll(rtmp, 1);
6280     movl(dst, src);
6281     andl(dst, 0xAAAAAAAA);
6282     shrl(dst, 1);
6283     orl(dst, rtmp);
6284 
6285     // Swap LSB and MSB 2 bits of each nibble.
6286     movl(rtmp, dst);
6287     andl(rtmp, 0x33333333);
6288     shll(rtmp, 2);
6289     andl(dst, 0xCCCCCCCC);
6290     shrl(dst, 2);
6291     orl(dst, rtmp);
6292 
6293     // Swap LSB and MSB 4 bits of each byte.
6294     movl(rtmp, dst);
6295     andl(rtmp, 0x0F0F0F0F);
6296     shll(rtmp, 4);
6297     andl(dst, 0xF0F0F0F0);
6298     shrl(dst, 4);
6299     orl(dst, rtmp);
6300   }
6301   bswapl(dst);
6302 }
6303 
6304 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6305                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6306   if(VM_Version::supports_gfni()) {
6307     // Galois field instruction based bit reversal based on following algorithm.
6308     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6309     mov64(rtmp1, 0x8040201008040201L);
6310     movq(xtmp1, src);
6311     movq(xtmp2, rtmp1);
6312     gf2p8affineqb(xtmp1, xtmp2, 0);
6313     movq(dst, xtmp1);
6314   } else {
6315     // Swap even and odd numbered bits.
6316     movq(rtmp1, src);
6317     mov64(rtmp2, 0x5555555555555555L);
6318     andq(rtmp1, rtmp2);
6319     shlq(rtmp1, 1);
6320     movq(dst, src);
6321     notq(rtmp2);
6322     andq(dst, rtmp2);
6323     shrq(dst, 1);
6324     orq(dst, rtmp1);
6325 
6326     // Swap LSB and MSB 2 bits of each nibble.
6327     movq(rtmp1, dst);
6328     mov64(rtmp2, 0x3333333333333333L);
6329     andq(rtmp1, rtmp2);
6330     shlq(rtmp1, 2);
6331     notq(rtmp2);
6332     andq(dst, rtmp2);
6333     shrq(dst, 2);
6334     orq(dst, rtmp1);
6335 
6336     // Swap LSB and MSB 4 bits of each byte.
6337     movq(rtmp1, dst);
6338     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6339     andq(rtmp1, rtmp2);
6340     shlq(rtmp1, 4);
6341     notq(rtmp2);
6342     andq(dst, rtmp2);
6343     shrq(dst, 4);
6344     orq(dst, rtmp1);
6345   }
6346   bswapq(dst);
6347 }
6348 
6349 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6350   Label done;
6351   Label neg_divisor_fastpath;
6352   cmpq(divisor, 0);
6353   jccb(Assembler::less, neg_divisor_fastpath);
6354   xorl(rdx, rdx);
6355   divq(divisor);
6356   jmpb(done);
6357   bind(neg_divisor_fastpath);
6358   // Fastpath for divisor < 0:
6359   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6360   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6361   movq(rdx, rax);
6362   subq(rdx, divisor);
6363   if (VM_Version::supports_bmi1()) {
6364     andnq(rax, rdx, rax);
6365   } else {
6366     notq(rdx);
6367     andq(rax, rdx);
6368   }
6369   shrq(rax, 63);
6370   bind(done);
6371 }
6372 
6373 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6374   Label done;
6375   Label neg_divisor_fastpath;
6376   cmpq(divisor, 0);
6377   jccb(Assembler::less, neg_divisor_fastpath);
6378   xorq(rdx, rdx);
6379   divq(divisor);
6380   jmp(done);
6381   bind(neg_divisor_fastpath);
6382   // Fastpath when divisor < 0:
6383   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6384   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6385   movq(rdx, rax);
6386   subq(rax, divisor);
6387   if (VM_Version::supports_bmi1()) {
6388     andnq(rax, rax, rdx);
6389   } else {
6390     notq(rax);
6391     andq(rax, rdx);
6392   }
6393   sarq(rax, 63);
6394   andq(rax, divisor);
6395   subq(rdx, rax);
6396   bind(done);
6397 }
6398 
6399 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6400   Label done;
6401   Label neg_divisor_fastpath;
6402   cmpq(divisor, 0);
6403   jccb(Assembler::less, neg_divisor_fastpath);
6404   xorq(rdx, rdx);
6405   divq(divisor);
6406   jmp(done);
6407   bind(neg_divisor_fastpath);
6408   // Fastpath for divisor < 0:
6409   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6410   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6411   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6412   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6413   movq(rdx, rax);
6414   subq(rax, divisor);
6415   if (VM_Version::supports_bmi1()) {
6416     andnq(rax, rax, rdx);
6417   } else {
6418     notq(rax);
6419     andq(rax, rdx);
6420   }
6421   movq(tmp, rax);
6422   shrq(rax, 63); // quotient
6423   sarq(tmp, 63);
6424   andq(tmp, divisor);
6425   subq(rdx, tmp); // remainder
6426   bind(done);
6427 }
6428 #endif
6429 
6430 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6431                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6432                                         int vlen_enc) {
6433   assert(VM_Version::supports_avx512bw(), "");
6434   // Byte shuffles are inlane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. Indices that are equal modulo 16
  // therefore select the same relative position within a 128 bit lane,
  // e.g. shuffle indices 16, 32 and 48 all select element 0 of their
  // respective 128 bit lanes.
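  // As an illustrative example, shuffle index 37 has lower 4 bits 5, so it selects element 5
  // of the third 128 bit lane, which is handled by the third comparison/broadcast step below.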
6440   movl(rtmp, 16);
6441   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6442 
  // Compute a mask for the shuffle vector by comparing indices with the expression
  // INDEX < 16. Broadcast the first 128 bit lane across the entire vector, shuffle
  // the vector lanes using the original shuffle indices and move the shuffled lanes
  // corresponding to a true mask into the destination vector.
6447   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6448   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6449   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6450 
6451   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6452   // and broadcasting second 128 bit lane.
6453   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6454   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6455   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6456   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6457   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6458 
6459   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6460   // and broadcasting third 128 bit lane.
6461   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6462   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6463   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6464   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6465   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6466 
6467   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6469   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6470   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6471   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6472   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6473   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6474 }
6475 
6476 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6477                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6478   if (vlen_enc == AVX_128bit) {
6479     vpermilps(dst, src, shuffle, vlen_enc);
6480   } else if (bt == T_INT) {
6481     vpermd(dst, shuffle, src, vlen_enc);
6482   } else {
6483     assert(bt == T_FLOAT, "");
6484     vpermps(dst, shuffle, src, vlen_enc);
6485   }
6486 }