1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
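  //
  // Resulting frame layout (illustrative; the stack grows toward lower addresses):
  //   [ return address ]   <- rsp on entry
  //   [ saved rbp      ]
  //   [ rest of frame  ]   <- rsp after the prolog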
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  Be careful, though:
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack, but the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
    // Remove word for rbp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
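// Map a vector length in bytes to the AVX length encoding expected by the
// assembler, e.g. 16 bytes -> AVX_128bit, 32 bytes -> AVX_256bit.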
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
 194 // sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross into native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
 228 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
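//
// Illustrative sketch (not the actual emitted code) of how C2 consumes the
// ZF contract described above at a lock site:
//
//    fast_lock(obj, box, tmp, scr, ...)   // sets ZF: 1 = acquired, 0 = contended
//    jne  slow_path                       // ZF == 0 -> call the runtime monitorenter
//    ...                                  // ZF == 1 -> fall through, lock held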
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 281     jcc(Assembler::notZero, DONE_LABEL);
 282   }
 283 
 284   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 285   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 286   jcc(Assembler::notZero, IsInflated);
 287 
 288   if (LockingMode == LM_MONITOR) {
 289     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 290     testptr(objReg, objReg);
 291   } else {
 292     assert(LockingMode == LM_LEGACY, "must be");
 293     // Attempt stack-locking ...
 294     orptr (tmpReg, markWord::unlocked_value);
 295     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 296     lock();
 297     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 298     jcc(Assembler::equal, COUNT);           // Success
 299 
 300     // Recursive locking.
 301     // The object is stack-locked: markword contains stack pointer to BasicLock.
 302     // Locked by current thread if difference with current SP is less than one page.
 303     subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
 305     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 306     movptr(Address(boxReg, 0), tmpReg);
 307   }
 308   jmp(DONE_LABEL);
 309 
 310   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value
 312 
 313 #ifndef _LP64
 314   // The object is inflated.
 315 
 316   // boxReg refers to the on-stack BasicLock in the current frame.
 317   // We'd like to write:
 318   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 320   // additional latency as we have another ST in the store buffer that must drain.
 321 
 322   // avoid ST-before-CAS
 323   // register juggle because we need tmpReg for cmpxchgptr below
 324   movptr(scrReg, boxReg);
 325   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 326 
 327   // Optimistic form: consider XORL tmpReg,tmpReg
 328   movptr(tmpReg, NULL_WORD);
 329 
 330   // Appears unlocked - try to swing _owner from null to non-null.
 331   // Ideally, I'd manifest "Self" with get_thread and then attempt
 332   // to CAS the register containing Self into m->Owner.
 333   // But we don't have enough registers, so instead we can either try to CAS
 334   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 335   // we later store "Self" into m->Owner.  Transiently storing a stack address
 336   // (rsp or the address of the box) into  m->owner is harmless.
 337   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 338   lock();
 339   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 340   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 341   // If we weren't able to swing _owner from null to the BasicLock
 342   // then take the slow path.
 343   jccb  (Assembler::notZero, NO_COUNT);
 344   // update _owner from BasicLock to thread
 345   get_thread (scrReg);                    // beware: clobbers ICCs
 346   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 347   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 348 
 349   // If the CAS fails we can either retry or pass control to the slow path.
 350   // We use the latter tactic.
 351   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 352   // If the CAS was successful ...
 353   //   Self has acquired the lock
 354   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 355   // Intentional fall-through into DONE_LABEL ...
 356 #else // _LP64
 357   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 358   movq(scrReg, tmpReg);
 359   xorq(tmpReg, tmpReg);
 360   lock();
 361   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 362   // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without the cast to int32_t this style of movptr would destroy r10, which is typically obj.
 364   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 365   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 366   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 367 
 368   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 369   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 370   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 371   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 372 #endif // _LP64
 373   bind(DONE_LABEL);
 374 
 375   // ZFlag == 1 count in fast path
 376   // ZFlag == 0 count in slow path
 377   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 378 
 379   bind(COUNT);
 380   // Count monitors in fast path
 381   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 382 
 383   xorl(tmpReg, tmpReg); // Set ZF == 1
 384 
 385   bind(NO_COUNT);
 386 
  // At NO_COUNT the icc ZFlag is set as follows (fast_unlock uses the same protocol):
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
 391 }
 392 
 393 // obj: object to unlock
 394 // box: box address (displaced header location), killed.  Must be EAX.
 395 // tmp: killed, cannot be obj nor box.
 396 //
 397 // Some commentary on balanced locking:
 398 //
 399 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 400 // Methods that don't have provably balanced locking are forced to run in the
 401 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 402 // The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 405 //      interpreter maintains an on-stack list of locks currently held by
 406 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 409 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 411 // B() doesn't have provably balanced locking so it runs in the interpreter.
 412 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 413 // is still locked by A().
 414 //
 415 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 416 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 417 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 418 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
 422 // A perfectly viable alternative is to elide the owner check except when
 423 // Xcheck:jni is enabled.
 424 
 425 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 426   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 427   assert(boxReg == rax, "");
 428   assert_different_registers(objReg, boxReg, tmpReg);
 429 
 430   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 431 
 432   if (LockingMode == LM_LEGACY) {
 433     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 434     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 435   }
 436   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 437   if (LockingMode != LM_MONITOR) {
 438     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 439     jcc(Assembler::zero, Stacked);
 440   }
 441 
 442   // It's inflated.
 443 
 444   // Despite our balanced locking property we still check that m->_owner == Self
 445   // as java routines or native JNI code called by this thread might
 446   // have released the lock.
 447   // Refer to the comments in synchronizer.cpp for how we might encode extra
 448   // state in _succ so we can avoid fetching EntryList|cxq.
 449   //
 450   // If there's no contention try a 1-0 exit.  That is, exit without
 451   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 452   // we detect and recover from the race that the 1-0 exit admits.
 453   //
 454   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 455   // before it STs null into _owner, releasing the lock.  Updates
 456   // to data protected by the critical section must be visible before
 457   // we drop the lock (and thus before any other thread could acquire
 458   // the lock and observe the fields protected by the lock).
 459   // IA32's memory-model is SPO, so STs are ordered with respect to
 460   // each other and there's no need for an explicit barrier (fence).
 461   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 462   Label LSuccess, LNotRecursive;
 463 
 464   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 465   jccb(Assembler::equal, LNotRecursive);
 466 
 467   // Recursive inflated unlock
 468   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 469   jmpb(LSuccess);
 470 
 471   bind(LNotRecursive);
 472 
 473   // Set owner to null.
 474   // Release to satisfy the JMM
 475   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 476   // We need a full fence after clearing owner to avoid stranding.
 477   // StoreLoad achieves this.
 478   membar(StoreLoad);
 479 
 480   // Check if the entry lists are empty.
 481   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 482   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 483   jccb(Assembler::zero, LSuccess);    // If so we are done.
 484 
 485   // Check if there is a successor.
 486   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 487   jccb(Assembler::notZero, LSuccess); // If so we are done.
 488 
 489   // Save the monitor pointer in the current thread, so we can try to
 490   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 491   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 492 #ifndef _LP64
 493   get_thread(boxReg);
 494   movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 495 #else // _LP64
 496   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 497 #endif
 498 
 499   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 500   jmpb  (DONE_LABEL);
 501 
 502   bind  (LSuccess);
 503   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 504   jmpb  (DONE_LABEL);
 505 
 506   if (LockingMode == LM_LEGACY) {
 507     bind  (Stacked);
 508     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 509     lock();
 510     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 511     // Intentional fall-thru into DONE_LABEL
 512   }
 513 
 514   bind(DONE_LABEL);
 515 
 516   // ZFlag == 1 count in fast path
 517   // ZFlag == 0 count in slow path
 518   jccb(Assembler::notZero, NO_COUNT);
 519 
 520   bind(COUNT);
 521   // Count monitors in fast path
 522 #ifndef _LP64
 523   get_thread(tmpReg);
 524   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 525 #else // _LP64
 526   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 527 #endif
 528 
 529   xorl(tmpReg, tmpReg); // Set ZF == 1
 530 
 531   bind(NO_COUNT);
 532 }
 533 
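// obj: object to lock
// box: on-stack BasicLock -- KILLED; caches the ObjectMonitor* when UseObjectMonitorTable
// rax_reg: must be rax, used for CAS -- KILLED
// t: tmp -- KILLED
// On exit ZF == 1 indicates success, ZF == 0 forces control through the slow path.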
 534 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 535                                               Register t, Register thread) {
 536   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 537   assert(rax_reg == rax, "Used for CAS");
 538   assert_different_registers(obj, box, rax_reg, t, thread);
 539 
 540   // Handle inflated monitor.
 541   Label inflated;
 542   // Finish fast lock successfully. ZF value is irrelevant.
 543   Label locked;
 544   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 545   Label slow_path;
 546 
 547   if (UseObjectMonitorTable) {
 548     // Clear cache in case fast locking succeeds.
 549     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 550   }
 551 
 552   if (DiagnoseSyncOnValueBasedClasses != 0) {
 553     load_klass(rax_reg, obj, t);
 554     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 555     jcc(Assembler::notZero, slow_path);
 556   }
 557 
 558   const Register mark = t;
 559 
 560   { // Lightweight Lock
 561 
 562     Label push;
 563 
 564     const Register top = UseObjectMonitorTable ? rax_reg : box;
 565 
 566     // Load the mark.
 567     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 568 
 569     // Prefetch top.
 570     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 571 
 572     // Check for monitor (0b10).
 573     testptr(mark, markWord::monitor_value);
 574     jcc(Assembler::notZero, inflated);
 575 
 576     // Check if lock-stack is full.
 577     cmpl(top, LockStack::end_offset() - 1);
 578     jcc(Assembler::greater, slow_path);
 579 
 580     // Check if recursive.
 581     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 582     jccb(Assembler::equal, push);
 583 
 584     // Try to lock. Transition lock bits 0b01 => 0b00
 585     movptr(rax_reg, mark);
 586     orptr(rax_reg, markWord::unlocked_value);
 587     andptr(mark, ~(int32_t)markWord::unlocked_value);
 588     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 589     jcc(Assembler::notEqual, slow_path);
 590 
 591     if (UseObjectMonitorTable) {
 592       // Need to reload top, clobbered by CAS.
 593       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 594     }
 595     bind(push);
 596     // After successful lock, push object on lock-stack.
 597     movptr(Address(thread, top), obj);
 598     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 599     jmpb(locked);
 600   }
 601 
 602   { // Handle inflated monitor.
 603     bind(inflated);
 604 
 605     const Register monitor = t;
 606 
 607     if (!UseObjectMonitorTable) {
 608       assert(mark == monitor, "should be the same here");
 609     } else {
 610       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 611       // Fetch ObjectMonitor* from the cache or take the slow-path.
 612       Label monitor_found;
 613 
 614       // Load cache address
 615       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 616 
 617       const int num_unrolled = 2;
 618       for (int i = 0; i < num_unrolled; i++) {
 619         cmpptr(obj, Address(t));
 620         jccb(Assembler::equal, monitor_found);
 621         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 622       }
 623 
 624       Label loop;
 625 
 626       // Search for obj in cache.
 627       bind(loop);
 628 
 629       // Check for match.
 630       cmpptr(obj, Address(t));
 631       jccb(Assembler::equal, monitor_found);
 632 
 633       // Search until null encountered, guaranteed _null_sentinel at end.
 634       cmpptr(Address(t), 1);
 635       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 636       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 637       jmpb(loop);
 638 
 639       // Cache hit.
 640       bind(monitor_found);
 641       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 642     }
 643     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 644     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 645     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 646 
 647     Label monitor_locked;
 648     // Lock the monitor.
 649 
 650     // CAS owner (null => current thread).
 651     xorptr(rax_reg, rax_reg);
 652     lock(); cmpxchgptr(thread, owner_address);
 653     jccb(Assembler::equal, monitor_locked);
 654 
 655     // Check if recursive.
 656     cmpptr(thread, rax_reg);
 657     jccb(Assembler::notEqual, slow_path);
 658 
 659     // Recursive.
 660     increment(recursions_address);
 661 
 662     bind(monitor_locked);
 663     if (UseObjectMonitorTable) {
 664       // Cache the monitor for unlock
 665       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 666     }
 667   }
 668 
 669   bind(locked);
 670   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 671   // Set ZF = 1
 672   xorl(rax_reg, rax_reg);
 673 
 674 #ifdef ASSERT
 675   // Check that locked label is reached with ZF set.
 676   Label zf_correct;
 677   Label zf_bad_zero;
 678   jcc(Assembler::zero, zf_correct);
 679   jmp(zf_bad_zero);
 680 #endif
 681 
 682   bind(slow_path);
 683 #ifdef ASSERT
 684   // Check that slow_path label is reached with ZF not set.
 685   jcc(Assembler::notZero, zf_correct);
 686   stop("Fast Lock ZF != 0");
 687   bind(zf_bad_zero);
 688   stop("Fast Lock ZF != 1");
 689   bind(zf_correct);
 690 #endif
 691   // C2 uses the value of ZF to determine the continuation.
 692 }
 693 
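// obj: object to unlock
// reg_rax: must be rax (used for CAS); treated as the on-stack BasicLock (box)
//          address when UseObjectMonitorTable -- KILLED
// t: tmp -- KILLED
// On exit ZF == 1 indicates success, ZF == 0 forces control through the slow path.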
 694 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 695   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 696   assert(reg_rax == rax, "Used for CAS");
 697   assert_different_registers(obj, reg_rax, t);
 698 
 699   // Handle inflated monitor.
 700   Label inflated, inflated_check_lock_stack;
 701   // Finish fast unlock successfully.  MUST jump with ZF == 1
 702   Label unlocked, slow_path;
 703 
 704   const Register mark = t;
 705   const Register monitor = t;
 706   const Register top = UseObjectMonitorTable ? t : reg_rax;
 707   const Register box = reg_rax;
 708 
 709   Label dummy;
 710   C2FastUnlockLightweightStub* stub = nullptr;
 711 
 712   if (!Compile::current()->output()->in_scratch_emit_size()) {
 713     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 714     Compile::current()->output()->add_stub(stub);
 715   }
 716 
 717   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 718 
 719   { // Lightweight Unlock
 720 
 721     // Load top.
 722     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 723 
 724     if (!UseObjectMonitorTable) {
 725       // Prefetch mark.
 726       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 727     }
 728 
 729     // Check if obj is top of lock-stack.
 730     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 731     // Top of lock stack was not obj. Must be monitor.
 732     jcc(Assembler::notEqual, inflated_check_lock_stack);
 733 
 734     // Pop lock-stack.
 735     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 736     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 737 
 738     // Check if recursive.
 739     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 740     jcc(Assembler::equal, unlocked);
 741 
 742     // We elide the monitor check, let the CAS fail instead.
 743 
 744     if (UseObjectMonitorTable) {
 745       // Load mark.
 746       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 747     }
 748 
 749     // Try to unlock. Transition lock bits 0b00 => 0b01
 750     movptr(reg_rax, mark);
 751     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 752     orptr(mark, markWord::unlocked_value);
 753     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 754     jcc(Assembler::notEqual, push_and_slow_path);
 755     jmp(unlocked);
 756   }
 757 
 758 
 759   { // Handle inflated monitor.
 760     bind(inflated_check_lock_stack);
 761 #ifdef ASSERT
 762     Label check_done;
 763     subl(top, oopSize);
 764     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 765     jcc(Assembler::below, check_done);
 766     cmpptr(obj, Address(thread, top));
 767     jccb(Assembler::notEqual, inflated_check_lock_stack);
 768     stop("Fast Unlock lock on stack");
 769     bind(check_done);
 770     if (UseObjectMonitorTable) {
 771       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 772     }
 773     testptr(mark, markWord::monitor_value);
 774     jccb(Assembler::notZero, inflated);
 775     stop("Fast Unlock not monitor");
 776 #endif
 777 
 778     bind(inflated);
 779 
 780     if (!UseObjectMonitorTable) {
 781       assert(mark == monitor, "should be the same here");
 782     } else {
 783       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 784       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 785       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 786       cmpptr(monitor, alignof(ObjectMonitor*));
 787       jcc(Assembler::below, slow_path);
 788     }
 789     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 790     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 791     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 792     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 793     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 794     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 795 
 796     Label recursive;
 797 
 798     // Check if recursive.
 799     cmpptr(recursions_address, 0);
 800     jccb(Assembler::notZero, recursive);
 801 
 802     // Set owner to null.
 803     // Release to satisfy the JMM
 804     movptr(owner_address, NULL_WORD);
 805     // We need a full fence after clearing owner to avoid stranding.
 806     // StoreLoad achieves this.
 807     membar(StoreLoad);
 808 
 809     // Check if the entry lists are empty.
 810     movptr(reg_rax, cxq_address);
 811     orptr(reg_rax, EntryList_address);
 812     jccb(Assembler::zero, unlocked);    // If so we are done.
 813 
 814     // Check if there is a successor.
 815     cmpptr(succ_address, NULL_WORD);
 816     jccb(Assembler::notZero, unlocked); // If so we are done.
 817 
 818     // Save the monitor pointer in the current thread, so we can try to
 819     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 820     if (!UseObjectMonitorTable) {
 821       andptr(monitor, ~(int32_t)markWord::monitor_value);
 822     }
 823     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 824 
 825     orl(t, 1); // Fast Unlock ZF = 0
 826     jmpb(slow_path);
 827 
 828     // Recursive unlock.
 829     bind(recursive);
 830     decrement(recursions_address);
 831   }
 832 
 833   bind(unlocked);
 834   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
 835   xorl(t, t); // Fast Unlock ZF = 1
 836 
 837 #ifdef ASSERT
 838   // Check that unlocked label is reached with ZF set.
 839   Label zf_correct;
 840   jcc(Assembler::zero, zf_correct);
 841   stop("Fast Unlock ZF != 1");
 842 #endif
 843 
 844   bind(slow_path);
 845   if (stub != nullptr) {
 846     bind(stub->slow_path_continuation());
 847   }
 848 #ifdef ASSERT
 849   // Check that stub->continuation() label is reached with ZF not set.
 850   jccb(Assembler::notZero, zf_correct);
 851   stop("Fast Unlock ZF != 0");
 852   bind(zf_correct);
 853 #endif
 854   // C2 uses the value of ZF to determine the continuation.
 855 }
 856 
 857 //-------------------------------------------------------------------------------------------
 858 // Generic instructions support for use in .ad files C2 code generation
 859 
 860 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 861   if (dst != src) {
 862     movdqu(dst, src);
 863   }
 864   if (opcode == Op_AbsVD) {
 865     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 866   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 868     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 869   }
 870 }
 871 
 872 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 873   if (opcode == Op_AbsVD) {
 874     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 875   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 877     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 878   }
 879 }
 880 
 881 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 882   if (dst != src) {
 883     movdqu(dst, src);
 884   }
 885   if (opcode == Op_AbsVF) {
 886     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 887   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 889     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 890   }
 891 }
 892 
 893 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 894   if (opcode == Op_AbsVF) {
 895     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 896   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 898     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 899   }
 900 }
 901 
 902 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 903   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 904   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 905 
 906   if (opcode == Op_MinV) {
 907     if (elem_bt == T_BYTE) {
 908       pminsb(dst, src);
 909     } else if (elem_bt == T_SHORT) {
 910       pminsw(dst, src);
 911     } else if (elem_bt == T_INT) {
 912       pminsd(dst, src);
 913     } else {
 914       assert(elem_bt == T_LONG, "required");
 915       assert(tmp == xmm0, "required");
 916       assert_different_registers(dst, src, tmp);
 917       movdqu(xmm0, dst);
 918       pcmpgtq(xmm0, src);
 919       blendvpd(dst, src);  // xmm0 as mask
 920     }
 921   } else { // opcode == Op_MaxV
 922     if (elem_bt == T_BYTE) {
 923       pmaxsb(dst, src);
 924     } else if (elem_bt == T_SHORT) {
 925       pmaxsw(dst, src);
 926     } else if (elem_bt == T_INT) {
 927       pmaxsd(dst, src);
 928     } else {
 929       assert(elem_bt == T_LONG, "required");
 930       assert(tmp == xmm0, "required");
 931       assert_different_registers(dst, src, tmp);
 932       movdqu(xmm0, src);
 933       pcmpgtq(xmm0, dst);
 934       blendvpd(dst, src);  // xmm0 as mask
 935     }
 936   }
 937 }
 938 
 939 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 940                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 941                                  int vlen_enc) {
 942   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 943 
 944   if (opcode == Op_MinV) {
 945     if (elem_bt == T_BYTE) {
 946       vpminsb(dst, src1, src2, vlen_enc);
 947     } else if (elem_bt == T_SHORT) {
 948       vpminsw(dst, src1, src2, vlen_enc);
 949     } else if (elem_bt == T_INT) {
 950       vpminsd(dst, src1, src2, vlen_enc);
 951     } else {
 952       assert(elem_bt == T_LONG, "required");
 953       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 954         vpminsq(dst, src1, src2, vlen_enc);
 955       } else {
 956         assert_different_registers(dst, src1, src2);
 957         vpcmpgtq(dst, src1, src2, vlen_enc);
 958         vblendvpd(dst, src1, src2, dst, vlen_enc);
 959       }
 960     }
 961   } else { // opcode == Op_MaxV
 962     if (elem_bt == T_BYTE) {
 963       vpmaxsb(dst, src1, src2, vlen_enc);
 964     } else if (elem_bt == T_SHORT) {
 965       vpmaxsw(dst, src1, src2, vlen_enc);
 966     } else if (elem_bt == T_INT) {
 967       vpmaxsd(dst, src1, src2, vlen_enc);
 968     } else {
 969       assert(elem_bt == T_LONG, "required");
 970       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 971         vpmaxsq(dst, src1, src2, vlen_enc);
 972       } else {
 973         assert_different_registers(dst, src1, src2);
 974         vpcmpgtq(dst, src1, src2, vlen_enc);
 975         vblendvpd(dst, src2, src1, dst, vlen_enc);
 976       }
 977     }
 978   }
 979 }
 980 
 981 // Float/Double min max
 982 
 983 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 984                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 985                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 986                                    int vlen_enc) {
 987   assert(UseAVX > 0, "required");
 988   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 989          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 990   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 991   assert_different_registers(a, tmp, atmp, btmp);
 992   assert_different_registers(b, tmp, atmp, btmp);
 993 
 994   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 995   bool is_double_word = is_double_word_type(elem_bt);
 996 
 997   /* Note on 'non-obvious' assembly sequence:
 998    *
 999    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1000    * and Java on how they handle floats:
1001    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1003    *
1004    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1005    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1006    *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min variant is sketched below):
1010    *   btmp = (b < +0.0) ? a : b
1011    *   atmp = (b < +0.0) ? b : a
1012    *   Tmp  = Max_Float(atmp , btmp)
1013    *   Res  = (atmp == NaN) ? atmp : Tmp
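   *
   *  For min[FD] the bias direction flips toward the first parameter (sketch
   *  derived from the blend selection below):
   *   btmp = (a < +0.0) ? a : b
   *   atmp = (a < +0.0) ? b : a
   *   Tmp  = Min_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp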
1014    */
1015 
1016   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1017   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1018   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1019   XMMRegister mask;
1020 
1021   if (!is_double_word && is_min) {
1022     mask = a;
1023     vblend = &MacroAssembler::vblendvps;
1024     vmaxmin = &MacroAssembler::vminps;
1025     vcmp = &MacroAssembler::vcmpps;
1026   } else if (!is_double_word && !is_min) {
1027     mask = b;
1028     vblend = &MacroAssembler::vblendvps;
1029     vmaxmin = &MacroAssembler::vmaxps;
1030     vcmp = &MacroAssembler::vcmpps;
1031   } else if (is_double_word && is_min) {
1032     mask = a;
1033     vblend = &MacroAssembler::vblendvpd;
1034     vmaxmin = &MacroAssembler::vminpd;
1035     vcmp = &MacroAssembler::vcmppd;
1036   } else {
1037     assert(is_double_word && !is_min, "sanity");
1038     mask = b;
1039     vblend = &MacroAssembler::vblendvpd;
1040     vmaxmin = &MacroAssembler::vmaxpd;
1041     vcmp = &MacroAssembler::vcmppd;
1042   }
1043 
1044   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1045   XMMRegister maxmin, scratch;
1046   if (dst == btmp) {
1047     maxmin = btmp;
1048     scratch = tmp;
1049   } else {
1050     maxmin = tmp;
1051     scratch = btmp;
1052   }
1053 
  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1055   if (precompute_mask && !is_double_word) {
1056     vpsrad(tmp, mask, 32, vlen_enc);
1057     mask = tmp;
1058   } else if (precompute_mask && is_double_word) {
1059     vpxor(tmp, tmp, tmp, vlen_enc);
1060     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1061     mask = tmp;
1062   }
1063 
1064   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1065   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1066   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1067   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1068   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1069 }
1070 
1071 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1072                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1073                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1074                                     int vlen_enc) {
1075   assert(UseAVX > 2, "required");
1076   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1077          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1078   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1079   assert_different_registers(dst, a, atmp, btmp);
1080   assert_different_registers(dst, b, atmp, btmp);
1081 
1082   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1083   bool is_double_word = is_double_word_type(elem_bt);
1084   bool merge = true;
1085 
1086   if (!is_double_word && is_min) {
1087     evpmovd2m(ktmp, a, vlen_enc);
1088     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1089     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1090     vminps(dst, atmp, btmp, vlen_enc);
1091     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1092     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1093   } else if (!is_double_word && !is_min) {
1094     evpmovd2m(ktmp, b, vlen_enc);
1095     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1096     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1097     vmaxps(dst, atmp, btmp, vlen_enc);
1098     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1099     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1100   } else if (is_double_word && is_min) {
1101     evpmovq2m(ktmp, a, vlen_enc);
1102     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1103     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1104     vminpd(dst, atmp, btmp, vlen_enc);
1105     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1106     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1107   } else {
1108     assert(is_double_word && !is_min, "sanity");
1109     evpmovq2m(ktmp, b, vlen_enc);
1110     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1111     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1112     vmaxpd(dst, atmp, btmp, vlen_enc);
1113     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1114     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1115   }
1116 }
1117 
1118 // Float/Double signum
1119 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1120   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1121 
1122   Label DONE_LABEL;
1123 
1124   if (opcode == Op_SignumF) {
1125     assert(UseSSE > 0, "required");
1126     ucomiss(dst, zero);
1127     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1128     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1129     movflt(dst, one);
1130     jcc(Assembler::above, DONE_LABEL);
1131     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1132   } else if (opcode == Op_SignumD) {
1133     assert(UseSSE > 1, "required");
1134     ucomisd(dst, zero);
1135     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1136     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1137     movdbl(dst, one);
1138     jcc(Assembler::above, DONE_LABEL);
1139     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1140   }
1141 
1142   bind(DONE_LABEL);
1143 }
1144 
1145 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1146   if (sign) {
1147     pmovsxbw(dst, src);
1148   } else {
1149     pmovzxbw(dst, src);
1150   }
1151 }
1152 
1153 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1154   if (sign) {
1155     vpmovsxbw(dst, src, vector_len);
1156   } else {
1157     vpmovzxbw(dst, src, vector_len);
1158   }
1159 }
1160 
1161 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1162   if (sign) {
1163     vpmovsxbd(dst, src, vector_len);
1164   } else {
1165     vpmovzxbd(dst, src, vector_len);
1166   }
1167 }
1168 
1169 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1170   if (sign) {
1171     vpmovsxwd(dst, src, vector_len);
1172   } else {
1173     vpmovzxwd(dst, src, vector_len);
1174   }
1175 }
1176 
1177 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1178                                      int shift, int vector_len) {
1179   if (opcode == Op_RotateLeftV) {
1180     if (etype == T_INT) {
1181       evprold(dst, src, shift, vector_len);
1182     } else {
1183       assert(etype == T_LONG, "expected type T_LONG");
1184       evprolq(dst, src, shift, vector_len);
1185     }
1186   } else {
1187     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1188     if (etype == T_INT) {
1189       evprord(dst, src, shift, vector_len);
1190     } else {
1191       assert(etype == T_LONG, "expected type T_LONG");
1192       evprorq(dst, src, shift, vector_len);
1193     }
1194   }
1195 }
1196 
1197 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1198                                      XMMRegister shift, int vector_len) {
1199   if (opcode == Op_RotateLeftV) {
1200     if (etype == T_INT) {
1201       evprolvd(dst, src, shift, vector_len);
1202     } else {
1203       assert(etype == T_LONG, "expected type T_LONG");
1204       evprolvq(dst, src, shift, vector_len);
1205     }
1206   } else {
1207     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1208     if (etype == T_INT) {
1209       evprorvd(dst, src, shift, vector_len);
1210     } else {
1211       assert(etype == T_LONG, "expected type T_LONG");
1212       evprorvq(dst, src, shift, vector_len);
1213     }
1214   }
1215 }
1216 
1217 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1218   if (opcode == Op_RShiftVI) {
1219     psrad(dst, shift);
1220   } else if (opcode == Op_LShiftVI) {
1221     pslld(dst, shift);
1222   } else {
1223     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1224     psrld(dst, shift);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1229   switch (opcode) {
1230     case Op_RShiftVI:  psrad(dst, shift); break;
1231     case Op_LShiftVI:  pslld(dst, shift); break;
1232     case Op_URShiftVI: psrld(dst, shift); break;
1233 
1234     default: assert(false, "%s", NodeClassNames[opcode]);
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1239   if (opcode == Op_RShiftVI) {
1240     vpsrad(dst, nds, shift, vector_len);
1241   } else if (opcode == Op_LShiftVI) {
1242     vpslld(dst, nds, shift, vector_len);
1243   } else {
1244     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1245     vpsrld(dst, nds, shift, vector_len);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250   switch (opcode) {
1251     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1252     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1253     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1254 
1255     default: assert(false, "%s", NodeClassNames[opcode]);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1260   switch (opcode) {
1261     case Op_RShiftVB:  // fall-through
1262     case Op_RShiftVS:  psraw(dst, shift); break;
1263 
1264     case Op_LShiftVB:  // fall-through
1265     case Op_LShiftVS:  psllw(dst, shift);   break;
1266 
1267     case Op_URShiftVS: // fall-through
1268     case Op_URShiftVB: psrlw(dst, shift);  break;
1269 
1270     default: assert(false, "%s", NodeClassNames[opcode]);
1271   }
1272 }
1273 
1274 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1275   switch (opcode) {
1276     case Op_RShiftVB:  // fall-through
1277     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1278 
1279     case Op_LShiftVB:  // fall-through
1280     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1281 
1282     case Op_URShiftVS: // fall-through
1283     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1284 
1285     default: assert(false, "%s", NodeClassNames[opcode]);
1286   }
1287 }
1288 
1289 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1290   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1292     case Op_LShiftVL:  psllq(dst, shift); break;
1293     case Op_URShiftVL: psrlq(dst, shift); break;
1294 
1295     default: assert(false, "%s", NodeClassNames[opcode]);
1296   }
1297 }
1298 
1299 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1300   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1302   } else if (opcode == Op_LShiftVL) {
1303     psllq(dst, shift);
1304   } else {
1305     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1306     psrlq(dst, shift);
1307   }
1308 }
1309 
1310 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1311   switch (opcode) {
1312     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1313     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1314     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1315 
1316     default: assert(false, "%s", NodeClassNames[opcode]);
1317   }
1318 }
1319 
1320 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1321   if (opcode == Op_RShiftVL) {
1322     evpsraq(dst, nds, shift, vector_len);
1323   } else if (opcode == Op_LShiftVL) {
1324     vpsllq(dst, nds, shift, vector_len);
1325   } else {
1326     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1327     vpsrlq(dst, nds, shift, vector_len);
1328   }
1329 }
1330 
1331 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1332   switch (opcode) {
1333     case Op_RShiftVB:  // fall-through
1334     case Op_RShiftVS:  // fall-through
1335     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1336 
1337     case Op_LShiftVB:  // fall-through
1338     case Op_LShiftVS:  // fall-through
1339     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1340 
1341     case Op_URShiftVB: // fall-through
1342     case Op_URShiftVS: // fall-through
1343     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1344 
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
1349 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1350   switch (opcode) {
1351     case Op_RShiftVB:  // fall-through
1352     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1353 
1354     case Op_LShiftVB:  // fall-through
1355     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1356 
1357     case Op_URShiftVB: // fall-through
1358     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1359 
1360     default: assert(false, "%s", NodeClassNames[opcode]);
1361   }
1362 }
1363 
1364 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1365   assert(UseAVX >= 2, "required");
1366   switch (opcode) {
1367     case Op_RShiftVL: {
1368       if (UseAVX > 2) {
1369         assert(tmp == xnoreg, "not used");
1370         if (!VM_Version::supports_avx512vl()) {
1371           vlen_enc = Assembler::AVX_512bit;
1372         }
1373         evpsravq(dst, src, shift, vlen_enc);
1374       } else {
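             // AVX2 has no variable arithmetic right shift for 64-bit lanes, so it is
             // emulated here with the logical shift and a per-lane sign mask m derived
             // from the vector_long_sign_mask constant: sra(x, s) == ((x >>> s) ^ m) - m.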
1375         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1376         vpsrlvq(dst, src, shift, vlen_enc);
1377         vpsrlvq(tmp, tmp, shift, vlen_enc);
1378         vpxor(dst, dst, tmp, vlen_enc);
1379         vpsubq(dst, dst, tmp, vlen_enc);
1380       }
1381       break;
1382     }
1383     case Op_LShiftVL: {
1384       assert(tmp == xnoreg, "not used");
1385       vpsllvq(dst, src, shift, vlen_enc);
1386       break;
1387     }
1388     case Op_URShiftVL: {
1389       assert(tmp == xnoreg, "not used");
1390       vpsrlvq(dst, src, shift, vlen_enc);
1391       break;
1392     }
1393     default: assert(false, "%s", NodeClassNames[opcode]);
1394   }
1395 }
1396 
1397 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1398 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1399   assert(opcode == Op_LShiftVB ||
1400          opcode == Op_RShiftVB ||
1401          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1402   bool sign = (opcode != Op_URShiftVB);
1403   assert(vector_len == 0, "required");
1404   vextendbd(sign, dst, src, 1);
1405   vpmovzxbd(vtmp, shift, 1);
1406   varshiftd(opcode, dst, dst, vtmp, 1);
1407   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1408   vextracti128_high(vtmp, dst);
1409   vpackusdw(dst, dst, vtmp, 0);
1410 }
1411 
1412 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1413 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1414   assert(opcode == Op_LShiftVB ||
1415          opcode == Op_RShiftVB ||
1416          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1417   bool sign = (opcode != Op_URShiftVB);
1418   int ext_vector_len = vector_len + 1;
1419   vextendbw(sign, dst, src, ext_vector_len);
1420   vpmovzxbw(vtmp, shift, ext_vector_len);
1421   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1422   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1423   if (vector_len == 0) {
1424     vextracti128_high(vtmp, dst);
1425     vpackuswb(dst, dst, vtmp, vector_len);
1426   } else {
1427     vextracti64x4_high(vtmp, dst);
1428     vpackuswb(dst, dst, vtmp, vector_len);
1429     vpermq(dst, dst, 0xD8, vector_len);
1430   }
1431 }
1432 
1433 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1434   switch(typ) {
1435     case T_BYTE:
1436       pinsrb(dst, val, idx);
1437       break;
1438     case T_SHORT:
1439       pinsrw(dst, val, idx);
1440       break;
1441     case T_INT:
1442       pinsrd(dst, val, idx);
1443       break;
1444     case T_LONG:
1445       pinsrq(dst, val, idx);
1446       break;
1447     default:
1448       assert(false,"Should not reach here.");
1449       break;
1450   }
1451 }
1452 
1453 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1454   switch(typ) {
1455     case T_BYTE:
1456       vpinsrb(dst, src, val, idx);
1457       break;
1458     case T_SHORT:
1459       vpinsrw(dst, src, val, idx);
1460       break;
1461     case T_INT:
1462       vpinsrd(dst, src, val, idx);
1463       break;
1464     case T_LONG:
1465       vpinsrq(dst, src, val, idx);
1466       break;
1467     default:
1468       assert(false,"Should not reach here.");
1469       break;
1470   }
1471 }
1472 
1473 #ifdef _LP64
1474 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1475                                                 XMMRegister dst, Register base,
1476                                                 Register idx_base,
1477                                                 Register offset, Register mask,
1478                                                 Register mask_idx, Register rtmp,
1479                                                 int vlen_enc) {
1480   vpxor(dst, dst, dst, vlen_enc);
1481   if (elem_bt == T_SHORT) {
1482     for (int i = 0; i < 4; i++) {
1483       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1484       Label skip_load;
1485       btq(mask, mask_idx);
1486       jccb(Assembler::carryClear, skip_load);
1487       movl(rtmp, Address(idx_base, i * 4));
1488       if (offset != noreg) {
1489         addl(rtmp, offset);
1490       }
1491       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1492       bind(skip_load);
1493       incq(mask_idx);
1494     }
1495   } else {
1496     assert(elem_bt == T_BYTE, "");
1497     for (int i = 0; i < 8; i++) {
1498       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1499       Label skip_load;
1500       btq(mask, mask_idx);
1501       jccb(Assembler::carryClear, skip_load);
1502       movl(rtmp, Address(idx_base, i * 4));
1503       if (offset != noreg) {
1504         addl(rtmp, offset);
1505       }
1506       pinsrb(dst, Address(base, rtmp), i);
1507       bind(skip_load);
1508       incq(mask_idx);
1509     }
1510   }
1511 }
1512 #endif // _LP64
1513 
1514 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1515                                          Register base, Register idx_base,
1516                                          Register offset, Register rtmp,
1517                                          int vlen_enc) {
1518   vpxor(dst, dst, dst, vlen_enc);
1519   if (elem_bt == T_SHORT) {
1520     for (int i = 0; i < 4; i++) {
1521       // dst[i] = src[offset + idx_base[i]]
1522       movl(rtmp, Address(idx_base, i * 4));
1523       if (offset != noreg) {
1524         addl(rtmp, offset);
1525       }
1526       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1527     }
1528   } else {
1529     assert(elem_bt == T_BYTE, "");
1530     for (int i = 0; i < 8; i++) {
1531       // dst[i] = src[offset + idx_base[i]]
1532       movl(rtmp, Address(idx_base, i * 4));
1533       if (offset != noreg) {
1534         addl(rtmp, offset);
1535       }
1536       pinsrb(dst, Address(base, rtmp), i);
1537     }
1538   }
1539 }
1540 
1541 /*
1542  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1543  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1544  * A slice holds either 8 byte values or 4 short values. This is followed by
1545  * a vector permutation that places the slice into the appropriate vector lane
1546  * locations in the destination vector. The following pseudo code describes
1547  * the algorithm in detail:
1548  *
1549  * DST_VEC = ZERO_VEC
1550  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1551  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1552  * FOREACH_ITER:
1553  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1554  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1555  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1556  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1557  *
1558  * With each iteration, the doubleword permute indices (0, 1) corresponding
1559  * to the gathered quadword get shifted right by two lane positions.
1560  *
1561  */
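     // Scalar equivalent of one gather step (a sketch; mask handling elided):
     //   for (int i = 0; i < vector_len; i++) {
     //     dst[i] = base[offset + idx_base[i]];
     //   }
     //
     // Illustration (informal, byte gather): each gathered 8-byte slice lands in
     // dwords 0-1 of TMP_VEC_64, with all remaining dwords zero. In the first
     // iteration PERM_INDEX = {0, 1, 2, ...} keeps the slice in dwords 0-1; after
     // PERM_INDEX -= TWO_VEC it becomes {-2, -1, 0, 1, ...}, so the next vpermd
     // routes the new slice into dwords 2-3 (the wrapped negative indices select
     // the zeroed upper dwords), and so on until ORing the iterations has filled
     // every lane of DST_VEC.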
1562 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1563                                         Register base, Register idx_base,
1564                                         Register offset, Register mask,
1565                                         XMMRegister xtmp1, XMMRegister xtmp2,
1566                                         XMMRegister temp_dst, Register rtmp,
1567                                         Register mask_idx, Register length,
1568                                         int vector_len, int vlen_enc) {
1569   Label GATHER8_LOOP;
1570   assert(is_subword_type(elem_ty), "");
1571   movl(length, vector_len);
1572   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1573   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1574   vallones(xtmp2, vlen_enc);
1575   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1576   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1577   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1578 
1579   bind(GATHER8_LOOP);
1580     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1581     if (mask == noreg) {
1582       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1583     } else {
1584       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1585     }
1586     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1587     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1588     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1589     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1590     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1591     vpor(dst, dst, temp_dst, vlen_enc);
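         // Advance to the next slice: idx_base holds 32-bit indices, so step it by
         // 8 (byte) or 4 (short) indices and decrement the remaining element count.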
1592     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1593     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1594     jcc(Assembler::notEqual, GATHER8_LOOP);
1595 }
1596 
1597 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1598   switch(typ) {
1599     case T_INT:
1600       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1601       break;
1602     case T_FLOAT:
1603       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1604       break;
1605     case T_LONG:
1606       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1607       break;
1608     case T_DOUBLE:
1609       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1610       break;
1611     default:
1612       assert(false,"Should not reach here.");
1613       break;
1614   }
1615 }
1616 
1617 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1618   switch(typ) {
1619     case T_INT:
1620       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1621       break;
1622     case T_FLOAT:
1623       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1624       break;
1625     case T_LONG:
1626       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1627       break;
1628     case T_DOUBLE:
1629       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1630       break;
1631     default:
1632       assert(false,"Should not reach here.");
1633       break;
1634   }
1635 }
1636 
1637 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1638   switch(typ) {
1639     case T_INT:
1640       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1641       break;
1642     case T_FLOAT:
1643       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1644       break;
1645     case T_LONG:
1646       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1647       break;
1648     case T_DOUBLE:
1649       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1650       break;
1651     default:
1652       assert(false,"Should not reach here.");
1653       break;
1654   }
1655 }
1656 
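     // Expand a vector of booleans (one byte per element, 0 or 1) into a vector
     // mask with all-zeros/all-ones elements: negate the bytes to 0/-1, then
     // sign-extend to the requested element width.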
1657 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1658   if (vlen_in_bytes <= 16) {
1659     pxor (dst, dst);
1660     psubb(dst, src);
1661     switch (elem_bt) {
1662       case T_BYTE:   /* nothing to do */ break;
1663       case T_SHORT:  pmovsxbw(dst, dst); break;
1664       case T_INT:    pmovsxbd(dst, dst); break;
1665       case T_FLOAT:  pmovsxbd(dst, dst); break;
1666       case T_LONG:   pmovsxbq(dst, dst); break;
1667       case T_DOUBLE: pmovsxbq(dst, dst); break;
1668 
1669       default: assert(false, "%s", type2name(elem_bt));
1670     }
1671   } else {
1672     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1673     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1674 
1675     vpxor (dst, dst, dst, vlen_enc);
1676     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1677 
1678     switch (elem_bt) {
1679       case T_BYTE:   /* nothing to do */            break;
1680       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1681       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1682       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1683       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1684       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1685 
1686       default: assert(false, "%s", type2name(elem_bt));
1687     }
1688   }
1689 }
1690 
1691 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1692   if (novlbwdq) {
1693     vpmovsxbd(xtmp, src, vlen_enc);
1694     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1695             Assembler::eq, true, vlen_enc, noreg);
1696   } else {
1697     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1698     vpsubb(xtmp, xtmp, src, vlen_enc);
1699     evpmovb2m(dst, xtmp, vlen_enc);
1700   }
1701 }
1702 
1703 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1704   switch (vlen_in_bytes) {
1705     case 4:  movdl(dst, src);   break;
1706     case 8:  movq(dst, src);    break;
1707     case 16: movdqu(dst, src);  break;
1708     case 32: vmovdqu(dst, src); break;
1709     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1710     default: ShouldNotReachHere();
1711   }
1712 }
1713 
1714 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1715   assert(rscratch != noreg || always_reachable(src), "missing");
1716 
1717   if (reachable(src)) {
1718     load_vector(dst, as_Address(src), vlen_in_bytes);
1719   } else {
1720     lea(rscratch, src);
1721     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1722   }
1723 }
1724 
1725 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1726   int vlen_enc = vector_length_encoding(vlen);
1727   if (VM_Version::supports_avx()) {
1728     if (bt == T_LONG) {
1729       if (VM_Version::supports_avx2()) {
1730         vpbroadcastq(dst, src, vlen_enc);
1731       } else {
1732         vmovddup(dst, src, vlen_enc);
1733       }
1734     } else if (bt == T_DOUBLE) {
1735       if (vlen_enc != Assembler::AVX_128bit) {
1736         vbroadcastsd(dst, src, vlen_enc, noreg);
1737       } else {
1738         vmovddup(dst, src, vlen_enc);
1739       }
1740     } else {
1741       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1742         vpbroadcastd(dst, src, vlen_enc);
1743       } else {
1744         vbroadcastss(dst, src, vlen_enc);
1745       }
1746     }
1747   } else if (VM_Version::supports_sse3()) {
1748     movddup(dst, src);
1749   } else {
1750     movq(dst, src);
1751     if (vlen == 16) {
1752       punpcklqdq(dst, dst);
1753     }
1754   }
1755 }
1756 
1757 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1758   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1759   int offset = exact_log2(type2aelembytes(bt)) << 6;
1760   if (is_floating_point_type(bt)) {
1761     offset += 128;
1762   }
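       // For example: T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
       // T_FLOAT -> 256, T_DOUBLE -> 320.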
1763   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1764   load_vector(dst, addr, vlen_in_bytes);
1765 }
1766 
1767 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1768 
1769 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1770   int vector_len = Assembler::AVX_128bit;
1771 
1772   switch (opcode) {
1773     case Op_AndReductionV:  pand(dst, src); break;
1774     case Op_OrReductionV:   por (dst, src); break;
1775     case Op_XorReductionV:  pxor(dst, src); break;
1776     case Op_MinReductionV:
1777       switch (typ) {
1778         case T_BYTE:        pminsb(dst, src); break;
1779         case T_SHORT:       pminsw(dst, src); break;
1780         case T_INT:         pminsd(dst, src); break;
1781         case T_LONG:        assert(UseAVX > 2, "required");
1782                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1783         default:            assert(false, "wrong type");
1784       }
1785       break;
1786     case Op_MaxReductionV:
1787       switch (typ) {
1788         case T_BYTE:        pmaxsb(dst, src); break;
1789         case T_SHORT:       pmaxsw(dst, src); break;
1790         case T_INT:         pmaxsd(dst, src); break;
1791         case T_LONG:        assert(UseAVX > 2, "required");
1792                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1793         default:            assert(false, "wrong type");
1794       }
1795       break;
1796     case Op_AddReductionVF: addss(dst, src); break;
1797     case Op_AddReductionVD: addsd(dst, src); break;
1798     case Op_AddReductionVI:
1799       switch (typ) {
1800         case T_BYTE:        paddb(dst, src); break;
1801         case T_SHORT:       paddw(dst, src); break;
1802         case T_INT:         paddd(dst, src); break;
1803         default:            assert(false, "wrong type");
1804       }
1805       break;
1806     case Op_AddReductionVL: paddq(dst, src); break;
1807     case Op_MulReductionVF: mulss(dst, src); break;
1808     case Op_MulReductionVD: mulsd(dst, src); break;
1809     case Op_MulReductionVI:
1810       switch (typ) {
1811         case T_SHORT:       pmullw(dst, src); break;
1812         case T_INT:         pmulld(dst, src); break;
1813         default:            assert(false, "wrong type");
1814       }
1815       break;
1816     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1817                             evpmullq(dst, dst, src, vector_len); break;
1818     default:                assert(false, "wrong opcode");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1823   switch (opcode) {
1824     case Op_AddReductionVF: addps(dst, src); break;
1825     case Op_AddReductionVD: addpd(dst, src); break;
1826     case Op_MulReductionVF: mulps(dst, src); break;
1827     case Op_MulReductionVD: mulpd(dst, src); break;
1828     default:                assert(false, "%s", NodeClassNames[opcode]);
1829   }
1830 }
1831 
1832 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1833   int vector_len = Assembler::AVX_256bit;
1834 
1835   switch (opcode) {
1836     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1837     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1838     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1839     case Op_MinReductionV:
1840       switch (typ) {
1841         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1842         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1843         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1844         case T_LONG:        assert(UseAVX > 2, "required");
1845                             vpminsq(dst, src1, src2, vector_len); break;
1846         default:            assert(false, "wrong type");
1847       }
1848       break;
1849     case Op_MaxReductionV:
1850       switch (typ) {
1851         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1852         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1853         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1854         case T_LONG:        assert(UseAVX > 2, "required");
1855                             vpmaxsq(dst, src1, src2, vector_len); break;
1856         default:            assert(false, "wrong type");
1857       }
1858       break;
1859     case Op_AddReductionVI:
1860       switch (typ) {
1861         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1862         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1863         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1864         default:            assert(false, "wrong type");
1865       }
1866       break;
1867     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1868     case Op_MulReductionVI:
1869       switch (typ) {
1870         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1871         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1872         default:            assert(false, "wrong type");
1873       }
1874       break;
1875     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1876     default:                assert(false, "wrong opcode");
1877   }
1878 }
1879 
1880 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1881   int vector_len = Assembler::AVX_256bit;
1882 
1883   switch (opcode) {
1884     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1885     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1886     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1887     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1888     default:                assert(false, "%s", NodeClassNames[opcode]);
1889   }
1890 }
1891 
1892 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1893                                   XMMRegister dst, XMMRegister src,
1894                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1895   switch (opcode) {
1896     case Op_AddReductionVF:
1897     case Op_MulReductionVF:
1898       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1899       break;
1900 
1901     case Op_AddReductionVD:
1902     case Op_MulReductionVD:
1903       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1904       break;
1905 
1906     default: assert(false, "wrong opcode");
1907   }
1908 }
1909 
1910 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1911                                             XMMRegister dst, XMMRegister src,
1912                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1913   switch (opcode) {
1914     case Op_AddReductionVF:
1915     case Op_MulReductionVF:
1916       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1917       break;
1918 
1919     case Op_AddReductionVD:
1920     case Op_MulReductionVD:
1921       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1922       break;
1923 
1924     default: assert(false, "%s", NodeClassNames[opcode]);
1925   }
1926 }
1927 
1928 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1929                              Register dst, Register src1, XMMRegister src2,
1930                              XMMRegister vtmp1, XMMRegister vtmp2) {
1931   switch (vlen) {
1932     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1933     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1934     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1935     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1936 
1937     default: assert(false, "wrong vector length");
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1942                              Register dst, Register src1, XMMRegister src2,
1943                              XMMRegister vtmp1, XMMRegister vtmp2) {
1944   switch (vlen) {
1945     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949 
1950     default: assert(false, "wrong vector length");
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1955                              Register dst, Register src1, XMMRegister src2,
1956                              XMMRegister vtmp1, XMMRegister vtmp2) {
1957   switch (vlen) {
1958     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962 
1963     default: assert(false, "wrong vector length");
1964   }
1965 }
1966 
1967 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1968                              Register dst, Register src1, XMMRegister src2,
1969                              XMMRegister vtmp1, XMMRegister vtmp2) {
1970   switch (vlen) {
1971     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975 
1976     default: assert(false, "wrong vector length");
1977   }
1978 }
1979 
1980 #ifdef _LP64
1981 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1982                              Register dst, Register src1, XMMRegister src2,
1983                              XMMRegister vtmp1, XMMRegister vtmp2) {
1984   switch (vlen) {
1985     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988 
1989     default: assert(false, "wrong vector length");
1990   }
1991 }
1992 #endif // _LP64
1993 
1994 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   switch (vlen) {
1996     case 2:
1997       assert(vtmp2 == xnoreg, "");
1998       reduce2F(opcode, dst, src, vtmp1);
1999       break;
2000     case 4:
2001       assert(vtmp2 == xnoreg, "");
2002       reduce4F(opcode, dst, src, vtmp1);
2003       break;
2004     case 8:
2005       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2006       break;
2007     case 16:
2008       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2009       break;
2010     default: assert(false, "wrong vector length");
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2015   switch (vlen) {
2016     case 2:
2017       assert(vtmp2 == xnoreg, "");
2018       reduce2D(opcode, dst, src, vtmp1);
2019       break;
2020     case 4:
2021       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2022       break;
2023     case 8:
2024       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2025       break;
2026     default: assert(false, "wrong vector length");
2027   }
2028 }
2029 
2030 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2031   switch (vlen) {
2032     case 2:
2033       assert(vtmp1 == xnoreg, "");
2034       assert(vtmp2 == xnoreg, "");
2035       unorderedReduce2F(opcode, dst, src);
2036       break;
2037     case 4:
2038       assert(vtmp2 == xnoreg, "");
2039       unorderedReduce4F(opcode, dst, src, vtmp1);
2040       break;
2041     case 8:
2042       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2043       break;
2044     case 16:
2045       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2046       break;
2047     default: assert(false, "wrong vector length");
2048   }
2049 }
2050 
2051 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2052   switch (vlen) {
2053     case 2:
2054       assert(vtmp1 == xnoreg, "");
2055       assert(vtmp2 == xnoreg, "");
2056       unorderedReduce2D(opcode, dst, src);
2057       break;
2058     case 4:
2059       assert(vtmp2 == xnoreg, "");
2060       unorderedReduce4D(opcode, dst, src, vtmp1);
2061       break;
2062     case 8:
2063       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2064       break;
2065     default: assert(false, "wrong vector length");
2066   }
2067 }
2068 
2069 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2070   if (opcode == Op_AddReductionVI) {
2071     if (vtmp1 != src2) {
2072       movdqu(vtmp1, src2);
2073     }
2074     phaddd(vtmp1, vtmp1);
2075   } else {
2076     pshufd(vtmp1, src2, 0x1);
2077     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2078   }
2079   movdl(vtmp2, src1);
2080   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2081   movdl(dst, vtmp1);
2082 }
2083 
2084 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2085   if (opcode == Op_AddReductionVI) {
2086     if (vtmp1 != src2) {
2087       movdqu(vtmp1, src2);
2088     }
2089     phaddd(vtmp1, src2);
2090     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2091   } else {
2092     pshufd(vtmp2, src2, 0xE);
2093     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2094     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2095   }
2096 }
2097 
2098 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   if (opcode == Op_AddReductionVI) {
2100     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2101     vextracti128_high(vtmp2, vtmp1);
2102     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2103     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2104   } else {
2105     vextracti128_high(vtmp1, src2);
2106     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2107     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2108   }
2109 }
2110 
2111 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2112   vextracti64x4_high(vtmp2, src2);
2113   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2114   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2115 }
2116 
2117 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2118   pshufd(vtmp2, src2, 0x1);
2119   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2120   movdqu(vtmp1, vtmp2);
2121   psrldq(vtmp1, 2);
2122   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2123   movdqu(vtmp2, vtmp1);
2124   psrldq(vtmp2, 1);
2125   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2126   movdl(vtmp2, src1);
2127   pmovsxbd(vtmp1, vtmp1);
2128   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2129   pextrb(dst, vtmp1, 0x0);
2130   movsbl(dst, dst);
2131 }
2132 
2133 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   pshufd(vtmp1, src2, 0xE);
2135   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2136   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2137 }
2138 
2139 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   vextracti128_high(vtmp2, src2);
2141   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2142   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2143 }
2144 
2145 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2146   vextracti64x4_high(vtmp1, src2);
2147   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2148   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2149 }
2150 
2151 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2152   pmovsxbw(vtmp2, src2);
2153   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2154 }
2155 
2156 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2157   if (UseAVX > 1) {
2158     int vector_len = Assembler::AVX_256bit;
2159     vpmovsxbw(vtmp1, src2, vector_len);
2160     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2161   } else {
2162     pmovsxbw(vtmp2, src2);
2163     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2164     pshufd(vtmp2, src2, 0x1);
2165     pmovsxbw(vtmp2, src2);
2166     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2167   }
2168 }
2169 
2170 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2171   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2172     int vector_len = Assembler::AVX_512bit;
2173     vpmovsxbw(vtmp1, src2, vector_len);
2174     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175   } else {
2176     assert(UseAVX >= 2,"Should not reach here.");
2177     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2178     vextracti128_high(vtmp2, src2);
2179     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2180   }
2181 }
2182 
2183 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2184   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2185   vextracti64x4_high(vtmp2, src2);
2186   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2187 }
2188 
2189 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2190   if (opcode == Op_AddReductionVI) {
2191     if (vtmp1 != src2) {
2192       movdqu(vtmp1, src2);
2193     }
2194     phaddw(vtmp1, vtmp1);
2195     phaddw(vtmp1, vtmp1);
2196   } else {
2197     pshufd(vtmp2, src2, 0x1);
2198     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2199     movdqu(vtmp1, vtmp2);
2200     psrldq(vtmp1, 2);
2201     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2202   }
2203   movdl(vtmp2, src1);
2204   pmovsxwd(vtmp1, vtmp1);
2205   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2206   pextrw(dst, vtmp1, 0x0);
2207   movswl(dst, dst);
2208 }
2209 
2210 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2211   if (opcode == Op_AddReductionVI) {
2212     if (vtmp1 != src2) {
2213       movdqu(vtmp1, src2);
2214     }
2215     phaddw(vtmp1, src2);
2216   } else {
2217     pshufd(vtmp1, src2, 0xE);
2218     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2219   }
2220   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2221 }
2222 
2223 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2224   if (opcode == Op_AddReductionVI) {
2225     int vector_len = Assembler::AVX_256bit;
2226     vphaddw(vtmp2, src2, src2, vector_len);
2227     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2228   } else {
2229     vextracti128_high(vtmp2, src2);
2230     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2231   }
2232   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2233 }
2234 
2235 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2236   int vector_len = Assembler::AVX_256bit;
2237   vextracti64x4_high(vtmp1, src2);
2238   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2239   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2240 }
2241 
2242 #ifdef _LP64
2243 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2244   pshufd(vtmp2, src2, 0xE);
2245   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2246   movdq(vtmp1, src1);
2247   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2248   movdq(dst, vtmp1);
2249 }
2250 
2251 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   vextracti128_high(vtmp1, src2);
2253   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2254   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2255 }
2256 
2257 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2258   vextracti64x4_high(vtmp2, src2);
2259   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2260   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2261 }
2262 
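     // Build a mask register whose lowest 'len' bits are set: temp = -1, then
     // BZHI clears every bit at position 'len' and above before moving it to dst.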
2263 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2264   mov64(temp, -1L);
2265   bzhiq(temp, temp, len);
2266   kmovql(dst, temp);
2267 }
2268 #endif // _LP64
2269 
2270 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2271   reduce_operation_128(T_FLOAT, opcode, dst, src);
2272   pshufd(vtmp, src, 0x1);
2273   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2274 }
2275 
2276 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2277   reduce2F(opcode, dst, src, vtmp);
2278   pshufd(vtmp, src, 0x2);
2279   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2280   pshufd(vtmp, src, 0x3);
2281   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2282 }
2283 
2284 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2285   reduce4F(opcode, dst, src, vtmp2);
2286   vextractf128_high(vtmp2, src);
2287   reduce4F(opcode, dst, vtmp2, vtmp1);
2288 }
2289 
2290 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2291   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2292   vextracti64x4_high(vtmp1, src);
2293   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2294 }
2295 
2296 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2297   pshufd(dst, src, 0x1);
2298   reduce_operation_128(T_FLOAT, opcode, dst, src);
2299 }
2300 
2301 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2302   pshufd(vtmp, src, 0xE);
2303   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2304   unorderedReduce2F(opcode, dst, vtmp);
2305 }
2306 
2307 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2308   vextractf128_high(vtmp1, src);
2309   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2310   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2311 }
2312 
2313 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2314   vextractf64x4_high(vtmp2, src);
2315   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2316   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2317 }
2318 
2319 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2320   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2321   pshufd(vtmp, src, 0xE);
2322   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2323 }
2324 
2325 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2326   reduce2D(opcode, dst, src, vtmp2);
2327   vextractf128_high(vtmp2, src);
2328   reduce2D(opcode, dst, vtmp2, vtmp1);
2329 }
2330 
2331 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2332   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2333   vextracti64x4_high(vtmp1, src);
2334   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2335 }
2336 
2337 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2338   pshufd(dst, src, 0xE);
2339   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2340 }
2341 
2342 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2343   vextractf128_high(vtmp, src);
2344   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2345   unorderedReduce2D(opcode, dst, vtmp);
2346 }
2347 
2348 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2349   vextractf64x4_high(vtmp2, src);
2350   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2351   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2352 }
2353 
2354 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2355   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2356 }
2357 
2358 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2359   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2360 }
2361 
2362 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2363                                  int vec_enc) {
2364   switch(elem_bt) {
2365     case T_INT:
2366     case T_FLOAT:
2367       vmaskmovps(dst, src, mask, vec_enc);
2368       break;
2369     case T_LONG:
2370     case T_DOUBLE:
2371       vmaskmovpd(dst, src, mask, vec_enc);
2372       break;
2373     default:
2374       fatal("Unsupported type %s", type2name(elem_bt));
2375       break;
2376   }
2377 }
2378 
2379 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2380                                  int vec_enc) {
2381   switch(elem_bt) {
2382     case T_INT:
2383     case T_FLOAT:
2384       vmaskmovps(dst, src, mask, vec_enc);
2385       break;
2386     case T_LONG:
2387     case T_DOUBLE:
2388       vmaskmovpd(dst, src, mask, vec_enc);
2389       break;
2390     default:
2391       fatal("Unsupported type %s", type2name(elem_bt));
2392       break;
2393   }
2394 }
2395 
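     // Min/max reduction over float lanes: repeatedly fold the upper half of the
     // working vector onto the lower half (vextract for the 512/256-bit steps,
     // vpermilps within a 128-bit lane), combining with vminmax_fp at each step.
     // When is_dst_valid, the incoming value in dst is folded in at the end.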
2396 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2397                                           XMMRegister dst, XMMRegister src,
2398                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2399                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2400   const int permconst[] = {1, 14};
2401   XMMRegister wsrc = src;
2402   XMMRegister wdst = xmm_0;
2403   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2404 
2405   int vlen_enc = Assembler::AVX_128bit;
2406   if (vlen == 16) {
2407     vlen_enc = Assembler::AVX_256bit;
2408   }
2409 
2410   for (int i = log2(vlen) - 1; i >=0; i--) {
2411     if (i == 0 && !is_dst_valid) {
2412       wdst = dst;
2413     }
2414     if (i == 3) {
2415       vextracti64x4_high(wtmp, wsrc);
2416     } else if (i == 2) {
2417       vextracti128_high(wtmp, wsrc);
2418     } else { // i = [0,1]
2419       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2420     }
2421     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2422     wsrc = wdst;
2423     vlen_enc = Assembler::AVX_128bit;
2424   }
2425   if (is_dst_valid) {
2426     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2427   }
2428 }
2429 
2430 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2431                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2432                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2433   XMMRegister wsrc = src;
2434   XMMRegister wdst = xmm_0;
2435   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2436   int vlen_enc = Assembler::AVX_128bit;
2437   if (vlen == 8) {
2438     vlen_enc = Assembler::AVX_256bit;
2439   }
2440   for (int i = log2(vlen) - 1; i >=0; i--) {
2441     if (i == 0 && !is_dst_valid) {
2442       wdst = dst;
2443     }
2444     if (i == 1) {
2445       vextracti128_high(wtmp, wsrc);
2446     } else if (i == 2) {
2447       vextracti64x4_high(wtmp, wsrc);
2448     } else {
2449       assert(i == 0, "%d", i);
2450       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2451     }
2452     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2453     wsrc = wdst;
2454     vlen_enc = Assembler::AVX_128bit;
2455   }
2456   if (is_dst_valid) {
2457     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2458   }
2459 }
2460 
2461 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2462   switch (bt) {
2463     case T_BYTE:  pextrb(dst, src, idx); break;
2464     case T_SHORT: pextrw(dst, src, idx); break;
2465     case T_INT:   pextrd(dst, src, idx); break;
2466     case T_LONG:  pextrq(dst, src, idx); break;
2467 
2468     default:
2469       assert(false,"Should not reach here.");
2470       break;
2471   }
2472 }
2473 
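     // Return the 128-bit lane of src that contains element 'elemindex':
     // lane 0 is src itself, higher lanes are extracted into dst first.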
2474 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2475   int esize =  type2aelembytes(typ);
2476   int elem_per_lane = 16/esize;
2477   int lane = elemindex / elem_per_lane;
2478   int eindex = elemindex % elem_per_lane;
2479 
2480   if (lane >= 2) {
2481     assert(UseAVX > 2, "required");
2482     vextractf32x4(dst, src, lane & 3);
2483     return dst;
2484   } else if (lane > 0) {
2485     assert(UseAVX > 0, "required");
2486     vextractf128(dst, src, lane);
2487     return dst;
2488   } else {
2489     return src;
2490   }
2491 }
2492 
2493 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2494   if (typ == T_BYTE) {
2495     movsbl(dst, dst);
2496   } else if (typ == T_SHORT) {
2497     movswl(dst, dst);
2498   }
2499 }
2500 
2501 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2502   int esize =  type2aelembytes(typ);
2503   int elem_per_lane = 16/esize;
2504   int eindex = elemindex % elem_per_lane;
2505   assert(is_integral_type(typ),"required");
2506 
2507   if (eindex == 0) {
2508     if (typ == T_LONG) {
2509       movq(dst, src);
2510     } else {
2511       movdl(dst, src);
2512       movsxl(typ, dst);
2513     }
2514   } else {
2515     extract(typ, dst, src, eindex);
2516     movsxl(typ, dst);
2517   }
2518 }
2519 
2520 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2521   int esize =  type2aelembytes(typ);
2522   int elem_per_lane = 16/esize;
2523   int eindex = elemindex % elem_per_lane;
2524   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2525 
2526   if (eindex == 0) {
2527     movq(dst, src);
2528   } else {
2529     if (typ == T_FLOAT) {
2530       if (UseAVX == 0) {
2531         movdqu(dst, src);
2532         shufps(dst, dst, eindex);
2533       } else {
2534         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2535       }
2536     } else {
2537       if (UseAVX == 0) {
2538         movdqu(dst, src);
2539         psrldq(dst, eindex*esize);
2540       } else {
2541         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2542       }
2543       movq(dst, dst);
2544     }
2545   }
2546   // Zero upper bits
2547   if (typ == T_FLOAT) {
2548     if (UseAVX == 0) {
2549       assert(vtmp != xnoreg, "required.");
2550       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2551       pand(dst, vtmp);
2552     } else {
2553       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2554     }
2555   }
2556 }
2557 
2558 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2559   switch(typ) {
2560     case T_BYTE:
2561     case T_BOOLEAN:
2562       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2563       break;
2564     case T_SHORT:
2565     case T_CHAR:
2566       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2567       break;
2568     case T_INT:
2569     case T_FLOAT:
2570       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2571       break;
2572     case T_LONG:
2573     case T_DOUBLE:
2574       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2575       break;
2576     default:
2577       assert(false,"Should not reach here.");
2578       break;
2579   }
2580 }
2581 
2582 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2583   assert(rscratch != noreg || always_reachable(src2), "missing");
2584 
2585   switch(typ) {
2586     case T_BOOLEAN:
2587     case T_BYTE:
2588       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2589       break;
2590     case T_CHAR:
2591     case T_SHORT:
2592       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2593       break;
2594     case T_INT:
2595     case T_FLOAT:
2596       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2597       break;
2598     case T_LONG:
2599     case T_DOUBLE:
2600       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2601       break;
2602     default:
2603       assert(false,"Should not reach here.");
2604       break;
2605   }
2606 }
2607 
2608 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2609   switch(typ) {
2610     case T_BYTE:
2611       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2612       break;
2613     case T_SHORT:
2614       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2615       break;
2616     case T_INT:
2617     case T_FLOAT:
2618       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2619       break;
2620     case T_LONG:
2621     case T_DOUBLE:
2622       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2623       break;
2624     default:
2625       assert(false,"Should not reach here.");
2626       break;
2627   }
2628 }
2629 
2630 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2631   assert(vlen_in_bytes <= 32, "");
2632   int esize = type2aelembytes(bt);
2633   if (vlen_in_bytes == 32) {
2634     assert(vtmp == xnoreg, "required.");
2635     if (esize >= 4) {
2636       vtestps(src1, src2, AVX_256bit);
2637     } else {
2638       vptest(src1, src2, AVX_256bit);
2639     }
2640     return;
2641   }
2642   if (vlen_in_bytes < 16) {
2643     // Duplicate the lower part to fill the whole register;
2644     // there is no need to do so for src2.
2645     assert(vtmp != xnoreg, "required");
2646     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2647     pshufd(vtmp, src1, shuffle_imm);
2648   } else {
2649     assert(vtmp == xnoreg, "required");
2650     vtmp = src1;
2651   }
2652   if (esize >= 4 && VM_Version::supports_avx()) {
2653     vtestps(vtmp, src2, AVX_128bit);
2654   } else {
2655     ptest(vtmp, src2);
2656   }
2657 }
2658 
2659 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2660   assert(UseAVX >= 2, "required");
2661 #ifdef ASSERT
2662   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2663   bool is_bw_supported = VM_Version::supports_avx512bw();
2664   if (is_bw && !is_bw_supported) {
2665     assert(vlen_enc != Assembler::AVX_512bit, "required");
2666     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2667            "XMM register should be 0-15");
2668   }
2669 #endif // ASSERT
2670   switch (elem_bt) {
2671     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2672     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2673     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2674     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2675     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2676     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2677     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2678   }
2679 }
2680 
2681 #ifdef _LP64
2682 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2683   assert(UseAVX >= 2, "required");
2684   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2685   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2686   if ((UseAVX > 2) &&
2687       (!is_bw || VM_Version::supports_avx512bw()) &&
2688       (!is_vl || VM_Version::supports_avx512vl())) {
2689     switch (elem_bt) {
2690       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2691       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2692       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2693       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2694       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2695     }
2696   } else {
2697     assert(vlen_enc != Assembler::AVX_512bit, "required");
2698     assert((dst->encoding() < 16),"XMM register should be 0-15");
2699     switch (elem_bt) {
2700       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2701       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2702       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2703       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2704       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2705       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2706       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2707     }
2708   }
2709 }
2710 #endif
2711 
2712 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2713   switch (to_elem_bt) {
2714     case T_SHORT:
2715       vpmovsxbw(dst, src, vlen_enc);
2716       break;
2717     case T_INT:
2718       vpmovsxbd(dst, src, vlen_enc);
2719       break;
2720     case T_FLOAT:
2721       vpmovsxbd(dst, src, vlen_enc);
2722       vcvtdq2ps(dst, dst, vlen_enc);
2723       break;
2724     case T_LONG:
2725       vpmovsxbq(dst, src, vlen_enc);
2726       break;
2727     case T_DOUBLE: {
2728       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2729       vpmovsxbd(dst, src, mid_vlen_enc);
2730       vcvtdq2pd(dst, dst, vlen_enc);
2731       break;
2732     }
2733     default:
2734       fatal("Unsupported type %s", type2name(to_elem_bt));
2735       break;
2736   }
2737 }
2738 
2739 //-------------------------------------------------------------------------------------------
2740 
2741 // IndexOf for constant substrings with size >= 8 chars
2742 // which don't need to be loaded through the stack.
2743 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2744                                          Register cnt1, Register cnt2,
2745                                          int int_cnt2,  Register result,
2746                                          XMMRegister vec, Register tmp,
2747                                          int ae) {
2748   ShortBranchVerifier sbv(this);
2749   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2750   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2751 
2752   // This method uses the pcmpestri instruction with bound registers
2753   //   inputs:
2754   //     xmm - substring
2755   //     rax - substring length (elements count)
2756   //     mem - scanned string
2757   //     rdx - string length (elements count)
2758   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2759   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2760   //   outputs:
2761   //     rcx - matched index in string
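       //   flags (per the Intel SDM):
       //     CF - set when a candidate match is found (used by jcc(below, ...))
       //     OF - copy of bit 0 of the match mask, i.e. a match starting at
       //          element 0 (used by jcc(overflow, ...))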
2762   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2763   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2764   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2765   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2766   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2767 
2768   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2769         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2770         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2771 
2772   // Note, inline_string_indexOf() generates checks:
2773   // if (substr.count > string.count) return -1;
2774   // if (substr.count == 0) return 0;
2775   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2776 
2777   // Load substring.
2778   if (ae == StrIntrinsicNode::UL) {
2779     pmovzxbw(vec, Address(str2, 0));
2780   } else {
2781     movdqu(vec, Address(str2, 0));
2782   }
2783   movl(cnt2, int_cnt2);
2784   movptr(result, str1); // string addr
2785 
2786   if (int_cnt2 > stride) {
2787     jmpb(SCAN_TO_SUBSTR);
2788 
2789     // Reload substr for rescan; this code
2790     // is executed only for large substrings (> 8 chars).
2791     bind(RELOAD_SUBSTR);
2792     if (ae == StrIntrinsicNode::UL) {
2793       pmovzxbw(vec, Address(str2, 0));
2794     } else {
2795       movdqu(vec, Address(str2, 0));
2796     }
2797     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2798 
2799     bind(RELOAD_STR);
2800     // We came here after the beginning of the substring was
2801     // matched but the rest of it was not, so we need to search
2802     // again. Start from the next element after the previous match.
2803 
2804     // cnt2 is the number of remaining substring elements and
2805     // cnt1 is the number of remaining string elements when the cmp failed.
2806     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2807     subl(cnt1, cnt2);
2808     addl(cnt1, int_cnt2);
2809     movl(cnt2, int_cnt2); // Now restore cnt2
2810 
2811     decrementl(cnt1);     // Shift to next element
2812     cmpl(cnt1, cnt2);
2813     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2814 
2815     addptr(result, (1<<scale1));
2816 
2817   } // (int_cnt2 > 8)
2818 
2819   // Scan string for start of substr in 16-byte vectors
2820   bind(SCAN_TO_SUBSTR);
2821   pcmpestri(vec, Address(result, 0), mode);
2822   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2823   subl(cnt1, stride);
2824   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2825   cmpl(cnt1, cnt2);
2826   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2827   addptr(result, 16);
2828   jmpb(SCAN_TO_SUBSTR);
2829 
2830   // Found a potential substr
2831   bind(FOUND_CANDIDATE);
2832   // Matched whole vector if first element matched (tmp(rcx) == 0).
2833   if (int_cnt2 == stride) {
2834     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2835   } else { // int_cnt2 > 8
2836     jccb(Assembler::overflow, FOUND_SUBSTR);
2837   }
2838   // After pcmpestri tmp(rcx) contains matched element index
2839   // Compute start addr of substr
2840   lea(result, Address(result, tmp, scale1));
2841 
2842   // Make sure string is still long enough
2843   subl(cnt1, tmp);
2844   cmpl(cnt1, cnt2);
2845   if (int_cnt2 == stride) {
2846     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2847   } else { // int_cnt2 > 8
2848     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2849   }
2850   // Left less than substring.
2851 
2852   bind(RET_NOT_FOUND);
2853   movl(result, -1);
2854   jmp(EXIT);
2855 
2856   if (int_cnt2 > stride) {
2857     // This code is optimized for the case when whole substring
2858     // is matched if its head is matched.
2859     bind(MATCH_SUBSTR_HEAD);
2860     pcmpestri(vec, Address(result, 0), mode);
2861     // Reload only the string if it does not match
2862     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2863 
2864     Label CONT_SCAN_SUBSTR;
2865     // Compare the rest of substring (> 8 chars).
2866     bind(FOUND_SUBSTR);
2867     // First 8 chars are already matched.
2868     negptr(cnt2);
2869     addptr(cnt2, stride);
2870 
2871     bind(SCAN_SUBSTR);
2872     subl(cnt1, stride);
2873     cmpl(cnt2, -stride); // Do not read beyond substring
2874     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2875     // Back-up strings to avoid reading beyond substring:
2876     // cnt1 = cnt1 - cnt2 + 8
2877     addl(cnt1, cnt2); // cnt2 is negative
2878     addl(cnt1, stride);
2879     movl(cnt2, stride); negptr(cnt2);
2880     bind(CONT_SCAN_SUBSTR);
2881     if (int_cnt2 < (int)G) {
2882       int tail_off1 = int_cnt2<<scale1;
2883       int tail_off2 = int_cnt2<<scale2;
2884       if (ae == StrIntrinsicNode::UL) {
2885         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2886       } else {
2887         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2888       }
2889       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2890     } else {
2891       // calculate index in register to avoid integer overflow (int_cnt2*2)
2892       movl(tmp, int_cnt2);
2893       addptr(tmp, cnt2);
2894       if (ae == StrIntrinsicNode::UL) {
2895         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2896       } else {
2897         movdqu(vec, Address(str2, tmp, scale2, 0));
2898       }
2899       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2900     }
2901     // Need to reload the string pointers if we did not match the whole vector
2902     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2903     addptr(cnt2, stride);
2904     jcc(Assembler::negative, SCAN_SUBSTR);
2905     // Fall through if found full substring
2906 
2907   } // (int_cnt2 > 8)
2908 
2909   bind(RET_FOUND);
2910   // Found result if we matched full small substring.
2911   // Compute substr offset
2912   subptr(result, str1);
2913   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2914     shrl(result, 1); // index
2915   }
2916   bind(EXIT);
2917 
2918 } // string_indexofC8
2919 
2920 // Small strings are loaded through the stack if they cross a page boundary.
2921 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2922                                        Register cnt1, Register cnt2,
2923                                        int int_cnt2,  Register result,
2924                                        XMMRegister vec, Register tmp,
2925                                        int ae) {
2926   ShortBranchVerifier sbv(this);
2927   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2928   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2929 
2930   //
2931   // int_cnt2 is length of small (< 8 chars) constant substring
2932   // or (-1) for non constant substring in which case its length
2933   // is in cnt2 register.
2934   //
2935   // Note, inline_string_indexOf() generates checks:
2936   // if (substr.count > string.count) return -1;
2937   // if (substr.count == 0) return 0;
2938   //
2939   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2940   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2941   // This method uses the pcmpestri instruction with bound registers
2942   //   inputs:
2943   //     xmm - substring
2944   //     rax - substring length (elements count)
2945   //     mem - scanned string
2946   //     rdx - string length (elements count)
2947   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2948   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2949   //   outputs:
2950   //     rcx - matched index in string
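       //   flags: same usage as in string_indexofC8 above (CF = candidate found,
       //   OF = match starting at element 0)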
2951   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2952   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2953   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2954   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2955 
2956   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2957         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2958         FOUND_CANDIDATE;
2959 
2960   { //========================================================
2961     // We don't know where these strings are located
2962     // and we can't read beyond them. Load them through the stack.
2963     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2964 
2965     movptr(tmp, rsp); // save old SP
2966 
2967     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2968       if (int_cnt2 == (1>>scale2)) { // One byte
2969         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2970         load_unsigned_byte(result, Address(str2, 0));
2971         movdl(vec, result); // move 32 bits
2972       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2973         // Not enough header space in 32-bit VM: 12+3 = 15.
2974         movl(result, Address(str2, -1));
2975         shrl(result, 8);
2976         movdl(vec, result); // move 32 bits
2977       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2978         load_unsigned_short(result, Address(str2, 0));
2979         movdl(vec, result); // move 32 bits
2980       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2981         movdl(vec, Address(str2, 0)); // move 32 bits
2982       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2983         movq(vec, Address(str2, 0));  // move 64 bits
2984       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2985         // Array header size is 12 bytes in 32-bit VM
2986         // + 6 bytes for 3 chars == 18 bytes,
2987         // enough space to load vec and shift.
2988         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2989         if (ae == StrIntrinsicNode::UL) {
2990           int tail_off = int_cnt2-8;
2991           pmovzxbw(vec, Address(str2, tail_off));
2992           psrldq(vec, -2*tail_off);
2993         }
2994         else {
2995           int tail_off = int_cnt2*(1<<scale2);
2996           movdqu(vec, Address(str2, tail_off-16));
2997           psrldq(vec, 16-tail_off);
2998         }
2999       }
3000     } else { // not constant substring
3001       cmpl(cnt2, stride);
3002       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3003 
3004       // We can read beyond the string if str2+16 does not cross a page boundary
3005       // since heaps are aligned and mapped by pages.
3006       assert(os::vm_page_size() < (int)G, "default page should be small");
3007       movl(result, str2); // We need only low 32 bits
3008       andl(result, ((int)os::vm_page_size()-1));
3009       cmpl(result, ((int)os::vm_page_size()-16));
3010       jccb(Assembler::belowEqual, CHECK_STR);
3011 
3012       // Move small strings to the stack to allow loading 16 bytes into vec.
3013       subptr(rsp, 16);
3014       int stk_offset = wordSize-(1<<scale2);
3015       push(cnt2);
3016 
3017       bind(COPY_SUBSTR);
3018       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3019         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3020         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3021       } else if (ae == StrIntrinsicNode::UU) {
3022         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3023         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3024       }
3025       decrement(cnt2);
3026       jccb(Assembler::notZero, COPY_SUBSTR);
3027 
3028       pop(cnt2);
3029       movptr(str2, rsp);  // New substring address
3030     } // non constant
3031 
3032     bind(CHECK_STR);
3033     cmpl(cnt1, stride);
3034     jccb(Assembler::aboveEqual, BIG_STRINGS);
3035 
3036     // Check cross page boundary.
3037     movl(result, str1); // We need only low 32 bits
3038     andl(result, ((int)os::vm_page_size()-1));
3039     cmpl(result, ((int)os::vm_page_size()-16));
3040     jccb(Assembler::belowEqual, BIG_STRINGS);
3041 
3042     subptr(rsp, 16);
3043     int stk_offset = -(1<<scale1);
3044     if (int_cnt2 < 0) { // not constant
3045       push(cnt2);
3046       stk_offset += wordSize;
3047     }
3048     movl(cnt2, cnt1);
3049 
3050     bind(COPY_STR);
3051     if (ae == StrIntrinsicNode::LL) {
3052       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3053       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3054     } else {
3055       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3056       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3057     }
3058     decrement(cnt2);
3059     jccb(Assembler::notZero, COPY_STR);
3060 
3061     if (int_cnt2 < 0) { // not constant
3062       pop(cnt2);
3063     }
3064     movptr(str1, rsp);  // New string address
3065 
3066     bind(BIG_STRINGS);
3067     // Load substring.
3068     if (int_cnt2 < 0) { // -1
3069       if (ae == StrIntrinsicNode::UL) {
3070         pmovzxbw(vec, Address(str2, 0));
3071       } else {
3072         movdqu(vec, Address(str2, 0));
3073       }
3074       push(cnt2);       // substr count
3075       push(str2);       // substr addr
3076       push(str1);       // string addr
3077     } else {
3078       // Small (< 8 chars) constant substrings are loaded already.
3079       movl(cnt2, int_cnt2);
3080     }
3081     push(tmp);  // original SP
3082 
3083   } // Finished loading
3084 
3085   //========================================================
3086   // Start search
3087   //
3088 
3089   movptr(result, str1); // string addr
3090 
3091   if (int_cnt2  < 0) {  // Only for non constant substring
3092     jmpb(SCAN_TO_SUBSTR);
3093 
3094     // SP saved at sp+0
3095     // String saved at sp+1*wordSize
3096     // Substr saved at sp+2*wordSize
3097     // Substr count saved at sp+3*wordSize
3098 
3099     // Reload substr for rescan; this code
3100     // is executed only for large substrings (> 8 chars).
3101     bind(RELOAD_SUBSTR);
3102     movptr(str2, Address(rsp, 2*wordSize));
3103     movl(cnt2, Address(rsp, 3*wordSize));
3104     if (ae == StrIntrinsicNode::UL) {
3105       pmovzxbw(vec, Address(str2, 0));
3106     } else {
3107       movdqu(vec, Address(str2, 0));
3108     }
3109     // We came here after the beginning of the substring was
3110     // matched but the rest of it was not, so we need to search
3111     // again. Start from the next element after the previous match.
3112     subptr(str1, result); // Restore counter
3113     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3114       shrl(str1, 1);
3115     }
3116     addl(cnt1, str1);
3117     decrementl(cnt1);   // Shift to next element
3118     cmpl(cnt1, cnt2);
3119     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3120 
3121     addptr(result, (1<<scale1));
3122   } // non constant
3123 
3124   // Scan string for start of substr in 16-byte vectors
3125   bind(SCAN_TO_SUBSTR);
3126   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3127   pcmpestri(vec, Address(result, 0), mode);
3128   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3129   subl(cnt1, stride);
3130   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3131   cmpl(cnt1, cnt2);
3132   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3133   addptr(result, 16);
3134 
3135   bind(ADJUST_STR);
3136   cmpl(cnt1, stride); // Do not read beyond string
3137   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3138   // Back-up string to avoid reading beyond string.
3139   lea(result, Address(result, cnt1, scale1, -16));
3140   movl(cnt1, stride);
3141   jmpb(SCAN_TO_SUBSTR);
3142 
3143   // Found a potential substr
3144   bind(FOUND_CANDIDATE);
3145   // After pcmpestri tmp(rcx) contains matched element index
3146 
3147   // Make sure string is still long enough
3148   subl(cnt1, tmp);
3149   cmpl(cnt1, cnt2);
3150   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3151   // Left less than substring.
3152 
3153   bind(RET_NOT_FOUND);
3154   movl(result, -1);
3155   jmp(CLEANUP);
3156 
3157   bind(FOUND_SUBSTR);
3158   // Compute start addr of substr
3159   lea(result, Address(result, tmp, scale1));
3160   if (int_cnt2 > 0) { // Constant substring
3161     // Repeat search for small substring (< 8 chars)
3162     // from new point without reloading substring.
3163     // Have to check that we don't read beyond string.
3164     cmpl(tmp, stride-int_cnt2);
3165     jccb(Assembler::greater, ADJUST_STR);
3166     // Fall through if matched whole substring.
3167   } else { // non constant
3168     assert(int_cnt2 == -1, "should be != 0");
3169 
3170     addl(tmp, cnt2);
3171     // Found result if we matched whole substring.
3172     cmpl(tmp, stride);
3173     jcc(Assembler::lessEqual, RET_FOUND);
3174 
3175     // Repeat search for small substring (<= 8 chars)
3176     // from new point 'str1' without reloading substring.
3177     cmpl(cnt2, stride);
3178     // Have to check that we don't read beyond string.
3179     jccb(Assembler::lessEqual, ADJUST_STR);
3180 
3181     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3182     // Compare the rest of substring (> 8 chars).
3183     movptr(str1, result);
3184 
3185     cmpl(tmp, cnt2);
3186     // First 8 chars are already matched.
3187     jccb(Assembler::equal, CHECK_NEXT);
3188 
3189     bind(SCAN_SUBSTR);
3190     pcmpestri(vec, Address(str1, 0), mode);
3191     // Need to reload the string pointers if we did not match the whole vector
3192     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3193 
3194     bind(CHECK_NEXT);
3195     subl(cnt2, stride);
3196     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3197     addptr(str1, 16);
3198     if (ae == StrIntrinsicNode::UL) {
3199       addptr(str2, 8);
3200     } else {
3201       addptr(str2, 16);
3202     }
3203     subl(cnt1, stride);
3204     cmpl(cnt2, stride); // Do not read beyond substring
3205     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3206     // Back-up strings to avoid reading beyond substring.
3207 
3208     if (ae == StrIntrinsicNode::UL) {
3209       lea(str2, Address(str2, cnt2, scale2, -8));
3210       lea(str1, Address(str1, cnt2, scale1, -16));
3211     } else {
3212       lea(str2, Address(str2, cnt2, scale2, -16));
3213       lea(str1, Address(str1, cnt2, scale1, -16));
3214     }
3215     subl(cnt1, cnt2);
3216     movl(cnt2, stride);
3217     addl(cnt1, stride);
3218     bind(CONT_SCAN_SUBSTR);
3219     if (ae == StrIntrinsicNode::UL) {
3220       pmovzxbw(vec, Address(str2, 0));
3221     } else {
3222       movdqu(vec, Address(str2, 0));
3223     }
3224     jmp(SCAN_SUBSTR);
3225 
3226     bind(RET_FOUND_LONG);
3227     movptr(str1, Address(rsp, wordSize));
3228   } // non constant
3229 
3230   bind(RET_FOUND);
3231   // Compute substr offset
3232   subptr(result, str1);
3233   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3234     shrl(result, 1); // index
3235   }
3236   bind(CLEANUP);
3237   pop(rsp); // restore SP
3238 
3239 } // string_indexof
3240 
3241 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3242                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3243   ShortBranchVerifier sbv(this);
3244   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3245 
3246   int stride = 8;
3247 
3248   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3249         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3250         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3251         FOUND_SEQ_CHAR, DONE_LABEL;
3252 
3253   movptr(result, str1);
3254   if (UseAVX >= 2) {
3255     cmpl(cnt1, stride);
3256     jcc(Assembler::less, SCAN_TO_CHAR);
3257     cmpl(cnt1, 2*stride);
3258     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3259     movdl(vec1, ch);
3260     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3261     vpxor(vec2, vec2);
3262     movl(tmp, cnt1);
3263     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3264     andl(cnt1,0x0000000F);  //tail count (in chars)
3265 
3266     bind(SCAN_TO_16_CHAR_LOOP);
3267     vmovdqu(vec3, Address(result, 0));
3268     vpcmpeqw(vec3, vec3, vec1, 1);
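         // vec2 is all-zero, so vptest sets CF only when vec3 (the compare result) is all
         // zero; a clear CF therefore means at least one 16-bit lane matched ch.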
3269     vptest(vec2, vec3);
3270     jcc(Assembler::carryClear, FOUND_CHAR);
3271     addptr(result, 32);
3272     subl(tmp, 2*stride);
3273     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3274     jmp(SCAN_TO_8_CHAR);
3275     bind(SCAN_TO_8_CHAR_INIT);
3276     movdl(vec1, ch);
3277     pshuflw(vec1, vec1, 0x00);
3278     pshufd(vec1, vec1, 0);
3279     pxor(vec2, vec2);
3280   }
3281   bind(SCAN_TO_8_CHAR);
3282   cmpl(cnt1, stride);
3283   jcc(Assembler::less, SCAN_TO_CHAR);
3284   if (UseAVX < 2) {
3285     movdl(vec1, ch);
3286     pshuflw(vec1, vec1, 0x00);
3287     pshufd(vec1, vec1, 0);
3288     pxor(vec2, vec2);
3289   }
3290   movl(tmp, cnt1);
3291   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3292   andl(cnt1,0x00000007);  //tail count (in chars)
3293 
3294   bind(SCAN_TO_8_CHAR_LOOP);
3295   movdqu(vec3, Address(result, 0));
3296   pcmpeqw(vec3, vec1);
3297   ptest(vec2, vec3);
3298   jcc(Assembler::carryClear, FOUND_CHAR);
3299   addptr(result, 16);
3300   subl(tmp, stride);
3301   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3302   bind(SCAN_TO_CHAR);
3303   testl(cnt1, cnt1);
3304   jcc(Assembler::zero, RET_NOT_FOUND);
3305   bind(SCAN_TO_CHAR_LOOP);
3306   load_unsigned_short(tmp, Address(result, 0));
3307   cmpl(ch, tmp);
3308   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3309   addptr(result, 2);
3310   subl(cnt1, 1);
3311   jccb(Assembler::zero, RET_NOT_FOUND);
3312   jmp(SCAN_TO_CHAR_LOOP);
3313 
3314   bind(RET_NOT_FOUND);
3315   movl(result, -1);
3316   jmpb(DONE_LABEL);
3317 
3318   bind(FOUND_CHAR);
3319   if (UseAVX >= 2) {
3320     vpmovmskb(tmp, vec3);
3321   } else {
3322     pmovmskb(tmp, vec3);
3323   }
3324   bsfl(ch, tmp);
3325   addptr(result, ch);
3326 
3327   bind(FOUND_SEQ_CHAR);
3328   subptr(result, str1);
3329   shrl(result, 1);
3330 
3331   bind(DONE_LABEL);
3332 } // string_indexof_char
3333 
3334 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3335                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3336   ShortBranchVerifier sbv(this);
3337   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3338 
3339   int stride = 16;
3340 
3341   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3342         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3343         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3344         FOUND_SEQ_CHAR, DONE_LABEL;
3345 
3346   movptr(result, str1);
3347   if (UseAVX >= 2) {
3348     cmpl(cnt1, stride);
3349     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3350     cmpl(cnt1, stride*2);
3351     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3352     movdl(vec1, ch);
3353     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3354     vpxor(vec2, vec2);
3355     movl(tmp, cnt1);
3356     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3357     andl(cnt1,0x0000001F);  //tail count (in chars)
3358 
3359     bind(SCAN_TO_32_CHAR_LOOP);
3360     vmovdqu(vec3, Address(result, 0));
3361     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
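         // Same vptest trick as in string_indexof_char above: a clear CF means some byte
         // lane matched ch.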
3362     vptest(vec2, vec3);
3363     jcc(Assembler::carryClear, FOUND_CHAR);
3364     addptr(result, 32);
3365     subl(tmp, stride*2);
3366     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3367     jmp(SCAN_TO_16_CHAR);
3368 
3369     bind(SCAN_TO_16_CHAR_INIT);
3370     movdl(vec1, ch);
3371     pxor(vec2, vec2);
3372     pshufb(vec1, vec2);
3373   }
3374 
3375   bind(SCAN_TO_16_CHAR);
3376   cmpl(cnt1, stride);
3377   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3378   if (UseAVX < 2) {
3379     movdl(vec1, ch);
3380     pxor(vec2, vec2);
3381     pshufb(vec1, vec2);
3382   }
3383   movl(tmp, cnt1);
3384   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3385   andl(cnt1,0x0000000F);  //tail count (in bytes)
3386 
3387   bind(SCAN_TO_16_CHAR_LOOP);
3388   movdqu(vec3, Address(result, 0));
3389   pcmpeqb(vec3, vec1);
3390   ptest(vec2, vec3);
3391   jcc(Assembler::carryClear, FOUND_CHAR);
3392   addptr(result, 16);
3393   subl(tmp, stride);
3394   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3395 
3396   bind(SCAN_TO_CHAR_INIT);
3397   testl(cnt1, cnt1);
3398   jcc(Assembler::zero, RET_NOT_FOUND);
3399   bind(SCAN_TO_CHAR_LOOP);
3400   load_unsigned_byte(tmp, Address(result, 0));
3401   cmpl(ch, tmp);
3402   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3403   addptr(result, 1);
3404   subl(cnt1, 1);
3405   jccb(Assembler::zero, RET_NOT_FOUND);
3406   jmp(SCAN_TO_CHAR_LOOP);
3407 
3408   bind(RET_NOT_FOUND);
3409   movl(result, -1);
3410   jmpb(DONE_LABEL);
3411 
3412   bind(FOUND_CHAR);
3413   if (UseAVX >= 2) {
3414     vpmovmskb(tmp, vec3);
3415   } else {
3416     pmovmskb(tmp, vec3);
3417   }
3418   bsfl(ch, tmp);
3419   addptr(result, ch);
3420 
3421   bind(FOUND_SEQ_CHAR);
3422   subptr(result, str1);
3423 
3424   bind(DONE_LABEL);
3425 } // stringL_indexof_char
3426 
3427 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3428   switch (eltype) {
3429   case T_BOOLEAN: return sizeof(jboolean);
3430   case T_BYTE:  return sizeof(jbyte);
3431   case T_SHORT: return sizeof(jshort);
3432   case T_CHAR:  return sizeof(jchar);
3433   case T_INT:   return sizeof(jint);
3434   default:
3435     ShouldNotReachHere();
3436     return -1;
3437   }
3438 }
3439 
3440 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3441   switch (eltype) {
3442   // T_BOOLEAN used as surrogate for unsigned byte
3443   case T_BOOLEAN: movzbl(dst, src);   break;
3444   case T_BYTE:    movsbl(dst, src);   break;
3445   case T_SHORT:   movswl(dst, src);   break;
3446   case T_CHAR:    movzwl(dst, src);   break;
3447   case T_INT:     movl(dst, src);     break;
3448   default:
3449     ShouldNotReachHere();
3450   }
3451 }
3452 
3453 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3454   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3455 }
3456 
3457 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3458   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3459 }
3460 
3461 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3462   const int vlen = Assembler::AVX_256bit;
3463   switch (eltype) {
3464   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3465   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3466   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3467   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3468   case T_INT:
3469     // do nothing
3470     break;
3471   default:
3472     ShouldNotReachHere();
3473   }
3474 }
3475 
3476 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3477                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3478                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3479                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3480                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3481                                         BasicType eltype) {
3482   ShortBranchVerifier sbv(this);
3483   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3484   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3485   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3486 
3487   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3488         SHORT_UNROLLED_LOOP_EXIT,
3489         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3490         UNROLLED_VECTOR_LOOP_BEGIN,
3491         END;
3492   switch (eltype) {
3493   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3494   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3495   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3496   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3497   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3498   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3499   }
3500 
3501   // "Renaming" registers for readability of the code
3502   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3503                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3504                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3505 
3506   const int elsize = arrays_hashcode_elsize(eltype);
3507 
3508   /*
3509     if (cnt1 >= 2) {
3510       if (cnt1 >= 32) {
3511         UNROLLED VECTOR LOOP
3512       }
3513       UNROLLED SCALAR LOOP
3514     }
3515     SINGLE SCALAR
3516    */
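       // The value computed is the usual polynomial hash
       //   result = result*31^cnt1 + sum(ary1[i]*31^(cnt1-1-i)),
       // evaluated with Horner's rule. The vector loop below consumes 32 elements per
       // iteration using the precomputed powers of 31 from
       // StubRoutines::x86::arrays_hashcode_powers_of_31.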
3517 
3518   cmpl(cnt1, 32);
3519   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3520 
3521   // cnt1 >= 32 && generate_vectorized_loop
3522   xorl(index, index);
3523 
3524   // vresult = IntVector.zero(I256);
3525   for (int idx = 0; idx < 4; idx++) {
3526     vpxor(vresult[idx], vresult[idx]);
3527   }
3528   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3529   Register bound = tmp2;
3530   Register next = tmp3;
3531   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3532   movl(next, Address(tmp2, 0));
3533   movdl(vnext, next);
3534   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3535 
3536   // index = 0;
3537   // bound = cnt1 & ~(32 - 1);
3538   movl(bound, cnt1);
3539   andl(bound, ~(32 - 1));
3540   // for (; index < bound; index += 32) {
3541   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3542   // result *= next;
3543   imull(result, next);
3544   // Loop fission to front-load the cost of fetching from memory; OOO execution
3545   // can then hopefully do a better job of prefetching.
3546   for (int idx = 0; idx < 4; idx++) {
3547     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3548   }
3549   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3550   for (int idx = 0; idx < 4; idx++) {
3551     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3552     arrays_hashcode_elvcast(vtmp[idx], eltype);
3553     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3554   }
3555   // index += 32;
3556   addl(index, 32);
3557   // index < bound;
3558   cmpl(index, bound);
3559   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3560   // }
3561 
3562   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3563   subl(cnt1, bound);
3564   // release bound
3565 
3566   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3567   for (int idx = 0; idx < 4; idx++) {
3568     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3569     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3570     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3571   }
3572   // result += vresult.reduceLanes(ADD);
3573   for (int idx = 0; idx < 4; idx++) {
3574     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3575   }
3576 
3577   // } else if (cnt1 < 32) {
3578 
3579   bind(SHORT_UNROLLED_BEGIN);
3580   // int i = 1;
3581   movl(index, 1);
3582   cmpl(index, cnt1);
3583   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3584 
3585   // for (; i < cnt1 ; i += 2) {
3586   bind(SHORT_UNROLLED_LOOP_BEGIN);
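       // result = result*31*31 + ary1[i-1]*31 + ary1[i]
       // (961 == 31*31, and 31*x is computed below as (x << 5) - x)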
3587   movl(tmp3, 961);
3588   imull(result, tmp3);
3589   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3590   movl(tmp3, tmp2);
3591   shll(tmp3, 5);
3592   subl(tmp3, tmp2);
3593   addl(result, tmp3);
3594   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3595   addl(result, tmp3);
3596   addl(index, 2);
3597   cmpl(index, cnt1);
3598   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3599 
3600   // }
3601   // if (i >= cnt1) {
3602   bind(SHORT_UNROLLED_LOOP_EXIT);
3603   jccb(Assembler::greater, END);
3604   movl(tmp2, result);
3605   shll(result, 5);
3606   subl(result, tmp2);
3607   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3608   addl(result, tmp3);
3609   // }
3610   bind(END);
3611 
3612   BLOCK_COMMENT("} // arrays_hashcode");
3613 
3614 } // arrays_hashcode
3615 
3616 // helper function for string_compare
3617 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3618                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3619                                            Address::ScaleFactor scale2, Register index, int ae) {
3620   if (ae == StrIntrinsicNode::LL) {
3621     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3622     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3623   } else if (ae == StrIntrinsicNode::UU) {
3624     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3625     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3626   } else {
3627     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3628     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3629   }
3630 }
3631 
3632 // Compare strings, used for char[] and byte[].
3633 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3634                                        Register cnt1, Register cnt2, Register result,
3635                                        XMMRegister vec1, int ae, KRegister mask) {
3636   ShortBranchVerifier sbv(this);
3637   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3638   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3639   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3640   int stride2x2 = 0x40;
3641   Address::ScaleFactor scale = Address::no_scale;
3642   Address::ScaleFactor scale1 = Address::no_scale;
3643   Address::ScaleFactor scale2 = Address::no_scale;
3644 
3645   if (ae != StrIntrinsicNode::LL) {
3646     stride2x2 = 0x20;
3647   }
3648 
3649   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3650     shrl(cnt2, 1);
3651   }
3652   // Compute the minimum of the string lengths and push the
3653   // difference of the string lengths onto the stack.
3654   // Use a conditional move to compute the minimum into cnt2.
3655   movl(result, cnt1);
3656   subl(cnt1, cnt2);
3657   push(cnt1);
3658   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3659 
3660   // Is the minimum length zero?
3661   testl(cnt2, cnt2);
3662   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3663   if (ae == StrIntrinsicNode::LL) {
3664     // Load first bytes
3665     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3666     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3667   } else if (ae == StrIntrinsicNode::UU) {
3668     // Load first characters
3669     load_unsigned_short(result, Address(str1, 0));
3670     load_unsigned_short(cnt1, Address(str2, 0));
3671   } else {
3672     load_unsigned_byte(result, Address(str1, 0));
3673     load_unsigned_short(cnt1, Address(str2, 0));
3674   }
3675   subl(result, cnt1);
3676   jcc(Assembler::notZero,  POP_LABEL);
3677 
3678   if (ae == StrIntrinsicNode::UU) {
3679     // Divide length by 2 to get number of chars
3680     shrl(cnt2, 1);
3681   }
3682   cmpl(cnt2, 1);
3683   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3684 
3685   // Check if the strings start at the same location and set up scale and stride
3686   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3687     cmpptr(str1, str2);
3688     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3689     if (ae == StrIntrinsicNode::LL) {
3690       scale = Address::times_1;
3691       stride = 16;
3692     } else {
3693       scale = Address::times_2;
3694       stride = 8;
3695     }
3696   } else {
3697     scale1 = Address::times_1;
3698     scale2 = Address::times_2;
3699     // scale not used
3700     stride = 8;
3701   }
3702 
3703   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3704     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3705     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3706     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3707     Label COMPARE_TAIL_LONG;
3708     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3709 
3710     int pcmpmask = 0x19;
3711     if (ae == StrIntrinsicNode::LL) {
3712       pcmpmask &= ~0x01;
3713     }
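         // 0x19 = 0b011001: unsigned word elements, equal-each aggregation, negated
         // polarity, so rcx reports the first mismatching element; clearing bit 0
         // (for LL) selects unsigned byte elements instead.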
3714 
3715     // Set up to compare 16-char (32-byte) vectors,
3716     // starting from the first character again because it has an aligned address.
3717     if (ae == StrIntrinsicNode::LL) {
3718       stride2 = 32;
3719     } else {
3720       stride2 = 16;
3721     }
3722     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3723       adr_stride = stride << scale;
3724     } else {
3725       adr_stride1 = 8;  //stride << scale1;
3726       adr_stride2 = 16; //stride << scale2;
3727     }
3728 
3729     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3730     // rax and rdx are used by pcmpestri as elements counters
3731     movl(result, cnt2);
3732     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3733     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3734 
3735     // fast path : compare first 2 8-char vectors.
3736     bind(COMPARE_16_CHARS);
3737     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3738       movdqu(vec1, Address(str1, 0));
3739     } else {
3740       pmovzxbw(vec1, Address(str1, 0));
3741     }
3742     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3743     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3744 
3745     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3746       movdqu(vec1, Address(str1, adr_stride));
3747       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3748     } else {
3749       pmovzxbw(vec1, Address(str1, adr_stride1));
3750       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3751     }
3752     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3753     addl(cnt1, stride);
3754 
3755     // Compare the characters at index in cnt1
3756     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3757     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3758     subl(result, cnt2);
3759     jmp(POP_LABEL);
3760 
3761     // Set up the registers to start the vector comparison loop
3762     bind(COMPARE_WIDE_VECTORS);
3763     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3764       lea(str1, Address(str1, result, scale));
3765       lea(str2, Address(str2, result, scale));
3766     } else {
3767       lea(str1, Address(str1, result, scale1));
3768       lea(str2, Address(str2, result, scale2));
3769     }
3770     subl(result, stride2);
3771     subl(cnt2, stride2);
3772     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3773     negptr(result);
3774 
3775     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3776     bind(COMPARE_WIDE_VECTORS_LOOP);
3777 
3778 #ifdef _LP64
3779     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3780       cmpl(cnt2, stride2x2);
3781       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3782       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3783       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3784 
3785       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3786       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3787         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3788         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3789       } else {
3790         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3791         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3792       }
3793       kortestql(mask, mask);
3794       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3795       addptr(result, stride2x2);  // update since we already compared at this addr
3796       subl(cnt2, stride2x2);      // and sub the size too
3797       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3798 
3799       vpxor(vec1, vec1);
3800       jmpb(COMPARE_WIDE_TAIL);
3801     }//if (VM_Version::supports_avx512vlbw())
3802 #endif // _LP64
3803 
3804 
3805     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3806     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3807       vmovdqu(vec1, Address(str1, result, scale));
3808       vpxor(vec1, Address(str2, result, scale));
3809     } else {
3810       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3811       vpxor(vec1, Address(str2, result, scale2));
3812     }
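         // vec1 now holds the XOR of the two chunks; any non-zero bit means a mismatch.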
3813     vptest(vec1, vec1);
3814     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3815     addptr(result, stride2);
3816     subl(cnt2, stride2);
3817     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3818     // clean upper bits of YMM registers
3819     vpxor(vec1, vec1);
3820 
3821     // compare wide vectors tail
3822     bind(COMPARE_WIDE_TAIL);
3823     testptr(result, result);
3824     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3825 
3826     movl(result, stride2);
3827     movl(cnt2, result);
3828     negptr(result);
3829     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3830 
3831     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3832     bind(VECTOR_NOT_EQUAL);
3833     // clean upper bits of YMM registers
3834     vpxor(vec1, vec1);
3835     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3836       lea(str1, Address(str1, result, scale));
3837       lea(str2, Address(str2, result, scale));
3838     } else {
3839       lea(str1, Address(str1, result, scale1));
3840       lea(str2, Address(str2, result, scale2));
3841     }
3842     jmp(COMPARE_16_CHARS);
3843 
3844     // Compare tail chars, length between 1 and 15 chars
3845     bind(COMPARE_TAIL_LONG);
3846     movl(cnt2, result);
3847     cmpl(cnt2, stride);
3848     jcc(Assembler::less, COMPARE_SMALL_STR);
3849 
3850     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3851       movdqu(vec1, Address(str1, 0));
3852     } else {
3853       pmovzxbw(vec1, Address(str1, 0));
3854     }
3855     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3856     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3857     subptr(cnt2, stride);
3858     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3859     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3860       lea(str1, Address(str1, result, scale));
3861       lea(str2, Address(str2, result, scale));
3862     } else {
3863       lea(str1, Address(str1, result, scale1));
3864       lea(str2, Address(str2, result, scale2));
3865     }
3866     negptr(cnt2);
3867     jmpb(WHILE_HEAD_LABEL);
3868 
3869     bind(COMPARE_SMALL_STR);
3870   } else if (UseSSE42Intrinsics) {
3871     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3872     int pcmpmask = 0x19;
3873     // Set up to compare 8-char (16-byte) vectors,
3874     // starting from the first character again because it has an aligned address.
3875     movl(result, cnt2);
3876     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3877     if (ae == StrIntrinsicNode::LL) {
3878       pcmpmask &= ~0x01;
3879     }
3880     jcc(Assembler::zero, COMPARE_TAIL);
3881     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3882       lea(str1, Address(str1, result, scale));
3883       lea(str2, Address(str2, result, scale));
3884     } else {
3885       lea(str1, Address(str1, result, scale1));
3886       lea(str2, Address(str2, result, scale2));
3887     }
3888     negptr(result);
3889 
3890     // pcmpestri
3891     //   inputs:
3892     //     vec1- substring
3893     //     rax - negative string length (elements count)
3894     //     mem - scanned string
3895     //     rdx - string length (elements count)
3896     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3897     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3898     //   outputs:
3899     //     rcx - first mismatched element index
3900     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3901 
3902     bind(COMPARE_WIDE_VECTORS);
3903     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3904       movdqu(vec1, Address(str1, result, scale));
3905       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3906     } else {
3907       pmovzxbw(vec1, Address(str1, result, scale1));
3908       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3909     }
3910     // After pcmpestri cnt1(rcx) contains mismatched element index
3911 
3912     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3913     addptr(result, stride);
3914     subptr(cnt2, stride);
3915     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3916 
3917     // compare wide vectors tail
3918     testptr(result, result);
3919     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3920 
3921     movl(cnt2, stride);
3922     movl(result, stride);
3923     negptr(result);
3924     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3925       movdqu(vec1, Address(str1, result, scale));
3926       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3927     } else {
3928       pmovzxbw(vec1, Address(str1, result, scale1));
3929       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3930     }
3931     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3932 
3933     // Mismatched characters in the vectors
3934     bind(VECTOR_NOT_EQUAL);
3935     addptr(cnt1, result);
3936     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3937     subl(result, cnt2);
3938     jmpb(POP_LABEL);
3939 
3940     bind(COMPARE_TAIL); // limit is zero
3941     movl(cnt2, result);
3942     // Fallthru to tail compare
3943   }
3944   // Shift str2 and str1 to the end of the arrays, negate min
3945   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3946     lea(str1, Address(str1, cnt2, scale));
3947     lea(str2, Address(str2, cnt2, scale));
3948   } else {
3949     lea(str1, Address(str1, cnt2, scale1));
3950     lea(str2, Address(str2, cnt2, scale2));
3951   }
3952   decrementl(cnt2);  // first character was compared already
3953   negptr(cnt2);
3954 
3955   // Compare the rest of the elements
3956   bind(WHILE_HEAD_LABEL);
3957   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3958   subl(result, cnt1);
3959   jccb(Assembler::notZero, POP_LABEL);
3960   increment(cnt2);
3961   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3962 
3963   // Strings are equal up to min length.  Return the length difference.
3964   bind(LENGTH_DIFF_LABEL);
3965   pop(result);
3966   if (ae == StrIntrinsicNode::UU) {
3967     // Divide diff by 2 to get number of chars
3968     sarl(result, 1);
3969   }
3970   jmpb(DONE_LABEL);
3971 
3972 #ifdef _LP64
3973   if (VM_Version::supports_avx512vlbw()) {
3974 
3975     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3976 
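         // mask has a 1 for every byte position that compared equal; invert it and use
         // bsf to locate the first mismatching byte within the 64-byte block.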
3977     kmovql(cnt1, mask);
3978     notq(cnt1);
3979     bsfq(cnt2, cnt1);
3980     if (ae != StrIntrinsicNode::LL) {
3981       // Divide diff by 2 to get number of chars
3982       sarl(cnt2, 1);
3983     }
3984     addq(result, cnt2);
3985     if (ae == StrIntrinsicNode::LL) {
3986       load_unsigned_byte(cnt1, Address(str2, result));
3987       load_unsigned_byte(result, Address(str1, result));
3988     } else if (ae == StrIntrinsicNode::UU) {
3989       load_unsigned_short(cnt1, Address(str2, result, scale));
3990       load_unsigned_short(result, Address(str1, result, scale));
3991     } else {
3992       load_unsigned_short(cnt1, Address(str2, result, scale2));
3993       load_unsigned_byte(result, Address(str1, result, scale1));
3994     }
3995     subl(result, cnt1);
3996     jmpb(POP_LABEL);
3997   }//if (VM_Version::supports_avx512vlbw())
3998 #endif // _LP64
3999 
4000   // Discard the stored length difference
4001   bind(POP_LABEL);
4002   pop(cnt1);
4003 
4004   // That's it
4005   bind(DONE_LABEL);
4006   if (ae == StrIntrinsicNode::UL) {
4007     negl(result);
4008   }
4009 
4010 }
4011 
4012 // Search for Non-ASCII character (Negative byte value) in a byte array,
4013 // return the index of the first such character, otherwise the length
4014 // of the array segment searched.
4015 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4016 //   @IntrinsicCandidate
4017 //   public static int countPositives(byte[] ba, int off, int len) {
4018 //     for (int i = off; i < off + len; i++) {
4019 //       if (ba[i] < 0) {
4020 //         return i - off;
4021 //       }
4022 //     }
4023 //     return len;
4024 //   }
4025 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4026   Register result, Register tmp1,
4027   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4028   // rsi: byte array
4029   // rcx: len
4030   // rax: result
4031   ShortBranchVerifier sbv(this);
4032   assert_different_registers(ary1, len, result, tmp1);
4033   assert_different_registers(vec1, vec2);
4034   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4035 
4036   movl(result, len); // copy
4037   // len == 0
4038   testl(len, len);
4039   jcc(Assembler::zero, DONE);
4040 
4041   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4042     VM_Version::supports_avx512vlbw() &&
4043     VM_Version::supports_bmi2()) {
4044 
4045     Label test_64_loop, test_tail, BREAK_LOOP;
4046     movl(tmp1, len);
4047     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4048 
4049     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4050     andl(len,  0xffffffc0); // vector count (in chars)
4051     jccb(Assembler::zero, test_tail);
4052 
4053     lea(ary1, Address(ary1, len, Address::times_1));
4054     negptr(len);
4055 
4056     bind(test_64_loop);
4057     // Check whether our 64 elements of size byte contain negatives
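         // (vec2 is all-zero, so mask1 gets a bit set for every byte that is negative)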
4058     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4059     kortestql(mask1, mask1);
4060     jcc(Assembler::notZero, BREAK_LOOP);
4061 
4062     addptr(len, 64);
4063     jccb(Assembler::notZero, test_64_loop);
4064 
4065     bind(test_tail);
4066     // bail out when there is nothing to be done
4067     testl(tmp1, -1);
4068     jcc(Assembler::zero, DONE);
4069 
4070 
4071     // check the tail for absence of negatives
4072     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4073 #ifdef _LP64
4074     {
4075       Register tmp3_aliased = len;
4076       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4077       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4078       notq(tmp3_aliased);
4079       kmovql(mask2, tmp3_aliased);
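           // mask2 now has the low tmp1 bits set, selecting only the tail bytes.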
4080     }
4081 #else
4082     Label k_init;
4083     jmp(k_init);
4084 
4085     // We cannot read 64 bits from a general purpose register, so we move the
4086     // data required to compose the 64-bit mask into the instruction stream.
4087     // We emit a 64-byte wide series of elements 0..63 which is later used as a
4088     // compare target against the tail count contained in the tmp1 register.
4089     // The result is a k register with tmp1 consecutive 1s, counting from the
4090     // least significant bit.
4091     address tmp = pc();
4092     emit_int64(0x0706050403020100);
4093     emit_int64(0x0F0E0D0C0B0A0908);
4094     emit_int64(0x1716151413121110);
4095     emit_int64(0x1F1E1D1C1B1A1918);
4096     emit_int64(0x2726252423222120);
4097     emit_int64(0x2F2E2D2C2B2A2928);
4098     emit_int64(0x3736353433323130);
4099     emit_int64(0x3F3E3D3C3B3A3938);
4100 
4101     bind(k_init);
4102     lea(len, InternalAddress(tmp));
4103     // create mask to test for negative byte inside a vector
4104     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4105     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4106 
4107 #endif
4108     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4109     ktestq(mask1, mask2);
4110     jcc(Assembler::zero, DONE);
4111 
4112     // do a full check for negative bytes in the tail
4113     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4114                      // ary1 already pointing to the right place
4115     jmpb(TAIL_START);
4116 
4117     bind(BREAK_LOOP);
4118     // At least one byte in the last 64 byte block was negative.
4119     // Set up to look at the last 64 bytes as if they were a tail
4120     lea(ary1, Address(ary1, len, Address::times_1));
4121     addptr(result, len);
4122     // Ignore the very last byte: if all others are positive,
4123     // it must be negative, so we can skip right to the 2+1 byte
4124     // end comparison at this point
4125     orl(result, 63);
4126     movl(len, 63);
4127     // Fallthru to tail compare
4128   } else {
4129 
4130     if (UseAVX >= 2 && UseSSE >= 2) {
4131       // With AVX2, use 32-byte vector compare
4132       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4133 
4134       // Compare 32-byte vectors
4135       testl(len, 0xffffffe0);   // vector count (in bytes)
4136       jccb(Assembler::zero, TAIL_START);
4137 
4138       andl(len, 0xffffffe0);
4139       lea(ary1, Address(ary1, len, Address::times_1));
4140       negptr(len);
4141 
4142       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4143       movdl(vec2, tmp1);
4144       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4145 
4146       bind(COMPARE_WIDE_VECTORS);
4147       vmovdqu(vec1, Address(ary1, len, Address::times_1));
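           // vec2 holds 0x80808080..., so vptest sets ZF only if no loaded byte has its
           // sign bit set; notZero therefore means a negative byte is in this 32-byte block.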
4148       vptest(vec1, vec2);
4149       jccb(Assembler::notZero, BREAK_LOOP);
4150       addptr(len, 32);
4151       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4152 
4153       testl(result, 0x0000001f);   // any bytes remaining?
4154       jcc(Assembler::zero, DONE);
4155 
4156       // Quick test using the already prepared vector mask
4157       movl(len, result);
4158       andl(len, 0x0000001f);
4159       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4160       vptest(vec1, vec2);
4161       jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
4163       jmpb(TAIL_START);
4164 
4165       bind(BREAK_LOOP);
4166       // At least one byte in the last 32-byte vector is negative.
4167       // Set up to look at the last 32 bytes as if they were a tail
4168       lea(ary1, Address(ary1, len, Address::times_1));
4169       addptr(result, len);
4170       // Ignore the very last byte: if all others are positive,
4171       // it must be negative, so we can skip right to the 2+1 byte
4172       // end comparison at this point
4173       orl(result, 31);
4174       movl(len, 31);
4175       // Fallthru to tail compare
4176     } else if (UseSSE42Intrinsics) {
4177       // With SSE4.2, use double quad vector compare
4178       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4179 
4180       // Compare 16-byte vectors
4181       testl(len, 0xfffffff0);   // vector count (in bytes)
4182       jcc(Assembler::zero, TAIL_START);
4183 
4184       andl(len, 0xfffffff0);
4185       lea(ary1, Address(ary1, len, Address::times_1));
4186       negptr(len);
4187 
4188       movl(tmp1, 0x80808080);
4189       movdl(vec2, tmp1);
4190       pshufd(vec2, vec2, 0);
4191 
4192       bind(COMPARE_WIDE_VECTORS);
4193       movdqu(vec1, Address(ary1, len, Address::times_1));
4194       ptest(vec1, vec2);
4195       jccb(Assembler::notZero, BREAK_LOOP);
4196       addptr(len, 16);
4197       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4198 
4199       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4200       jcc(Assembler::zero, DONE);
4201 
4202       // Quick test using the already prepared vector mask
4203       movl(len, result);
4204       andl(len, 0x0000000f);   // tail count (in bytes)
4205       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4206       ptest(vec1, vec2);
4207       jcc(Assembler::zero, DONE);
4208       jmpb(TAIL_START);
4209 
4210       bind(BREAK_LOOP);
4211       // At least one byte in the last 16-byte vector is negative.
      // Set up to look at the last 16 bytes as if they were a tail
4213       lea(ary1, Address(ary1, len, Address::times_1));
4214       addptr(result, len);
4215       // Ignore the very last byte: if all others are positive,
4216       // it must be negative, so we can skip right to the 2+1 byte
4217       // end comparison at this point
4218       orl(result, 15);
4219       movl(len, 15);
4220       // Fallthru to tail compare
4221     }
4222   }
4223 
4224   bind(TAIL_START);
4225   // Compare 4-byte vectors
4226   andl(len, 0xfffffffc); // vector count (in bytes)
4227   jccb(Assembler::zero, COMPARE_CHAR);
4228 
4229   lea(ary1, Address(ary1, len, Address::times_1));
4230   negptr(len);
4231 
4232   bind(COMPARE_VECTORS);
4233   movl(tmp1, Address(ary1, len, Address::times_1));
4234   andl(tmp1, 0x80808080);
4235   jccb(Assembler::notZero, TAIL_ADJUST);
4236   addptr(len, 4);
4237   jccb(Assembler::notZero, COMPARE_VECTORS);
4238 
4239   // Compare trailing char (final 2-3 bytes), if any
4240   bind(COMPARE_CHAR);
4241 
4242   testl(result, 0x2);   // tail  char
4243   jccb(Assembler::zero, COMPARE_BYTE);
4244   load_unsigned_short(tmp1, Address(ary1, 0));
4245   andl(tmp1, 0x00008080);
4246   jccb(Assembler::notZero, CHAR_ADJUST);
4247   lea(ary1, Address(ary1, 2));
4248 
4249   bind(COMPARE_BYTE);
4250   testl(result, 0x1);   // tail  byte
4251   jccb(Assembler::zero, DONE);
4252   load_unsigned_byte(tmp1, Address(ary1, 0));
4253   testl(tmp1, 0x00000080);
4254   jccb(Assembler::zero, DONE);
4255   subptr(result, 1);
4256   jmpb(DONE);
4257 
4258   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes
4261   addptr(result, len);
4262   orl(result, 3);
4263   lea(ary1, Address(ary1, len, Address::times_1));
4264   jmpb(COMPARE_CHAR);
4265 
4266   bind(CHAR_ADJUST);
4267   // We are looking at a char + optional byte tail, and found that one
4268   // of the bytes in the char is negative. Adjust the result, check the
4269   // first byte and readjust if needed.
4270   andl(result, 0xfffffffc);
4271   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4272   jccb(Assembler::notZero, DONE);
4273   addptr(result, 1);
4274 
4275   // That's it
4276   bind(DONE);
4277   if (UseAVX >= 2 && UseSSE >= 2) {
4278     // clean upper bits of YMM registers
4279     vpxor(vec1, vec1);
4280     vpxor(vec2, vec2);
4281   }
4282 }
4283 
4284 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4285 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4286                                       Register limit, Register result, Register chr,
4287                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4288                                       KRegister mask, bool expand_ary2) {
4289   // for expand_ary2, limit is the (smaller) size of the second array.
4290   ShortBranchVerifier sbv(this);
4291   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4292 
4293   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4294          "Expansion only implemented for AVX2");
4295 
4296   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4297   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4298 
4299   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4300   int scaleIncr = expand_ary2 ? 8 : 16;
4301 
4302   if (is_array_equ) {
4303     // Check the input args
4304     cmpoop(ary1, ary2);
4305     jcc(Assembler::equal, TRUE_LABEL);
4306 
4307     // Need additional checks for arrays_equals.
4308     testptr(ary1, ary1);
4309     jcc(Assembler::zero, FALSE_LABEL);
4310     testptr(ary2, ary2);
4311     jcc(Assembler::zero, FALSE_LABEL);
4312 
4313     // Check the lengths
4314     movl(limit, Address(ary1, length_offset));
4315     cmpl(limit, Address(ary2, length_offset));
4316     jcc(Assembler::notEqual, FALSE_LABEL);
4317   }
4318 
4319   // count == 0
4320   testl(limit, limit);
4321   jcc(Assembler::zero, TRUE_LABEL);
4322 
4323   if (is_array_equ) {
4324     // Load array address
4325     lea(ary1, Address(ary1, base_offset));
4326     lea(ary2, Address(ary2, base_offset));
4327   }
4328 
4329   if (is_array_equ && is_char) {
4330     // arrays_equals when used for char[].
4331     shll(limit, 1);      // byte count != 0
4332   }
4333   movl(result, limit); // copy
4334 
4335   if (UseAVX >= 2) {
4336     // With AVX2, use 32-byte vector compare
4337     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4338 
4339     // Compare 32-byte vectors
4340     if (expand_ary2) {
4341       andl(result, 0x0000000f);  //   tail count (in bytes)
4342       andl(limit, 0xfffffff0);   // vector count (in bytes)
4343       jcc(Assembler::zero, COMPARE_TAIL);
4344     } else {
4345       andl(result, 0x0000001f);  //   tail count (in bytes)
4346       andl(limit, 0xffffffe0);   // vector count (in bytes)
4347       jcc(Assembler::zero, COMPARE_TAIL_16);
4348     }
4349 
4350     lea(ary1, Address(ary1, limit, scaleFactor));
4351     lea(ary2, Address(ary2, limit, Address::times_1));
4352     negptr(limit);
4353 
4354 #ifdef _LP64
4355     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4356       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4357 
4358       cmpl(limit, -64);
4359       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4360 
4361       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4362 
4363       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4364       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4365       kortestql(mask, mask);
4366       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4367       addptr(limit, 64);  // update since we already compared at this addr
4368       cmpl(limit, -64);
4369       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4370 
4371       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit, which are
      // no farther than 64 bytes from the ends of the arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
4379       //
      addptr(result, -64);   // it is safe, because we just came from this area
4381       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4382       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4383       kortestql(mask, mask);
4384       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4385 
4386       jmp(TRUE_LABEL);
4387 
4388       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4389 
4390     }//if (VM_Version::supports_avx512vlbw())
4391 #endif //_LP64
4392     bind(COMPARE_WIDE_VECTORS);
4393     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4394     if (expand_ary2) {
4395       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4396     } else {
4397       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4398     }
4399     vpxor(vec1, vec2);
4400 
4401     vptest(vec1, vec1);
4402     jcc(Assembler::notZero, FALSE_LABEL);
4403     addptr(limit, scaleIncr * 2);
4404     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4405 
4406     testl(result, result);
4407     jcc(Assembler::zero, TRUE_LABEL);
4408 
4409     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4410     if (expand_ary2) {
4411       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4412     } else {
4413       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4414     }
4415     vpxor(vec1, vec2);
4416 
4417     vptest(vec1, vec1);
4418     jcc(Assembler::notZero, FALSE_LABEL);
4419     jmp(TRUE_LABEL);
4420 
4421     bind(COMPARE_TAIL_16); // limit is zero
4422     movl(limit, result);
4423 
4424     // Compare 16-byte chunks
4425     andl(result, 0x0000000f);  //   tail count (in bytes)
4426     andl(limit, 0xfffffff0);   // vector count (in bytes)
4427     jcc(Assembler::zero, COMPARE_TAIL);
4428 
4429     lea(ary1, Address(ary1, limit, scaleFactor));
4430     lea(ary2, Address(ary2, limit, Address::times_1));
4431     negptr(limit);
4432 
4433     bind(COMPARE_WIDE_VECTORS_16);
4434     movdqu(vec1, Address(ary1, limit, scaleFactor));
4435     if (expand_ary2) {
4436       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4437     } else {
4438       movdqu(vec2, Address(ary2, limit, Address::times_1));
4439     }
4440     pxor(vec1, vec2);
4441 
4442     ptest(vec1, vec1);
4443     jcc(Assembler::notZero, FALSE_LABEL);
4444     addptr(limit, scaleIncr);
4445     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4446 
4447     bind(COMPARE_TAIL); // limit is zero
4448     movl(limit, result);
4449     // Fallthru to tail compare
4450   } else if (UseSSE42Intrinsics) {
4451     // With SSE4.2, use double quad vector compare
4452     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4453 
4454     // Compare 16-byte vectors
4455     andl(result, 0x0000000f);  //   tail count (in bytes)
4456     andl(limit, 0xfffffff0);   // vector count (in bytes)
4457     jcc(Assembler::zero, COMPARE_TAIL);
4458 
4459     lea(ary1, Address(ary1, limit, Address::times_1));
4460     lea(ary2, Address(ary2, limit, Address::times_1));
4461     negptr(limit);
4462 
4463     bind(COMPARE_WIDE_VECTORS);
4464     movdqu(vec1, Address(ary1, limit, Address::times_1));
4465     movdqu(vec2, Address(ary2, limit, Address::times_1));
4466     pxor(vec1, vec2);
4467 
4468     ptest(vec1, vec1);
4469     jcc(Assembler::notZero, FALSE_LABEL);
4470     addptr(limit, 16);
4471     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4472 
4473     testl(result, result);
4474     jcc(Assembler::zero, TRUE_LABEL);
4475 
4476     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4477     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4478     pxor(vec1, vec2);
4479 
4480     ptest(vec1, vec1);
4481     jccb(Assembler::notZero, FALSE_LABEL);
4482     jmpb(TRUE_LABEL);
4483 
4484     bind(COMPARE_TAIL); // limit is zero
4485     movl(limit, result);
4486     // Fallthru to tail compare
4487   }
4488 
4489   // Compare 4-byte vectors
4490   if (expand_ary2) {
4491     testl(result, result);
4492     jccb(Assembler::zero, TRUE_LABEL);
4493   } else {
4494     andl(limit, 0xfffffffc); // vector count (in bytes)
4495     jccb(Assembler::zero, COMPARE_CHAR);
4496   }
4497 
4498   lea(ary1, Address(ary1, limit, scaleFactor));
4499   lea(ary2, Address(ary2, limit, Address::times_1));
4500   negptr(limit);
4501 
4502   bind(COMPARE_VECTORS);
4503   if (expand_ary2) {
4504     // There are no "vector" operations for bytes to shorts
4505     movzbl(chr, Address(ary2, limit, Address::times_1));
4506     cmpw(Address(ary1, limit, Address::times_2), chr);
4507     jccb(Assembler::notEqual, FALSE_LABEL);
4508     addptr(limit, 1);
4509     jcc(Assembler::notZero, COMPARE_VECTORS);
4510     jmp(TRUE_LABEL);
4511   } else {
4512     movl(chr, Address(ary1, limit, Address::times_1));
4513     cmpl(chr, Address(ary2, limit, Address::times_1));
4514     jccb(Assembler::notEqual, FALSE_LABEL);
4515     addptr(limit, 4);
4516     jcc(Assembler::notZero, COMPARE_VECTORS);
4517   }
4518 
4519   // Compare trailing char (final 2 bytes), if any
4520   bind(COMPARE_CHAR);
4521   testl(result, 0x2);   // tail  char
4522   jccb(Assembler::zero, COMPARE_BYTE);
4523   load_unsigned_short(chr, Address(ary1, 0));
4524   load_unsigned_short(limit, Address(ary2, 0));
4525   cmpl(chr, limit);
4526   jccb(Assembler::notEqual, FALSE_LABEL);
4527 
4528   if (is_array_equ && is_char) {
4529     bind(COMPARE_BYTE);
4530   } else {
4531     lea(ary1, Address(ary1, 2));
4532     lea(ary2, Address(ary2, 2));
4533 
4534     bind(COMPARE_BYTE);
4535     testl(result, 0x1);   // tail  byte
4536     jccb(Assembler::zero, TRUE_LABEL);
4537     load_unsigned_byte(chr, Address(ary1, 0));
4538     load_unsigned_byte(limit, Address(ary2, 0));
4539     cmpl(chr, limit);
4540     jccb(Assembler::notEqual, FALSE_LABEL);
4541   }
4542   bind(TRUE_LABEL);
4543   movl(result, 1);   // return true
4544   jmpb(DONE);
4545 
4546   bind(FALSE_LABEL);
4547   xorl(result, result); // return false
4548 
4549   // That's it
4550   bind(DONE);
4551   if (UseAVX >= 2) {
4552     // clean upper bits of YMM registers
4553     vpxor(vec1, vec1);
4554     vpxor(vec2, vec2);
4555   }
4556 }
4557 
4558 #ifdef _LP64
4559 
4560 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4561 #define __ masm.
4562   Register dst = stub.data<0>();
4563   XMMRegister src = stub.data<1>();
4564   address target = stub.data<2>();
4565   __ bind(stub.entry());
4566   __ subptr(rsp, 8);
4567   __ movdbl(Address(rsp), src);
4568   __ call(RuntimeAddress(target));
4569   __ pop(dst);
4570   __ jmp(stub.continuation());
4571 #undef __
4572 }
4573 
4574 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4575   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4576   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4577 
4578   address slowpath_target;
4579   if (dst_bt == T_INT) {
4580     if (src_bt == T_FLOAT) {
4581       cvttss2sil(dst, src);
4582       cmpl(dst, 0x80000000);
4583       slowpath_target = StubRoutines::x86::f2i_fixup();
4584     } else {
4585       cvttsd2sil(dst, src);
4586       cmpl(dst, 0x80000000);
4587       slowpath_target = StubRoutines::x86::d2i_fixup();
4588     }
4589   } else {
4590     if (src_bt == T_FLOAT) {
4591       cvttss2siq(dst, src);
4592       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4593       slowpath_target = StubRoutines::x86::f2l_fixup();
4594     } else {
4595       cvttsd2siq(dst, src);
4596       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4597       slowpath_target = StubRoutines::x86::d2l_fixup();
4598     }
4599   }
4600 
4601   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4602   jcc(Assembler::equal, stub->entry());
4603   bind(stub->continuation());
4604 }
4605 
4606 #endif // _LP64
4607 
4608 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4609                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4610   switch(ideal_opc) {
4611     case Op_LShiftVS:
4612       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4613     case Op_LShiftVI:
4614       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4615     case Op_LShiftVL:
4616       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4617     case Op_RShiftVS:
4618       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4619     case Op_RShiftVI:
4620       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4621     case Op_RShiftVL:
4622       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4623     case Op_URShiftVS:
4624       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4625     case Op_URShiftVI:
4626       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4627     case Op_URShiftVL:
4628       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4629     case Op_RotateRightV:
4630       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4631     case Op_RotateLeftV:
4632       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4633     default:
4634       fatal("Unsupported masked operation"); break;
4635   }
4636 }
4637 
4638 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4639                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4640                                     bool is_varshift) {
4641   switch (ideal_opc) {
4642     case Op_AddVB:
4643       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4644     case Op_AddVS:
4645       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4646     case Op_AddVI:
4647       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4648     case Op_AddVL:
4649       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4650     case Op_AddVF:
4651       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4652     case Op_AddVD:
4653       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4654     case Op_SubVB:
4655       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4656     case Op_SubVS:
4657       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4658     case Op_SubVI:
4659       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4660     case Op_SubVL:
4661       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4662     case Op_SubVF:
4663       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4664     case Op_SubVD:
4665       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4666     case Op_MulVS:
4667       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4668     case Op_MulVI:
4669       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4670     case Op_MulVL:
4671       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4672     case Op_MulVF:
4673       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4674     case Op_MulVD:
4675       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4676     case Op_DivVF:
4677       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4678     case Op_DivVD:
4679       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4680     case Op_SqrtVF:
4681       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4682     case Op_SqrtVD:
4683       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4684     case Op_AbsVB:
4685       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4686     case Op_AbsVS:
4687       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4688     case Op_AbsVI:
4689       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4690     case Op_AbsVL:
4691       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4692     case Op_FmaVF:
4693       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_FmaVD:
4695       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_VectorRearrange:
4697       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4698     case Op_LShiftVS:
4699       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4700     case Op_LShiftVI:
4701       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4702     case Op_LShiftVL:
4703       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4704     case Op_RShiftVS:
4705       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4706     case Op_RShiftVI:
4707       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4708     case Op_RShiftVL:
4709       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4710     case Op_URShiftVS:
4711       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4712     case Op_URShiftVI:
4713       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4714     case Op_URShiftVL:
4715       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4716     case Op_RotateLeftV:
4717       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_RotateRightV:
4719       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_MaxV:
4721       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_MinV:
4723       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_XorV:
4725       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4726     case Op_OrV:
4727       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4728     case Op_AndV:
4729       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4730     default:
4731       fatal("Unsupported masked operation"); break;
4732   }
4733 }
4734 
4735 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4736                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4737   switch (ideal_opc) {
4738     case Op_AddVB:
4739       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_AddVS:
4741       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_AddVI:
4743       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4744     case Op_AddVL:
4745       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_AddVF:
4747       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4748     case Op_AddVD:
4749       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4750     case Op_SubVB:
4751       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4752     case Op_SubVS:
4753       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4754     case Op_SubVI:
4755       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4756     case Op_SubVL:
4757       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4758     case Op_SubVF:
4759       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4760     case Op_SubVD:
4761       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_MulVS:
4763       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_MulVI:
4765       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_MulVL:
4767       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_MulVF:
4769       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_MulVD:
4771       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_DivVF:
4773       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_DivVD:
4775       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_FmaVF:
4777       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4778     case Op_FmaVD:
4779       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_MaxV:
4781       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4782     case Op_MinV:
4783       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4784     case Op_XorV:
4785       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_OrV:
4787       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_AndV:
4789       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4790     default:
4791       fatal("Unsupported masked operation"); break;
4792   }
4793 }
4794 
4795 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4796                                   KRegister src1, KRegister src2) {
4797   BasicType etype = T_ILLEGAL;
4798   switch(mask_len) {
4799     case 2:
4800     case 4:
4801     case 8:  etype = T_BYTE; break;
4802     case 16: etype = T_SHORT; break;
4803     case 32: etype = T_INT; break;
4804     case 64: etype = T_LONG; break;
4805     default: fatal("Unsupported type"); break;
4806   }
4807   assert(etype != T_ILLEGAL, "");
4808   switch(ideal_opc) {
4809     case Op_AndVMask:
4810       kand(etype, dst, src1, src2); break;
4811     case Op_OrVMask:
4812       kor(etype, dst, src1, src2); break;
4813     case Op_XorVMask:
4814       kxor(etype, dst, src1, src2); break;
4815     default:
4816       fatal("Unsupported masked operation"); break;
4817   }
4818 }
4819 
4820 /*
4821  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4822  * If src is NaN, the result is 0.
4823  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4824  * the result is equal to the value of Integer.MIN_VALUE.
4825  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4826  * the result is equal to the value of Integer.MAX_VALUE.
4827  */
4828 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4829                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4830                                                                    Register rscratch, AddressLiteral float_sign_flip,
4831                                                                    int vec_enc) {
4832   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4833   Label done;
4834   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4835   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4836   vptest(xtmp2, xtmp2, vec_enc);
4837   jccb(Assembler::equal, done);
4838 
4839   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4840   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4841 
4842   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4843   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4844   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4845 
4846   // Recompute the mask for remaining special value.
4847   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4848   // Extract SRC values corresponding to TRUE mask lanes.
4849   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSBs of MASK lanes corresponding to +ve special
  // values are set.
4852   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4853 
4854   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4855   bind(done);
4856 }
4857 
4858 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4859                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4860                                                                     Register rscratch, AddressLiteral float_sign_flip,
4861                                                                     int vec_enc) {
4862   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4863   Label done;
4864   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4865   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4866   kortestwl(ktmp1, ktmp1);
4867   jccb(Assembler::equal, done);
4868 
4869   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4870   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4871   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4872 
4873   kxorwl(ktmp1, ktmp1, ktmp2);
4874   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4875   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4876   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4877   bind(done);
4878 }
4879 
4880 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4881                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4882                                                                      Register rscratch, AddressLiteral double_sign_flip,
4883                                                                      int vec_enc) {
4884   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4885 
4886   Label done;
4887   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4888   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4889   kortestwl(ktmp1, ktmp1);
4890   jccb(Assembler::equal, done);
4891 
4892   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4893   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4894   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4895 
4896   kxorwl(ktmp1, ktmp1, ktmp2);
4897   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4898   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4899   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4900   bind(done);
4901 }
4902 
4903 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4904                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4905                                                                      Register rscratch, AddressLiteral float_sign_flip,
4906                                                                      int vec_enc) {
4907   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4908   Label done;
4909   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4910   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4911   kortestwl(ktmp1, ktmp1);
4912   jccb(Assembler::equal, done);
4913 
4914   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4915   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4916   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4917 
4918   kxorwl(ktmp1, ktmp1, ktmp2);
4919   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4920   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4921   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4922   bind(done);
4923 }
4924 
4925 /*
4926  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4927  * If src is NaN, the result is 0.
4928  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4929  * the result is equal to the value of Long.MIN_VALUE.
4930  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4931  * the result is equal to the value of Long.MAX_VALUE.
4932  */
4933 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4934                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4935                                                                       Register rscratch, AddressLiteral double_sign_flip,
4936                                                                       int vec_enc) {
4937   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4938 
4939   Label done;
4940   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4941   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4942   kortestwl(ktmp1, ktmp1);
4943   jccb(Assembler::equal, done);
4944 
4945   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4946   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4947   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4948 
4949   kxorwl(ktmp1, ktmp1, ktmp2);
4950   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4951   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4952   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4953   bind(done);
4954 }
4955 
4956 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4957                                                              XMMRegister xtmp, int index, int vec_enc) {
4958    assert(vec_enc < Assembler::AVX_512bit, "");
4959    if (vec_enc == Assembler::AVX_256bit) {
4960      vextractf128_high(xtmp, src);
4961      vshufps(dst, src, xtmp, index, vec_enc);
4962    } else {
4963      vshufps(dst, src, zero, index, vec_enc);
4964    }
4965 }
4966 
4967 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4968                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4969                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4970   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4971 
4972   Label done;
4973   // Compare the destination lanes with float_sign_flip
4974   // value to get mask for all special values.
4975   movdqu(xtmp1, float_sign_flip, rscratch);
4976   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4977   ptest(xtmp2, xtmp2);
4978   jccb(Assembler::equal, done);
4979 
4980   // Flip float_sign_flip to get max integer value.
4981   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4982   pxor(xtmp1, xtmp4);
4983 
  // Set destination lanes corresponding to unordered source lanes to zero.
4985   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4986   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4987 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4989   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4990   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4991 
4992   // Recompute the mask for remaining special value.
4993   pxor(xtmp2, xtmp3);
4994   // Extract mask corresponding to non-negative source lanes.
4995   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4996 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4998   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4999   pand(xtmp3, xtmp2);
5000 
  // Replace destination lanes holding the special value (0x80000000) with max int
5002   // if corresponding source lane holds a +ve value.
5003   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5004   bind(done);
5005 }
5006 
5007 
5008 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5009                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5010   switch(to_elem_bt) {
5011     case T_SHORT:
5012       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5013       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5014       vpackusdw(dst, dst, zero, vec_enc);
5015       if (vec_enc == Assembler::AVX_256bit) {
5016         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5017       }
5018       break;
5019     case  T_BYTE:
5020       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5021       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5022       vpackusdw(dst, dst, zero, vec_enc);
5023       if (vec_enc == Assembler::AVX_256bit) {
5024         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5025       }
5026       vpackuswb(dst, dst, zero, vec_enc);
5027       break;
5028     default: assert(false, "%s", type2name(to_elem_bt));
5029   }
5030 }
5031 
5032 /*
5033  * Algorithm for vector D2L and F2I conversions:-
5034  * a) Perform vector D2L/F2I cast.
5035  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5036  *    It signifies that source value could be any of the special floating point
5037  *    values(NaN,-Inf,Inf,Max,-Min).
5038  * c) Set destination to zero if source is NaN value.
5039  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5040  */
5041 
5042 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5043                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5044                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5045   int to_elem_sz = type2aelembytes(to_elem_bt);
5046   assert(to_elem_sz <= 4, "");
5047   vcvttps2dq(dst, src, vec_enc);
5048   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5049   if (to_elem_sz < 4) {
5050     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5051     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5052   }
5053 }
5054 
5055 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5056                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5057                                             Register rscratch, int vec_enc) {
5058   int to_elem_sz = type2aelembytes(to_elem_bt);
5059   assert(to_elem_sz <= 4, "");
5060   vcvttps2dq(dst, src, vec_enc);
5061   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5062   switch(to_elem_bt) {
5063     case T_INT:
5064       break;
5065     case T_SHORT:
5066       evpmovdw(dst, dst, vec_enc);
5067       break;
5068     case T_BYTE:
5069       evpmovdb(dst, dst, vec_enc);
5070       break;
5071     default: assert(false, "%s", type2name(to_elem_bt));
5072   }
5073 }
5074 
5075 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5076                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5077                                             Register rscratch, int vec_enc) {
5078   evcvttps2qq(dst, src, vec_enc);
5079   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5080 }
5081 
5082 // Handling for downcasting from double to integer or sub-word types on AVX2.
5083 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5084                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5085                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5086   int to_elem_sz = type2aelembytes(to_elem_bt);
5087   assert(to_elem_sz < 8, "");
5088   vcvttpd2dq(dst, src, vec_enc);
5089   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5090                                               float_sign_flip, vec_enc);
5091   if (to_elem_sz < 4) {
5092     // xtmp4 holds all zero lanes.
5093     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5094   }
5095 }
5096 
5097 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5098                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5099                                             KRegister ktmp2, AddressLiteral sign_flip,
5100                                             Register rscratch, int vec_enc) {
5101   if (VM_Version::supports_avx512dq()) {
5102     evcvttpd2qq(dst, src, vec_enc);
5103     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5104     switch(to_elem_bt) {
5105       case T_LONG:
5106         break;
5107       case T_INT:
5108         evpmovsqd(dst, dst, vec_enc);
5109         break;
5110       case T_SHORT:
5111         evpmovsqd(dst, dst, vec_enc);
5112         evpmovdw(dst, dst, vec_enc);
5113         break;
5114       case T_BYTE:
5115         evpmovsqd(dst, dst, vec_enc);
5116         evpmovdb(dst, dst, vec_enc);
5117         break;
5118       default: assert(false, "%s", type2name(to_elem_bt));
5119     }
5120   } else {
5121     assert(type2aelembytes(to_elem_bt) <= 4, "");
5122     vcvttpd2dq(dst, src, vec_enc);
5123     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5124     switch(to_elem_bt) {
5125       case T_INT:
5126         break;
5127       case T_SHORT:
5128         evpmovdw(dst, dst, vec_enc);
5129         break;
5130       case T_BYTE:
5131         evpmovdb(dst, dst, vec_enc);
5132         break;
5133       default: assert(false, "%s", type2name(to_elem_bt));
5134     }
5135   }
5136 }
5137 
5138 #ifdef _LP64
5139 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5140                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5141                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode
  // rounding towards -inf, and restore the original MXCSR.RC mode afterwards.
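  // For example (illustrative): with RC = toward -inf, val = 2.5 converts as
  // floor(2.5 + 0.5) = 3 and val = -2.5 as floor(-2.0) = -2, matching
  // java.lang.Math.round semantics.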
5144   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5145 
5146   mov64(tmp, julong_cast(0.5L));
5147   evpbroadcastq(xtmp1, tmp, vec_enc);
5148   vaddpd(xtmp1, src , xtmp1, vec_enc);
5149   evcvtpd2qq(dst, xtmp1, vec_enc);
5150   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5152 
5153   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5154 }
5155 
5156 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5157                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5158                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode
  // rounding towards -inf, and restore the original MXCSR.RC mode afterwards.
5161   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5162 
5163   movl(tmp, jint_cast(0.5));
5164   movq(xtmp1, tmp);
5165   vbroadcastss(xtmp1, xtmp1, vec_enc);
5166   vaddps(xtmp1, src , xtmp1, vec_enc);
5167   vcvtps2dq(dst, xtmp1, vec_enc);
5168   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5169                                               float_sign_flip, vec_enc);
5170 
5171   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5172 }
5173 
5174 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5175                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5176                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode
  // rounding towards -inf, and restore the original MXCSR.RC mode afterwards.
5179   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5180 
5181   movl(tmp, jint_cast(0.5));
5182   movq(xtmp1, tmp);
5183   vbroadcastss(xtmp1, xtmp1, vec_enc);
5184   vaddps(xtmp1, src , xtmp1, vec_enc);
5185   vcvtps2dq(dst, xtmp1, vec_enc);
5186   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5187 
5188   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5189 }
5190 #endif // _LP64
5191 
5192 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5193                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5194   switch (from_elem_bt) {
5195     case T_BYTE:
5196       switch (to_elem_bt) {
5197         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5198         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5199         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5200         default: ShouldNotReachHere();
5201       }
5202       break;
5203     case T_SHORT:
5204       switch (to_elem_bt) {
5205         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5206         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5207         default: ShouldNotReachHere();
5208       }
5209       break;
5210     case T_INT:
5211       assert(to_elem_bt == T_LONG, "");
5212       vpmovzxdq(dst, src, vlen_enc);
5213       break;
5214     default:
5215       ShouldNotReachHere();
5216   }
5217 }
5218 
5219 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5220                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5221   switch (from_elem_bt) {
5222     case T_BYTE:
5223       switch (to_elem_bt) {
5224         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5225         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5226         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5227         default: ShouldNotReachHere();
5228       }
5229       break;
5230     case T_SHORT:
5231       switch (to_elem_bt) {
5232         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5233         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5234         default: ShouldNotReachHere();
5235       }
5236       break;
5237     case T_INT:
5238       assert(to_elem_bt == T_LONG, "");
5239       vpmovsxdq(dst, src, vlen_enc);
5240       break;
5241     default:
5242       ShouldNotReachHere();
5243   }
5244 }
5245 
5246 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5247                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5248   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5249   assert(vlen_enc != AVX_512bit, "");
5250 
5251   int dst_bt_size = type2aelembytes(dst_bt);
5252   int src_bt_size = type2aelembytes(src_bt);
5253   if (dst_bt_size > src_bt_size) {
5254     switch (dst_bt_size / src_bt_size) {
5255       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5256       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5257       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5258       default: ShouldNotReachHere();
5259     }
5260   } else {
5261     assert(dst_bt_size < src_bt_size, "");
5262     switch (src_bt_size / dst_bt_size) {
5263       case 2: {
5264         if (vlen_enc == AVX_128bit) {
5265           vpacksswb(dst, src, src, vlen_enc);
5266         } else {
5267           vpacksswb(dst, src, src, vlen_enc);
5268           vpermq(dst, dst, 0x08, vlen_enc);
5269         }
5270         break;
5271       }
5272       case 4: {
5273         if (vlen_enc == AVX_128bit) {
5274           vpackssdw(dst, src, src, vlen_enc);
5275           vpacksswb(dst, dst, dst, vlen_enc);
5276         } else {
5277           vpackssdw(dst, src, src, vlen_enc);
5278           vpermq(dst, dst, 0x08, vlen_enc);
5279           vpacksswb(dst, dst, dst, AVX_128bit);
5280         }
5281         break;
5282       }
5283       case 8: {
5284         if (vlen_enc == AVX_128bit) {
5285           vpshufd(dst, src, 0x08, vlen_enc);
5286           vpackssdw(dst, dst, dst, vlen_enc);
5287           vpacksswb(dst, dst, dst, vlen_enc);
5288         } else {
5289           vpshufd(dst, src, 0x08, vlen_enc);
5290           vpermq(dst, dst, 0x08, vlen_enc);
5291           vpackssdw(dst, dst, dst, AVX_128bit);
5292           vpacksswb(dst, dst, dst, AVX_128bit);
5293         }
5294         break;
5295       }
5296       default: ShouldNotReachHere();
5297     }
5298   }
5299 }
5300 
5301 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5302                                    bool merge, BasicType bt, int vlen_enc) {
5303   if (bt == T_INT) {
5304     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5305   } else {
5306     assert(bt == T_LONG, "");
5307     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5308   }
5309 }
5310 
5311 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5312                                    bool merge, BasicType bt, int vlen_enc) {
5313   if (bt == T_INT) {
5314     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5315   } else {
5316     assert(bt == T_LONG, "");
5317     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5318   }
5319 }
5320 
5321 #ifdef _LP64
5322 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5323                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5324                                                int vec_enc) {
5325   int index = 0;
5326   int vindex = 0;
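  // pdepq below deposits the low 8 bits of src into bit 0 of successive bytes of
  // rtmp1, e.g. (illustrative) src = 0b1011 gives rtmp1 = 0x0000000001000101,
  // i.e. one byte-sized 0/1 mask lane per source bit.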
5327   mov64(rtmp1, 0x0101010101010101L);
5328   pdepq(rtmp1, src, rtmp1);
5329   if (mask_len > 8) {
5330     movq(rtmp2, src);
5331     vpxor(xtmp, xtmp, xtmp, vec_enc);
5332     movq(xtmp, rtmp1);
5333   }
5334   movq(dst, rtmp1);
5335 
5336   mask_len -= 8;
5337   while (mask_len > 0) {
5338     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5339     index++;
5340     if ((index % 2) == 0) {
5341       pxor(xtmp, xtmp);
5342     }
5343     mov64(rtmp1, 0x0101010101010101L);
5344     shrq(rtmp2, 8);
5345     pdepq(rtmp1, rtmp2, rtmp1);
5346     pinsrq(xtmp, rtmp1, index % 2);
5347     vindex = index / 2;
5348     if (vindex) {
      // Write the entire 16-byte vector when both 64-bit
      // lanes are updated, to save redundant instructions.
5351       if (index % 2) {
5352         vinsertf128(dst, dst, xtmp, vindex);
5353       }
5354     } else {
5355       vmovdqu(dst, xtmp);
5356     }
5357     mask_len -= 8;
5358   }
5359 }
5360 
5361 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5362   switch(opc) {
5363     case Op_VectorMaskTrueCount:
5364       popcntq(dst, tmp);
5365       break;
5366     case Op_VectorMaskLastTrue:
5367       if (VM_Version::supports_lzcnt()) {
5368         lzcntq(tmp, tmp);
5369         movl(dst, 63);
5370         subl(dst, tmp);
5371       } else {
5372         movl(dst, -1);
5373         bsrq(tmp, tmp);
5374         cmov32(Assembler::notZero, dst, tmp);
5375       }
5376       break;
5377     case Op_VectorMaskFirstTrue:
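      // For masklen < 32, setting bit 'masklen' acts as a sentinel so that
      // tzcnt/bsf returns masklen (i.e. "no true lane found") when the mask is empty.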
5378       if (VM_Version::supports_bmi1()) {
5379         if (masklen < 32) {
5380           orl(tmp, 1 << masklen);
5381           tzcntl(dst, tmp);
5382         } else if (masklen == 32) {
5383           tzcntl(dst, tmp);
5384         } else {
5385           assert(masklen == 64, "");
5386           tzcntq(dst, tmp);
5387         }
5388       } else {
5389         if (masklen < 32) {
5390           orl(tmp, 1 << masklen);
5391           bsfl(dst, tmp);
5392         } else {
5393           assert(masklen == 32 || masklen == 64, "");
5394           movl(dst, masklen);
5395           if (masklen == 32)  {
5396             bsfl(tmp, tmp);
5397           } else {
5398             bsfq(tmp, tmp);
5399           }
5400           cmov32(Assembler::notZero, dst, tmp);
5401         }
5402       }
5403       break;
5404     case Op_VectorMaskToLong:
5405       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5406       break;
5407     default: assert(false, "Unhandled mask operation");
5408   }
5409 }
5410 
5411 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5412                                               int masklen, int masksize, int vec_enc) {
5413   assert(VM_Version::supports_popcnt(), "");
5414 
  if (VM_Version::supports_avx512bw()) {
5416     kmovql(tmp, mask);
5417   } else {
5418     assert(masklen <= 16, "");
5419     kmovwl(tmp, mask);
5420   }
5421 
5422   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5423   // operations needs to be clipped.
5424   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5425     andq(tmp, (1 << masklen) - 1);
5426   }
5427 
5428   vector_mask_operation_helper(opc, dst, tmp, masklen);
5429 }
5430 
5431 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5432                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5433   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5434          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5435   assert(VM_Version::supports_popcnt(), "");
5436 
5437   bool need_clip = false;
5438   switch(bt) {
5439     case T_BOOLEAN:
5440       // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
5441       vpxor(xtmp, xtmp, xtmp, vec_enc);
5442       vpsubb(xtmp, xtmp, mask, vec_enc);
5443       vpmovmskb(tmp, xtmp, vec_enc);
5444       need_clip = masklen < 16;
5445       break;
5446     case T_BYTE:
5447       vpmovmskb(tmp, mask, vec_enc);
5448       need_clip = masklen < 16;
5449       break;
5450     case T_SHORT:
5451       vpacksswb(xtmp, mask, mask, vec_enc);
5452       if (masklen >= 16) {
5453         vpermpd(xtmp, xtmp, 8, vec_enc);
5454       }
5455       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5456       need_clip = masklen < 16;
5457       break;
5458     case T_INT:
5459     case T_FLOAT:
5460       vmovmskps(tmp, mask, vec_enc);
5461       need_clip = masklen < 4;
5462       break;
5463     case T_LONG:
5464     case T_DOUBLE:
5465       vmovmskpd(tmp, mask, vec_enc);
5466       need_clip = masklen < 2;
5467       break;
5468     default: assert(false, "Unhandled type, %s", type2name(bt));
5469   }
5470 
5471   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5472   // operations needs to be clipped.
5473   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5474     // need_clip implies masklen < 32
5475     andq(tmp, (1 << masklen) - 1);
5476   }
5477 
5478   vector_mask_operation_helper(opc, dst, tmp, masklen);
5479 }
5480 
5481 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5482                                              Register rtmp2, int mask_len) {
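  // Build a result mask whose popcount(src & lane_mask) low bits are set, using
  // PEXT on an all-ones value. A scalar sketch of the same computation
  // (illustrative only; names are placeholders):
  //   uint64_t lane_mask = ~0ULL >> (64 - mask_len);
  //   uint64_t m         = src_bits & lane_mask;   // active lanes
  //   uint64_t dst_bits  = _pext_u64(~0ULL, m);    // popcount(m) low bits set
  // e.g. src_bits = 0b1011, mask_len = 4  =>  dst_bits = 0b0111.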
5483   kmov(rtmp1, src);
5484   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5485   mov64(rtmp2, -1L);
5486   pextq(rtmp2, rtmp2, rtmp1);
5487   kmov(dst, rtmp2);
5488 }
5489 
5490 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5491                                                     XMMRegister mask, Register rtmp, Register rscratch,
5492                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5493                                                     int vec_enc) {
5494   assert(type2aelembytes(bt) >= 4, "");
5495   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5496   address compress_perm_table = nullptr;
5497   address expand_perm_table = nullptr;
5498   if (type2aelembytes(bt) == 8) {
5499     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5500     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5501     vmovmskpd(rtmp, mask, vec_enc);
5502   } else {
5503     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5504     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5505     vmovmskps(rtmp, mask, vec_enc);
5506   }
5507   shlq(rtmp, 5); // for 32 byte permute row.
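  // Each permute table row is 32 bytes (one 256 bit vector), indexed by the lane
  // mask computed above; for a compress with mask 0b0101 on 32 bit lanes, for
  // instance, the row is expected to gather lanes 0 and 2 into positions 0 and 1.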
5508   if (opcode == Op_CompressV) {
5509     lea(rscratch, ExternalAddress(compress_perm_table));
5510   } else {
5511     lea(rscratch, ExternalAddress(expand_perm_table));
5512   }
5513   addptr(rtmp, rscratch);
5514   vmovdqu(permv, Address(rtmp));
5515   vpermps(dst, permv, src, Assembler::AVX_256bit);
5516   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default),
  // so the row can also be used as a blending mask after compressing/expanding the
  // source vector lanes.
5521   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5522 }
5523 
5524 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5525                                                bool merge, BasicType bt, int vec_enc) {
5526   if (opcode == Op_CompressV) {
5527     switch(bt) {
5528     case T_BYTE:
5529       evpcompressb(dst, mask, src, merge, vec_enc);
5530       break;
5531     case T_CHAR:
5532     case T_SHORT:
5533       evpcompressw(dst, mask, src, merge, vec_enc);
5534       break;
5535     case T_INT:
5536       evpcompressd(dst, mask, src, merge, vec_enc);
5537       break;
5538     case T_FLOAT:
5539       evcompressps(dst, mask, src, merge, vec_enc);
5540       break;
5541     case T_LONG:
5542       evpcompressq(dst, mask, src, merge, vec_enc);
5543       break;
5544     case T_DOUBLE:
5545       evcompresspd(dst, mask, src, merge, vec_enc);
5546       break;
5547     default:
5548       fatal("Unsupported type %s", type2name(bt));
5549       break;
5550     }
5551   } else {
5552     assert(opcode == Op_ExpandV, "");
5553     switch(bt) {
5554     case T_BYTE:
5555       evpexpandb(dst, mask, src, merge, vec_enc);
5556       break;
5557     case T_CHAR:
5558     case T_SHORT:
5559       evpexpandw(dst, mask, src, merge, vec_enc);
5560       break;
5561     case T_INT:
5562       evpexpandd(dst, mask, src, merge, vec_enc);
5563       break;
5564     case T_FLOAT:
5565       evexpandps(dst, mask, src, merge, vec_enc);
5566       break;
5567     case T_LONG:
5568       evpexpandq(dst, mask, src, merge, vec_enc);
5569       break;
5570     case T_DOUBLE:
5571       evexpandpd(dst, mask, src, merge, vec_enc);
5572       break;
5573     default:
5574       fatal("Unsupported type %s", type2name(bt));
5575       break;
5576     }
5577   }
5578 }
5579 #endif
5580 
5581 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5582                                            KRegister ktmp1, int vec_enc) {
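  // Signum semantics per lane: -1.0 for src < 0, +1.0 for src > 0, and src itself
  // when it is NaN, -0.0 or 0.0. A scalar sketch (illustrative only):
  //   double signum(double s) {
  //     if (s != s || s == 0.0) return s;   // NaN, -0.0 or 0.0
  //     return (s < 0.0) ? -1.0 : 1.0;
  //   }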
5583   if (opcode == Op_SignumVD) {
5584     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5586     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5587     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5589     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5590     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5591   } else {
5592     assert(opcode == Op_SignumVF, "");
5593     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5595     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5596     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5598     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5599     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5600   }
5601 }
5602 
5603 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5604                                           XMMRegister xtmp1, int vec_enc) {
5605   if (opcode == Op_SignumVD) {
5606     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5608     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5610     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5611     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5612   } else {
5613     assert(opcode == Op_SignumVF, "");
5614     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5616     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5618     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5619     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5620   }
5621 }
5622 
5623 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
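  // maskAll: src is expected to hold either 0 or -1 (all bits set). Moving it into
  // a mask register and shifting right by (register width - mask_len) leaves
  // exactly mask_len bits set for the all-true case,
  // e.g. src = -1, mask_len = 8: 0xFFFFFFFF >> 24 = 0xFF.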
5624   if (VM_Version::supports_avx512bw()) {
5625     if (mask_len > 32) {
5626       kmovql(dst, src);
5627     } else {
5628       kmovdl(dst, src);
5629       if (mask_len != 32) {
5630         kshiftrdl(dst, dst, 32 - mask_len);
5631       }
5632     }
5633   } else {
5634     assert(mask_len <= 16, "");
5635     kmovwl(dst, src);
5636     if (mask_len != 16) {
5637       kshiftrwl(dst, dst, 16 - mask_len);
5638     }
5639   }
5640 }
5641 
5642 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5643   int lane_size = type2aelembytes(bt);
5644   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5645   if ((is_LP64 || lane_size < 8) &&
5646       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5647        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5648     movptr(rtmp, imm32);
5649     switch(lane_size) {
5650       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5651       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5652       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5653       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5654       fatal("Unsupported lane size %d", lane_size);
5655       break;
5656     }
5657   } else {
5658     movptr(rtmp, imm32);
5659     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5660     switch(lane_size) {
5661       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5662       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5663       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5664       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5665       fatal("Unsupported lane size %d", lane_size);
5666       break;
5667     }
5668   }
5669 }
5670 
5671 //
// The following is a lookup table based popcount computation algorithm:
5673 //       Index   Bit set count
5674 //     [ 0000 ->   0,
5675 //       0001 ->   1,
5676 //       0010 ->   1,
5677 //       0011 ->   2,
5678 //       0100 ->   1,
5679 //       0101 ->   2,
5680 //       0110 ->   2,
5681 //       0111 ->   3,
5682 //       1000 ->   1,
5683 //       1001 ->   2,
5684 //       1010 ->   3,
5685 //       1011 ->   3,
5686 //       1100 ->   2,
5687 //       1101 ->   3,
5688 //       1111 ->   4 ]
5689 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5690 //     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
5694 //  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences
//     against zero, which adds up the bitset counts of all the bytes of a quadword.
5697 //  f. Perform step e. for upper 128bit vector lane.
5698 //  g. Pack the bitset count of quadwords back to double word.
5699 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
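//
// A scalar sketch of steps a-d for a single byte (illustrative only):
//   uint8_t popcount_byte(uint8_t b) {
//     static const uint8_t lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//     return lut[b & 0x0F] + lut[b >> 4];
//   }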
5700 
5701 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5702                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5703   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5704   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5705   vpsrlw(dst, src, 4, vec_enc);
5706   vpand(dst, dst, xtmp1, vec_enc);
5707   vpand(xtmp1, src, xtmp1, vec_enc);
5708   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5709   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5710   vpshufb(dst, xtmp2, dst, vec_enc);
5711   vpaddb(dst, dst, xtmp1, vec_enc);
5712 }
5713 
5714 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5715                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5716   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5718   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5719   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5720   vpsadbw(dst, dst, xtmp2, vec_enc);
5721   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5722   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5723   vpackuswb(dst, xtmp1, dst, vec_enc);
5724 }
5725 
5726 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5727                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5728   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5729   // Add the popcount of upper and lower bytes of word.
5730   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5731   vpsrlw(dst, xtmp1, 8, vec_enc);
5732   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5733   vpaddw(dst, dst, xtmp1, vec_enc);
5734 }
5735 
5736 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5737                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5738   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5739   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5740   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5741 }
5742 
5743 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5744                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5745   switch(bt) {
5746     case T_LONG:
5747       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5748       break;
5749     case T_INT:
5750       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5751       break;
5752     case T_CHAR:
5753     case T_SHORT:
5754       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5755       break;
5756     case T_BYTE:
5757     case T_BOOLEAN:
5758       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5759       break;
5760     default:
5761       fatal("Unsupported type %s", type2name(bt));
5762       break;
5763   }
5764 }
5765 
5766 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5767                                                       KRegister mask, bool merge, int vec_enc) {
5768   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5769   switch(bt) {
5770     case T_LONG:
5771       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5772       evpopcntq(dst, mask, src, merge, vec_enc);
5773       break;
5774     case T_INT:
5775       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5776       evpopcntd(dst, mask, src, merge, vec_enc);
5777       break;
5778     case T_CHAR:
5779     case T_SHORT:
5780       assert(VM_Version::supports_avx512_bitalg(), "");
5781       evpopcntw(dst, mask, src, merge, vec_enc);
5782       break;
5783     case T_BYTE:
5784     case T_BOOLEAN:
5785       assert(VM_Version::supports_avx512_bitalg(), "");
5786       evpopcntb(dst, mask, src, merge, vec_enc);
5787       break;
5788     default:
5789       fatal("Unsupported type %s", type2name(bt));
5790       break;
5791   }
5792 }
5793 
5794 #ifndef _LP64
5795 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5796   assert(VM_Version::supports_avx512bw(), "");
5797   kmovdl(tmp, src);
5798   kunpckdql(dst, tmp, tmp);
5799 }
5800 #endif
5801 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value. Thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibbles of the byte.
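//
// A scalar sketch of the per-byte step (illustrative only):
//   uint8_t reverse_byte(uint8_t b) {
//     static const uint8_t rev_lut[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
//                                         0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
//     return (uint8_t)((rev_lut[b & 0x0F] << 4) | rev_lut[b >> 4]);
//   }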
5808 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5809                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5810   if (VM_Version::supports_avx512vlbw()) {
5811 
5812     // Get the reverse bit sequence of lower nibble of each byte.
5813     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5814     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5815     evpandq(dst, xtmp2, src, vec_enc);
5816     vpshufb(dst, xtmp1, dst, vec_enc);
5817     vpsllq(dst, dst, 4, vec_enc);
5818 
5819     // Get the reverse bit sequence of upper nibble of each byte.
5820     vpandn(xtmp2, xtmp2, src, vec_enc);
5821     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5822     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5823 
5824     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5825     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5826     evporq(xtmp2, dst, xtmp2, vec_enc);
5827     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5828 
  } else if (vec_enc == Assembler::AVX_512bit) {
5830     // Shift based bit reversal.
5831     assert(bt == T_LONG || bt == T_INT, "");
5832 
5833     // Swap lower and upper nibble of each byte.
5834     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5835 
5836     // Swap two least and most significant bits of each nibble.
5837     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5838 
5839     // Swap adjacent pair of bits.
5840     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5841     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5842 
5843     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5844     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5845   } else {
5846     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5847     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5848 
5849     // Get the reverse bit sequence of lower nibble of each byte.
5850     vpand(dst, xtmp2, src, vec_enc);
5851     vpshufb(dst, xtmp1, dst, vec_enc);
5852     vpsllq(dst, dst, 4, vec_enc);
5853 
5854     // Get the reverse bit sequence of upper nibble of each byte.
5855     vpandn(xtmp2, xtmp2, src, vec_enc);
5856     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5857     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5858 
5859     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5860     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5861     vpor(xtmp2, dst, xtmp2, vec_enc);
5862     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5863   }
5864 }
5865 
5866 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5867                                                 XMMRegister xtmp, Register rscratch) {
5868   assert(VM_Version::supports_gfni(), "");
5869   assert(rscratch != noreg || always_reachable(mask), "missing");
5870 
  // Galois field instruction based bit reversal, based on the following algorithm:
5872   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5873   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5874   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5875   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5876 }
5877 
5878 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5879                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
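  // dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits), i.e. swap
  // adjacent nbits-wide bit groups selected by the (broadcast) bitmask.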
5880   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5881   evpandq(dst, xtmp1, src, vec_enc);
5882   vpsllq(dst, dst, nbits, vec_enc);
5883   vpandn(xtmp1, xtmp1, src, vec_enc);
5884   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5885   evporq(dst, dst, xtmp1, vec_enc);
5886 }
5887 
5888 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5889                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5890   // Shift based bit reversal.
5891   assert(VM_Version::supports_evex(), "");
5892   switch(bt) {
5893     case T_LONG:
5894       // Swap upper and lower double word of each quad word.
5895       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5896       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5897       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5898       break;
5899     case T_INT:
5900       // Swap upper and lower word of each double word.
5901       evprord(xtmp1, k0, src, 16, true, vec_enc);
5902       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5903       break;
5904     case T_CHAR:
5905     case T_SHORT:
5906       // Swap upper and lower byte of each word.
5907       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5908       break;
5909     case T_BYTE:
5910       evmovdquq(dst, k0, src, true, vec_enc);
5911       break;
5912     default:
5913       fatal("Unsupported type %s", type2name(bt));
5914       break;
5915   }
5916 }
5917 
5918 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5919   if (bt == T_BYTE) {
5920     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5921       evmovdquq(dst, k0, src, true, vec_enc);
5922     } else {
5923       vmovdqu(dst, src);
5924     }
5925     return;
5926   }
5927   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5928   // pre-computed shuffle indices.
5929   switch(bt) {
5930     case T_LONG:
5931       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5932       break;
5933     case T_INT:
5934       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5935       break;
5936     case T_CHAR:
5937     case T_SHORT:
5938       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5939       break;
5940     default:
5941       fatal("Unsupported type %s", type2name(bt));
5942       break;
5943   }
5944   vpshufb(dst, src, dst, vec_enc);
5945 }
5946 
5947 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5948                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5949                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5950   assert(is_integral_type(bt), "");
5951   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5952   assert(VM_Version::supports_avx512cd(), "");
5953   switch(bt) {
5954     case T_LONG:
5955       evplzcntq(dst, ktmp, src, merge, vec_enc);
5956       break;
5957     case T_INT:
5958       evplzcntd(dst, ktmp, src, merge, vec_enc);
5959       break;
5960     case T_SHORT:
5961       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5962       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5963       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5964       vpunpckhwd(dst, xtmp1, src, vec_enc);
5965       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5966       vpackusdw(dst, xtmp2, dst, vec_enc);
5967       break;
5968     case T_BYTE:
5969       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5970       // accessing the lookup table.
5971       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5972       // accessing the lookup table.
5973       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5974       assert(VM_Version::supports_avx512bw(), "");
5975       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5976       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5977       vpand(xtmp2, dst, src, vec_enc);
5978       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5979       vpsrlw(xtmp3, src, 4, vec_enc);
5980       vpand(xtmp3, dst, xtmp3, vec_enc);
5981       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5982       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5983       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5984       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5985       break;
5986     default:
5987       fatal("Unsupported type %s", type2name(bt));
5988       break;
5989   }
5990 }
5991 
5992 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5993                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5994   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5995   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5996   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5997   // accessing the lookup table.
5998   vpand(dst, xtmp2, src, vec_enc);
5999   vpshufb(dst, xtmp1, dst, vec_enc);
6000   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6001   // accessing the lookup table.
6002   vpsrlw(xtmp3, src, 4, vec_enc);
6003   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6004   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6005   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6006   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6007   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6008   vpaddb(dst, dst, xtmp2, vec_enc);
6009   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6010 }
6011 
6012 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6013                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6014   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6015   // Add zero counts of lower byte and upper byte of a word if
6016   // upper byte holds a zero value.
6017   vpsrlw(xtmp3, src, 8, vec_enc);
6018   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6019   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6020   vpsllw(xtmp2, dst, 8, vec_enc);
6021   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6022   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6023   vpsrlw(dst, dst, 8, vec_enc);
6024 }
6025 
6026 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6027                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in a
  // normalized 1.x form, the biased exponent can be used to compute the leading
  // zero count as per the following formula:
  //   LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
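  //
  // A scalar sketch of the idea (illustrative only, ignoring the special cases):
  //   int clz32(uint32_t x) {
  //     float f = (float)(int32_t)x;             // vcvtdq2ps treats lanes as signed
  //     uint32_t bits;  memcpy(&bits, &f, sizeof(bits));
  //     int biased_exp = (bits >> 23) & 0xFF;
  //     return 32 - ((biased_exp - 127) + 1);    // == 31 - (biased_exp - 127)
  //   }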
6033 
6034   // Broadcast 0xFF
6035   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6036   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6037 
6038   // Extract biased exponent.
6039   vcvtdq2ps(dst, src, vec_enc);
6040   vpsrld(dst, dst, 23, vec_enc);
6041   vpand(dst, dst, xtmp1, vec_enc);
6042 
6043   // Broadcast 127.
6044   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6045   // Exponent = biased_exp - 127
6046   vpsubd(dst, dst, xtmp1, vec_enc);
6047 
6048   // Exponent = Exponent  + 1
6049   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6050   vpaddd(dst, dst, xtmp3, vec_enc);
6051 
  // Replace a -ve exponent with zero; the exponent is -ve when the src
  // lane contains a zero value.
6054   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6055   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6056 
6057   // Rematerialize broadcast 32.
6058   vpslld(xtmp1, xtmp3, 5, vec_enc);
6059   // Exponent is 32 if corresponding source lane contains max_int value.
6060   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6061   // LZCNT = 32 - exponent
6062   vpsubd(dst, xtmp1, dst, vec_enc);
6063 
6064   // Replace LZCNT with a value 1 if corresponding source lane
6065   // contains max_int value.
6066   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6067 
  // Replace LZCNT with 0 if the source lane value is less than zero, since its MSB is set.
6069   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6070   vblendvps(dst, dst, xtmp2, src, vec_enc);
6071 }
6072 
6073 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6074                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6075   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6076   // Add zero counts of lower word and upper word of a double word if
6077   // upper word holds a zero value.
6078   vpsrld(xtmp3, src, 16, vec_enc);
6079   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6080   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6081   vpslld(xtmp2, dst, 16, vec_enc);
6082   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6083   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6084   vpsrld(dst, dst, 16, vec_enc);
6085   // Add zero counts of lower doubleword and upper doubleword of a
6086   // quadword if upper doubleword holds a zero value.
6087   vpsrlq(xtmp3, src, 32, vec_enc);
6088   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6089   vpsllq(xtmp2, dst, 32, vec_enc);
6090   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6091   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6092   vpsrlq(dst, dst, 32, vec_enc);
6093 }
6094 
6095 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6096                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6097                                                        Register rtmp, int vec_enc) {
6098   assert(is_integral_type(bt), "unexpected type");
6099   assert(vec_enc < Assembler::AVX_512bit, "");
6100   switch(bt) {
6101     case T_LONG:
6102       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6103       break;
6104     case T_INT:
6105       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6106       break;
6107     case T_SHORT:
6108       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6109       break;
6110     case T_BYTE:
6111       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6112       break;
6113     default:
6114       fatal("Unsupported type %s", type2name(bt));
6115       break;
6116   }
6117 }
6118 
6119 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6120   switch(bt) {
6121     case T_BYTE:
6122       vpsubb(dst, src1, src2, vec_enc);
6123       break;
6124     case T_SHORT:
6125       vpsubw(dst, src1, src2, vec_enc);
6126       break;
6127     case T_INT:
6128       vpsubd(dst, src1, src2, vec_enc);
6129       break;
6130     case T_LONG:
6131       vpsubq(dst, src1, src2, vec_enc);
6132       break;
6133     default:
6134       fatal("Unsupported type %s", type2name(bt));
6135       break;
6136   }
6137 }
6138 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
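//
// A scalar sketch for one 32-bit lane (illustrative only):
//   ctz32(x) = 32 - clz32((x - 1) & ~x)
// e.g. x = 8: (7 & ~8) = 7, clz32(7) = 29, so ctz32(8) = 3; x = 0 gives ctz32(0) = 32.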
6143 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6144                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6145                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6146   assert(is_integral_type(bt), "");
6147   // xtmp = -1
6148   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6149   // xtmp = xtmp + src
6150   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6151   // xtmp = xtmp & ~src
6152   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6153   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6154   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6155   vpsub(bt, dst, xtmp4, dst, vec_enc);
6156 }
6157 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation.
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
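// For one 32-bit lane (illustrative only): x = 8 gives x | -x = 0xFFFFFFF8,
// POPC = 29, so CTZ = 32 - 29 = 3; x = 0 gives POPC = 0 and CTZ = 32.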
6160 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6161                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6162   assert(is_integral_type(bt), "");
6163   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6165   // xtmp = 0 - src
6166   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6167   // xtmp = xtmp | src
6168   vpor(xtmp3, xtmp3, src, vec_enc);
6169   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6170   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6171   vpsub(bt, dst, xtmp1, dst, vec_enc);
6172 }
6173 
6174 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6175   Label done;
6176   Label neg_divisor_fastpath;
6177   cmpl(divisor, 0);
6178   jccb(Assembler::less, neg_divisor_fastpath);
6179   xorl(rdx, rdx);
6180   divl(divisor);
6181   jmpb(done);
6182   bind(neg_divisor_fastpath);
6183   // Fastpath for divisor < 0:
6184   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Integer.divideUnsigned()
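  // When the divisor has its sign bit set, its unsigned value is >= 2^31, so the
  // unsigned quotient can only be 0 or 1; it is 1 exactly when the dividend is
  // unsigned-greater-or-equal to the divisor, which is what the and-not/shift
  // sequence below computes.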
6186   movl(rdx, rax);
6187   subl(rdx, divisor);
6188   if (VM_Version::supports_bmi1()) {
6189     andnl(rax, rdx, rax);
6190   } else {
6191     notl(rdx);
6192     andl(rax, rdx);
6193   }
6194   shrl(rax, 31);
6195   bind(done);
6196 }
6197 
6198 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6199   Label done;
6200   Label neg_divisor_fastpath;
6201   cmpl(divisor, 0);
6202   jccb(Assembler::less, neg_divisor_fastpath);
6203   xorl(rdx, rdx);
6204   divl(divisor);
6205   jmpb(done);
6206   bind(neg_divisor_fastpath);
6207   // Fastpath when divisor < 0:
6208   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Integer.remainderUnsigned()
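  // The quotient is 0 or 1 here, so remainder = dividend - (quotient ? divisor : 0);
  // the arithmetic shift by 31 turns the quotient bit into an all-ones/all-zeros
  // mask that selects the divisor.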
6210   movl(rdx, rax);
6211   subl(rax, divisor);
6212   if (VM_Version::supports_bmi1()) {
6213     andnl(rax, rax, rdx);
6214   } else {
6215     notl(rax);
6216     andl(rax, rdx);
6217   }
6218   sarl(rax, 31);
6219   andl(rax, divisor);
6220   subl(rdx, rax);
6221   bind(done);
6222 }
6223 
6224 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6225   Label done;
6226   Label neg_divisor_fastpath;
6227 
6228   cmpl(divisor, 0);
6229   jccb(Assembler::less, neg_divisor_fastpath);
6230   xorl(rdx, rdx);
6231   divl(divisor);
6232   jmpb(done);
6233   bind(neg_divisor_fastpath);
6234   // Fastpath for divisor < 0:
6235   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6236   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6237   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Integer.divideUnsigned() and java.lang.Integer.remainderUnsigned()
6239   movl(rdx, rax);
6240   subl(rax, divisor);
6241   if (VM_Version::supports_bmi1()) {
6242     andnl(rax, rax, rdx);
6243   } else {
6244     notl(rax);
6245     andl(rax, rdx);
6246   }
6247   movl(tmp, rax);
6248   shrl(rax, 31); // quotient
6249   sarl(tmp, 31);
6250   andl(tmp, divisor);
6251   subl(rdx, tmp); // remainder
6252   bind(done);
6253 }
6254 
6255 #ifdef _LP64
6256 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6257                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
6260     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
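    // The affine transform with the constant matrix below reverses the bit order
    // within each byte; the bswapl at the end of this method then reverses the
    // byte order, producing a full 32-bit bit reversal.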
6261     mov64(rtmp, 0x8040201008040201L);
6262     movq(xtmp1, src);
6263     movq(xtmp2, rtmp);
6264     gf2p8affineqb(xtmp1, xtmp2, 0);
6265     movq(dst, xtmp1);
6266   } else {
6267     // Swap even and odd numbered bits.
6268     movl(rtmp, src);
6269     andl(rtmp, 0x55555555);
6270     shll(rtmp, 1);
6271     movl(dst, src);
6272     andl(dst, 0xAAAAAAAA);
6273     shrl(dst, 1);
6274     orl(dst, rtmp);
6275 
6276     // Swap LSB and MSB 2 bits of each nibble.
6277     movl(rtmp, dst);
6278     andl(rtmp, 0x33333333);
6279     shll(rtmp, 2);
6280     andl(dst, 0xCCCCCCCC);
6281     shrl(dst, 2);
6282     orl(dst, rtmp);
6283 
6284     // Swap LSB and MSB 4 bits of each byte.
6285     movl(rtmp, dst);
6286     andl(rtmp, 0x0F0F0F0F);
6287     shll(rtmp, 4);
6288     andl(dst, 0xF0F0F0F0);
6289     shrl(dst, 4);
6290     orl(dst, rtmp);
6291   }
6292   bswapl(dst);
6293 }
6294 
6295 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6296                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
6299     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6300     mov64(rtmp1, 0x8040201008040201L);
6301     movq(xtmp1, src);
6302     movq(xtmp2, rtmp1);
6303     gf2p8affineqb(xtmp1, xtmp2, 0);
6304     movq(dst, xtmp1);
6305   } else {
6306     // Swap even and odd numbered bits.
6307     movq(rtmp1, src);
6308     mov64(rtmp2, 0x5555555555555555L);
6309     andq(rtmp1, rtmp2);
6310     shlq(rtmp1, 1);
6311     movq(dst, src);
6312     notq(rtmp2);
6313     andq(dst, rtmp2);
6314     shrq(dst, 1);
6315     orq(dst, rtmp1);
6316 
6317     // Swap LSB and MSB 2 bits of each nibble.
6318     movq(rtmp1, dst);
6319     mov64(rtmp2, 0x3333333333333333L);
6320     andq(rtmp1, rtmp2);
6321     shlq(rtmp1, 2);
6322     notq(rtmp2);
6323     andq(dst, rtmp2);
6324     shrq(dst, 2);
6325     orq(dst, rtmp1);
6326 
6327     // Swap LSB and MSB 4 bits of each byte.
6328     movq(rtmp1, dst);
6329     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6330     andq(rtmp1, rtmp2);
6331     shlq(rtmp1, 4);
6332     notq(rtmp2);
6333     andq(dst, rtmp2);
6334     shrq(dst, 4);
6335     orq(dst, rtmp1);
6336   }
6337   bswapq(dst);
6338 }
6339 
6340 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6341   Label done;
6342   Label neg_divisor_fastpath;
6343   cmpq(divisor, 0);
6344   jccb(Assembler::less, neg_divisor_fastpath);
6345   xorl(rdx, rdx);
6346   divq(divisor);
6347   jmpb(done);
6348   bind(neg_divisor_fastpath);
6349   // Fastpath for divisor < 0:
6350   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6351   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6352   movq(rdx, rax);
6353   subq(rdx, divisor);
6354   if (VM_Version::supports_bmi1()) {
6355     andnq(rax, rdx, rax);
6356   } else {
6357     notq(rdx);
6358     andq(rax, rdx);
6359   }
6360   shrq(rax, 63);
6361   bind(done);
6362 }
6363 
6364 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6365   Label done;
6366   Label neg_divisor_fastpath;
6367   cmpq(divisor, 0);
6368   jccb(Assembler::less, neg_divisor_fastpath);
6369   xorq(rdx, rdx);
6370   divq(divisor);
6371   jmp(done);
6372   bind(neg_divisor_fastpath);
6373   // Fastpath when divisor < 0:
6374   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6375   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6376   movq(rdx, rax);
6377   subq(rax, divisor);
6378   if (VM_Version::supports_bmi1()) {
6379     andnq(rax, rax, rdx);
6380   } else {
6381     notq(rax);
6382     andq(rax, rdx);
6383   }
6384   sarq(rax, 63);
6385   andq(rax, divisor);
6386   subq(rdx, rax);
6387   bind(done);
6388 }
6389 
6390 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6391   Label done;
6392   Label neg_divisor_fastpath;
6393   cmpq(divisor, 0);
6394   jccb(Assembler::less, neg_divisor_fastpath);
6395   xorq(rdx, rdx);
6396   divq(divisor);
6397   jmp(done);
6398   bind(neg_divisor_fastpath);
6399   // Fastpath for divisor < 0:
6400   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6401   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6402   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6403   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6404   movq(rdx, rax);
6405   subq(rax, divisor);
6406   if (VM_Version::supports_bmi1()) {
6407     andnq(rax, rax, rdx);
6408   } else {
6409     notq(rax);
6410     andq(rax, rdx);
6411   }
6412   movq(tmp, rax);
6413   shrq(rax, 63); // quotient
6414   sarq(tmp, 63);
6415   andq(tmp, divisor);
6416   subq(rdx, tmp); // remainder
6417   bind(done);
6418 }
6419 #endif
6420 
6421 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6422                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6423                                         int vlen_enc) {
6424   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all the multiples
  // of an index value are placed at the same relative position in a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
  // select the same relative position within their respective 128 bit lanes.
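  // For example, shuffle index 37 (= 32 + 5) has lower 4 bits equal to 5, so the
  // third step below selects byte 5 of the broadcast third 128 bit lane for it.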
6431   movl(rtmp, 16);
6432   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6433 
  // Compute a mask for the shuffle vector by comparing indices with the expression
  // INDEX < 16. Broadcast the first 128 bit lane across the entire vector, shuffle
  // the vector lanes using the original shuffle indices and move the shuffled lanes
  // corresponding to the true mask into the destination vector.
6438   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6439   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6440   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6441 
6442   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6443   // and broadcasting second 128 bit lane.
6444   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6445   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6446   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6447   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6448   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6449 
6450   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6451   // and broadcasting third 128 bit lane.
6452   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6453   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6454   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6455   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6456   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6457 
6458   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6460   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6461   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6462   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6463   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6464   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6465 }
6466 
6467 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6468                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6469   if (vlen_enc == AVX_128bit) {
6470     vpermilps(dst, src, shuffle, vlen_enc);
6471   } else if (bt == T_INT) {
6472     vpermd(dst, shuffle, src, vlen_enc);
6473   } else {
6474     assert(bt == T_FLOAT, "");
6475     vpermps(dst, shuffle, src, vlen_enc);
6476   }
6477 }
6478 
6479 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6480                                                      XMMRegister src2, int vlen_enc) {
6481   switch(elem_bt) {
6482     case T_BYTE:
6483       evpermi2b(dst, src1, src2, vlen_enc);
6484       break;
6485     case T_SHORT:
6486       evpermi2w(dst, src1, src2, vlen_enc);
6487       break;
6488     case T_INT:
6489       evpermi2d(dst, src1, src2, vlen_enc);
6490       break;
6491     case T_LONG:
6492       evpermi2q(dst, src1, src2, vlen_enc);
6493       break;
6494     case T_FLOAT:
6495       evpermi2ps(dst, src1, src2, vlen_enc);
6496       break;
6497     case T_DOUBLE:
6498       evpermi2pd(dst, src1, src2, vlen_enc);
6499       break;
6500     default:
6501       fatal("Unsupported type %s", type2name(elem_bt));
6502       break;
6503   }
6504 }