/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang the stack for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for rbp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from the actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

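// Map a vector length in bytes onto the AVX length encoding expected by the
// assembler; 4- and 8-byte vectors are emitted with 128-bit (XMM) encodings.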
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                 Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

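  // Synchronizing on a value-based class is a diagnosable condition: if the klass
  // has JVM_ACC_IS_VALUE_BASED_CLASS set, leave ZF == 0 so control reaches the
  // runtime slow path, which issues the diagnostic selected by
  // DiagnoseSyncOnValueBasedClasses.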
  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
    jcc(Assembler::equal, COUNT);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;
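    // Note: the lock-stack top value is a byte offset relative to the JavaThread,
    // so lock-stack slots are addressed as Address(thread, top, ...), and 'top'
    // points one slot past the most recently pushed oop.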

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t, t);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
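  // Note: for T_LONG there is no pminsq/pmaxsq before AVX-512, so the min/max is
  // synthesized below with pcmpgtq + blendvpd; the non-VEX blendvpd reads its mask
  // implicitly from xmm0, which is why tmp is required to be xmm0 for longs.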

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }
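  // When the mask is precomputed, vpsrad with a count of 32 replicates each
  // dword's sign bit and the vpxor/vpcmpgtq pair computes (0 > lane) for qwords,
  // yielding an explicit per-lane sign mask; otherwise the blends below key off
  // the sign bit of 'mask' directly.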

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;
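
  // Same -0.0/+0.0 and NaN handling as vminmax_fp above, expressed with AVX-512
  // mask registers: evpmovd2m/evpmovq2m extract the per-lane sign bits, the masked
  // blends bias the operands, and the final UNORD compare copies lanes back from
  // atmp wherever it holds a NaN.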

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

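  // dst arrives holding the argument. The equal (+/-0.0) and parity (NaN) cases
  // fall through unchanged; otherwise load 1.0 and, when the argument compared
  // below zero, flip its sign to produce -1.0.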
  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift);   break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift);  break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
1342         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1343         vpsrlvq(dst, src, shift, vlen_enc);
1344         vpsrlvq(tmp, tmp, shift, vlen_enc);
1345         vpxor(dst, dst, tmp, vlen_enc);
1346         vpsubq(dst, dst, tmp, vlen_enc);
1347       }
1348       break;
1349     }
1350     case Op_LShiftVL: {
1351       assert(tmp == xnoreg, "not used");
1352       vpsllvq(dst, src, shift, vlen_enc);
1353       break;
1354     }
1355     case Op_URShiftVL: {
1356       assert(tmp == xnoreg, "not used");
1357       vpsrlvq(dst, src, shift, vlen_enc);
1358       break;
1359     }
1360     default: assert(false, "%s", NodeClassNames[opcode]);
1361   }
1362 }
1363 
1364 // Variable shift src by shift using vtmp as a TEMP giving word result in dst
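//
// Illustrative sketch (not part of the original code): the widen/shift/mask/pack sequence
// below, written as scalar C for eight byte lanes of a logical (unsigned) shift. The lane
// count and the assumption that the shift counts are already in the 0..7 range are
// illustrative only.
//
//   for (int i = 0; i < 8; i++) {
//     int widened = (unsigned char)src[i];   // vextendbd / vpmovzxbd
//     int shifted = widened >> shift[i];     // varshiftd
//     dst[i]      = (char)(shifted & 0xFF);  // vpand(vector_int_to_byte_mask) + vpackusdw
//   }
//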
1365 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1366   assert(opcode == Op_LShiftVB ||
1367          opcode == Op_RShiftVB ||
1368          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1369   bool sign = (opcode != Op_URShiftVB);
1370   assert(vector_len == 0, "required");
1371   vextendbd(sign, dst, src, 1);
1372   vpmovzxbd(vtmp, shift, 1);
1373   varshiftd(opcode, dst, dst, vtmp, 1);
1374   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1375   vextracti128_high(vtmp, dst);
1376   vpackusdw(dst, dst, vtmp, 0);
1377 }
1378 
1379 // Variable shift src by shift using vtmp as a TEMP giving byte result in dst
1380 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1381   assert(opcode == Op_LShiftVB ||
1382          opcode == Op_RShiftVB ||
1383          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1384   bool sign = (opcode != Op_URShiftVB);
1385   int ext_vector_len = vector_len + 1;
1386   vextendbw(sign, dst, src, ext_vector_len);
1387   vpmovzxbw(vtmp, shift, ext_vector_len);
1388   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1389   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1390   if (vector_len == 0) {
1391     vextracti128_high(vtmp, dst);
1392     vpackuswb(dst, dst, vtmp, vector_len);
1393   } else {
1394     vextracti64x4_high(vtmp, dst);
1395     vpackuswb(dst, dst, vtmp, vector_len);
1396     vpermq(dst, dst, 0xD8, vector_len);
1397   }
1398 }
1399 
1400 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1401   switch(typ) {
1402     case T_BYTE:
1403       pinsrb(dst, val, idx);
1404       break;
1405     case T_SHORT:
1406       pinsrw(dst, val, idx);
1407       break;
1408     case T_INT:
1409       pinsrd(dst, val, idx);
1410       break;
1411     case T_LONG:
1412       pinsrq(dst, val, idx);
1413       break;
1414     default:
1415       assert(false,"Should not reach here.");
1416       break;
1417   }
1418 }
1419 
1420 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1421   switch(typ) {
1422     case T_BYTE:
1423       vpinsrb(dst, src, val, idx);
1424       break;
1425     case T_SHORT:
1426       vpinsrw(dst, src, val, idx);
1427       break;
1428     case T_INT:
1429       vpinsrd(dst, src, val, idx);
1430       break;
1431     case T_LONG:
1432       vpinsrq(dst, src, val, idx);
1433       break;
1434     default:
1435       assert(false,"Should not reach here.");
1436       break;
1437   }
1438 }
1439 
1440 #ifdef _LP64
1441 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1442                                                 XMMRegister dst, Register base,
1443                                                 Register idx_base,
1444                                                 Register offset, Register mask,
1445                                                 Register mask_idx, Register rtmp,
1446                                                 int vlen_enc) {
1447   vpxor(dst, dst, dst, vlen_enc);
1448   if (elem_bt == T_SHORT) {
1449     for (int i = 0; i < 4; i++) {
1450       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1451       Label skip_load;
1452       btq(mask, mask_idx);
1453       jccb(Assembler::carryClear, skip_load);
1454       movl(rtmp, Address(idx_base, i * 4));
1455       if (offset != noreg) {
1456         addl(rtmp, offset);
1457       }
1458       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1459       bind(skip_load);
1460       incq(mask_idx);
1461     }
1462   } else {
1463     assert(elem_bt == T_BYTE, "");
1464     for (int i = 0; i < 8; i++) {
1465       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1466       Label skip_load;
1467       btq(mask, mask_idx);
1468       jccb(Assembler::carryClear, skip_load);
1469       movl(rtmp, Address(idx_base, i * 4));
1470       if (offset != noreg) {
1471         addl(rtmp, offset);
1472       }
1473       pinsrb(dst, Address(base, rtmp), i);
1474       bind(skip_load);
1475       incq(mask_idx);
1476     }
1477   }
1478 }
1479 #endif // _LP64
1480 
1481 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1482                                          Register base, Register idx_base,
1483                                          Register offset, Register rtmp,
1484                                          int vlen_enc) {
1485   vpxor(dst, dst, dst, vlen_enc);
1486   if (elem_bt == T_SHORT) {
1487     for (int i = 0; i < 4; i++) {
1488       // dst[i] = src[offset + idx_base[i]]
1489       movl(rtmp, Address(idx_base, i * 4));
1490       if (offset != noreg) {
1491         addl(rtmp, offset);
1492       }
1493       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1494     }
1495   } else {
1496     assert(elem_bt == T_BYTE, "");
1497     for (int i = 0; i < 8; i++) {
1498       // dst[i] = src[offset + idx_base[i]]
1499       movl(rtmp, Address(idx_base, i * 4));
1500       if (offset != noreg) {
1501         addl(rtmp, offset);
1502       }
1503       pinsrb(dst, Address(base, rtmp), i);
1504     }
1505   }
1506 }
1507 
1508 /*
1509  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1510  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1511  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1512  * permutation that places the slice into the appropriate vector lane
1513  * locations of the destination vector. The following pseudo code describes the
1514  * algorithm in detail:
1515  *
1516  * DST_VEC = ZERO_VEC
1517  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1518  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1519  * FOREACH_ITER:
1520  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1521  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1522  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1523  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1524  *
1525  * With each iteration, the doubleword permute indices (0,1) corresponding
1526  * to the gathered quadword are shifted right by two lane positions.
1527  *
1528  */
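// Concrete trace of the permute indices (illustrative only, assuming the byte case with
// eight doubleword lanes): each gathered 8-byte slice sits in doublewords (0,1) of
// TMP_VEC_64, and the decreasing indices steer it into successive destination lanes:
//
//   iteration 0: PERM_INDEX = { 0,  1,  2,  3, 4, 5, 6, 7}  -> slice lands in dwords (0,1)
//   iteration 1: PERM_INDEX = {-2, -1,  0,  1, 2, 3, 4, 5}  -> slice lands in dwords (2,3)
//   iteration 2: PERM_INDEX = {-4, -3, -2, -1, 0, 1, 2, 3}  -> slice lands in dwords (4,5)
//
// The negative indices wrap around and select lanes of TMP_VEC_64 that were zeroed by the
// gather helper, so the OR accumulation into DST_VEC is unaffected.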
1529 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1530                                         Register base, Register idx_base,
1531                                         Register offset, Register mask,
1532                                         XMMRegister xtmp1, XMMRegister xtmp2,
1533                                         XMMRegister temp_dst, Register rtmp,
1534                                         Register mask_idx, Register length,
1535                                         int vector_len, int vlen_enc) {
1536   Label GATHER8_LOOP;
1537   assert(is_subword_type(elem_ty), "");
1538   movl(length, vector_len);
1539   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1540   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1541   vallones(xtmp2, vlen_enc);
1542   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1543   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1544   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1545 
1546   bind(GATHER8_LOOP);
1547     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1548     if (mask == noreg) {
1549       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1550     } else {
1551       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1552     }
1553     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1554     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1555     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1556     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1557     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1558     vpor(dst, dst, temp_dst, vlen_enc);
1559     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1560     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1561     jcc(Assembler::notEqual, GATHER8_LOOP);
1562 }
1563 
1564 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1565   switch(typ) {
1566     case T_INT:
1567       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1568       break;
1569     case T_FLOAT:
1570       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1571       break;
1572     case T_LONG:
1573       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1574       break;
1575     case T_DOUBLE:
1576       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1577       break;
1578     default:
1579       assert(false,"Should not reach here.");
1580       break;
1581   }
1582 }
1583 
1584 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1585   switch(typ) {
1586     case T_INT:
1587       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1588       break;
1589     case T_FLOAT:
1590       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1591       break;
1592     case T_LONG:
1593       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1594       break;
1595     case T_DOUBLE:
1596       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1597       break;
1598     default:
1599       assert(false,"Should not reach here.");
1600       break;
1601   }
1602 }
1603 
1604 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1605   switch(typ) {
1606     case T_INT:
1607       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1608       break;
1609     case T_FLOAT:
1610       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1611       break;
1612     case T_LONG:
1613       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1614       break;
1615     case T_DOUBLE:
1616       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1617       break;
1618     default:
1619       assert(false,"Should not reach here.");
1620       break;
1621   }
1622 }
1623 
1624 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1625   if (vlen_in_bytes <= 16) {
1626     pxor (dst, dst);
1627     psubb(dst, src);
1628     switch (elem_bt) {
1629       case T_BYTE:   /* nothing to do */ break;
1630       case T_SHORT:  pmovsxbw(dst, dst); break;
1631       case T_INT:    pmovsxbd(dst, dst); break;
1632       case T_FLOAT:  pmovsxbd(dst, dst); break;
1633       case T_LONG:   pmovsxbq(dst, dst); break;
1634       case T_DOUBLE: pmovsxbq(dst, dst); break;
1635 
1636       default: assert(false, "%s", type2name(elem_bt));
1637     }
1638   } else {
1639     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1640     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1641 
1642     vpxor (dst, dst, dst, vlen_enc);
1643     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1644 
1645     switch (elem_bt) {
1646       case T_BYTE:   /* nothing to do */            break;
1647       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1648       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1649       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1650       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1651       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1652 
1653       default: assert(false, "%s", type2name(elem_bt));
1654     }
1655   }
1656 }
1657 
1658 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1659   if (novlbwdq) {
1660     vpmovsxbd(xtmp, src, vlen_enc);
1661     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1662             Assembler::eq, true, vlen_enc, noreg);
1663   } else {
1664     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1665     vpsubb(xtmp, xtmp, src, vlen_enc);
1666     evpmovb2m(dst, xtmp, vlen_enc);
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1671   switch (vlen_in_bytes) {
1672     case 4:  movdl(dst, src);   break;
1673     case 8:  movq(dst, src);    break;
1674     case 16: movdqu(dst, src);  break;
1675     case 32: vmovdqu(dst, src); break;
1676     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1677     default: ShouldNotReachHere();
1678   }
1679 }
1680 
1681 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1682   assert(rscratch != noreg || always_reachable(src), "missing");
1683 
1684   if (reachable(src)) {
1685     load_vector(dst, as_Address(src), vlen_in_bytes);
1686   } else {
1687     lea(rscratch, src);
1688     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1689   }
1690 }
1691 
1692 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1693   int vlen_enc = vector_length_encoding(vlen);
1694   if (VM_Version::supports_avx()) {
1695     if (bt == T_LONG) {
1696       if (VM_Version::supports_avx2()) {
1697         vpbroadcastq(dst, src, vlen_enc);
1698       } else {
1699         vmovddup(dst, src, vlen_enc);
1700       }
1701     } else if (bt == T_DOUBLE) {
1702       if (vlen_enc != Assembler::AVX_128bit) {
1703         vbroadcastsd(dst, src, vlen_enc, noreg);
1704       } else {
1705         vmovddup(dst, src, vlen_enc);
1706       }
1707     } else {
1708       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1709         vpbroadcastd(dst, src, vlen_enc);
1710       } else {
1711         vbroadcastss(dst, src, vlen_enc);
1712       }
1713     }
1714   } else if (VM_Version::supports_sse3()) {
1715     movddup(dst, src);
1716   } else {
1717     movq(dst, src);
1718     if (vlen == 16) {
1719       punpcklqdq(dst, dst);
1720     }
1721   }
1722 }
1723 
1724 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1725   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1726   int offset = exact_log2(type2aelembytes(bt)) << 6;
1727   if (is_floating_point_type(bt)) {
1728     offset += 128;
1729   }
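  // Worked example of the resulting table offsets (illustrative): T_BYTE -> 0,
  // T_SHORT -> 64, T_INT -> 128, T_LONG -> 192, T_FLOAT -> 128 + 128 = 256,
  // T_DOUBLE -> 192 + 128 = 320.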
1730   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1731   load_vector(dst, addr, vlen_in_bytes);
1732 }
1733 
1734 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
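//
// Apart from the phaddd/phaddw fast paths used for integer addition, the reductions below
// follow the same log2 halving shape: fold the upper half of the vector into the lower half
// (vextract*_high or pshufd plus reduce_operation_128/256), repeat until one element remains,
// then fold in the scalar input src1 for the integral variants. A minimal scalar model of
// that strategy, shown for an eight-element integer add (illustrative sketch, not part of
// the original code):
//
//   int acc[8] = { /* vector lanes */ };
//   for (int width = 4; width >= 1; width >>= 1) {
//     for (int i = 0; i < width; i++) {
//       acc[i] += acc[i + width];   // fold upper half into lower half
//     }
//   }
//   int result = src1 + acc[0];     // combine with the scalar input
//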
1735 
1736 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1737   int vector_len = Assembler::AVX_128bit;
1738 
1739   switch (opcode) {
1740     case Op_AndReductionV:  pand(dst, src); break;
1741     case Op_OrReductionV:   por (dst, src); break;
1742     case Op_XorReductionV:  pxor(dst, src); break;
1743     case Op_MinReductionV:
1744       switch (typ) {
1745         case T_BYTE:        pminsb(dst, src); break;
1746         case T_SHORT:       pminsw(dst, src); break;
1747         case T_INT:         pminsd(dst, src); break;
1748         case T_LONG:        assert(UseAVX > 2, "required");
1749                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1750         default:            assert(false, "wrong type");
1751       }
1752       break;
1753     case Op_MaxReductionV:
1754       switch (typ) {
1755         case T_BYTE:        pmaxsb(dst, src); break;
1756         case T_SHORT:       pmaxsw(dst, src); break;
1757         case T_INT:         pmaxsd(dst, src); break;
1758         case T_LONG:        assert(UseAVX > 2, "required");
1759                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1760         default:            assert(false, "wrong type");
1761       }
1762       break;
1763     case Op_AddReductionVF: addss(dst, src); break;
1764     case Op_AddReductionVD: addsd(dst, src); break;
1765     case Op_AddReductionVI:
1766       switch (typ) {
1767         case T_BYTE:        paddb(dst, src); break;
1768         case T_SHORT:       paddw(dst, src); break;
1769         case T_INT:         paddd(dst, src); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_AddReductionVL: paddq(dst, src); break;
1774     case Op_MulReductionVF: mulss(dst, src); break;
1775     case Op_MulReductionVD: mulsd(dst, src); break;
1776     case Op_MulReductionVI:
1777       switch (typ) {
1778         case T_SHORT:       pmullw(dst, src); break;
1779         case T_INT:         pmulld(dst, src); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1784                             evpmullq(dst, dst, src, vector_len); break;
1785     default:                assert(false, "wrong opcode");
1786   }
1787 }
1788 
1789 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1790   int vector_len = Assembler::AVX_256bit;
1791 
1792   switch (opcode) {
1793     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1794     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1795     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1796     case Op_MinReductionV:
1797       switch (typ) {
1798         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1799         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1800         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1801         case T_LONG:        assert(UseAVX > 2, "required");
1802                             vpminsq(dst, src1, src2, vector_len); break;
1803         default:            assert(false, "wrong type");
1804       }
1805       break;
1806     case Op_MaxReductionV:
1807       switch (typ) {
1808         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1809         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1810         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1811         case T_LONG:        assert(UseAVX > 2, "required");
1812                             vpmaxsq(dst, src1, src2, vector_len); break;
1813         default:            assert(false, "wrong type");
1814       }
1815       break;
1816     case Op_AddReductionVI:
1817       switch (typ) {
1818         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1819         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1820         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1821         default:            assert(false, "wrong type");
1822       }
1823       break;
1824     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1825     case Op_MulReductionVI:
1826       switch (typ) {
1827         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1828         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1829         default:            assert(false, "wrong type");
1830       }
1831       break;
1832     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1833     default:                assert(false, "wrong opcode");
1834   }
1835 }
1836 
1837 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1838                                   XMMRegister dst, XMMRegister src,
1839                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1840   switch (opcode) {
1841     case Op_AddReductionVF:
1842     case Op_MulReductionVF:
1843       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1844       break;
1845 
1846     case Op_AddReductionVD:
1847     case Op_MulReductionVD:
1848       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1849       break;
1850 
1851     default: assert(false, "wrong opcode");
1852   }
1853 }
1854 
1855 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1856                              Register dst, Register src1, XMMRegister src2,
1857                              XMMRegister vtmp1, XMMRegister vtmp2) {
1858   switch (vlen) {
1859     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1860     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1861     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863 
1864     default: assert(false, "wrong vector length");
1865   }
1866 }
1867 
1868 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1869                              Register dst, Register src1, XMMRegister src2,
1870                              XMMRegister vtmp1, XMMRegister vtmp2) {
1871   switch (vlen) {
1872     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1873     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1874     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876 
1877     default: assert(false, "wrong vector length");
1878   }
1879 }
1880 
1881 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1882                              Register dst, Register src1, XMMRegister src2,
1883                              XMMRegister vtmp1, XMMRegister vtmp2) {
1884   switch (vlen) {
1885     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1886     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1887     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889 
1890     default: assert(false, "wrong vector length");
1891   }
1892 }
1893 
1894 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1895                              Register dst, Register src1, XMMRegister src2,
1896                              XMMRegister vtmp1, XMMRegister vtmp2) {
1897   switch (vlen) {
1898     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1900     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902 
1903     default: assert(false, "wrong vector length");
1904   }
1905 }
1906 
1907 #ifdef _LP64
1908 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1909                              Register dst, Register src1, XMMRegister src2,
1910                              XMMRegister vtmp1, XMMRegister vtmp2) {
1911   switch (vlen) {
1912     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1913     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1915 
1916     default: assert(false, "wrong vector length");
1917   }
1918 }
1919 #endif // _LP64
1920 
1921 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1922   switch (vlen) {
1923     case 2:
1924       assert(vtmp2 == xnoreg, "");
1925       reduce2F(opcode, dst, src, vtmp1);
1926       break;
1927     case 4:
1928       assert(vtmp2 == xnoreg, "");
1929       reduce4F(opcode, dst, src, vtmp1);
1930       break;
1931     case 8:
1932       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1933       break;
1934     case 16:
1935       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1936       break;
1937     default: assert(false, "wrong vector length");
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1942   switch (vlen) {
1943     case 2:
1944       assert(vtmp2 == xnoreg, "");
1945       reduce2D(opcode, dst, src, vtmp1);
1946       break;
1947     case 4:
1948       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1949       break;
1950     case 8:
1951       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1952       break;
1953     default: assert(false, "wrong vector length");
1954   }
1955 }
1956 
1957 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1958   if (opcode == Op_AddReductionVI) {
1959     if (vtmp1 != src2) {
1960       movdqu(vtmp1, src2);
1961     }
1962     phaddd(vtmp1, vtmp1);
1963   } else {
1964     pshufd(vtmp1, src2, 0x1);
1965     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1966   }
1967   movdl(vtmp2, src1);
1968   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1969   movdl(dst, vtmp1);
1970 }
1971 
1972 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1973   if (opcode == Op_AddReductionVI) {
1974     if (vtmp1 != src2) {
1975       movdqu(vtmp1, src2);
1976     }
1977     phaddd(vtmp1, src2);
1978     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1979   } else {
1980     pshufd(vtmp2, src2, 0xE);
1981     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1982     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1983   }
1984 }
1985 
1986 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1987   if (opcode == Op_AddReductionVI) {
1988     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1989     vextracti128_high(vtmp2, vtmp1);
1990     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1991     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1992   } else {
1993     vextracti128_high(vtmp1, src2);
1994     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1995     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1996   }
1997 }
1998 
1999 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2000   vextracti64x4_high(vtmp2, src2);
2001   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2002   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2003 }
2004 
2005 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2006   pshufd(vtmp2, src2, 0x1);
2007   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2008   movdqu(vtmp1, vtmp2);
2009   psrldq(vtmp1, 2);
2010   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2011   movdqu(vtmp2, vtmp1);
2012   psrldq(vtmp2, 1);
2013   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2014   movdl(vtmp2, src1);
2015   pmovsxbd(vtmp1, vtmp1);
2016   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2017   pextrb(dst, vtmp1, 0x0);
2018   movsbl(dst, dst);
2019 }
2020 
2021 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2022   pshufd(vtmp1, src2, 0xE);
2023   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2024   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2025 }
2026 
2027 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2028   vextracti128_high(vtmp2, src2);
2029   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2030   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2031 }
2032 
2033 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2034   vextracti64x4_high(vtmp1, src2);
2035   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2036   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2037 }
2038 
2039 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2040   pmovsxbw(vtmp2, src2);
2041   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2042 }
2043 
2044 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   if (UseAVX > 1) {
2046     int vector_len = Assembler::AVX_256bit;
2047     vpmovsxbw(vtmp1, src2, vector_len);
2048     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2049   } else {
2050     pmovsxbw(vtmp2, src2);
2051     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2052     pshufd(vtmp2, src2, 0xE);
2053     pmovsxbw(vtmp2, vtmp2);
2054     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2055   }
2056 }
2057 
2058 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2059   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2060     int vector_len = Assembler::AVX_512bit;
2061     vpmovsxbw(vtmp1, src2, vector_len);
2062     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2063   } else {
2064     assert(UseAVX >= 2, "Should not reach here.");
2065     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2066     vextracti128_high(vtmp2, src2);
2067     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2068   }
2069 }
2070 
2071 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2072   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2073   vextracti64x4_high(vtmp2, src2);
2074   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2075 }
2076 
2077 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2078   if (opcode == Op_AddReductionVI) {
2079     if (vtmp1 != src2) {
2080       movdqu(vtmp1, src2);
2081     }
2082     phaddw(vtmp1, vtmp1);
2083     phaddw(vtmp1, vtmp1);
2084   } else {
2085     pshufd(vtmp2, src2, 0x1);
2086     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2087     movdqu(vtmp1, vtmp2);
2088     psrldq(vtmp1, 2);
2089     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2090   }
2091   movdl(vtmp2, src1);
2092   pmovsxwd(vtmp1, vtmp1);
2093   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2094   pextrw(dst, vtmp1, 0x0);
2095   movswl(dst, dst);
2096 }
2097 
2098 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   if (opcode == Op_AddReductionVI) {
2100     if (vtmp1 != src2) {
2101       movdqu(vtmp1, src2);
2102     }
2103     phaddw(vtmp1, src2);
2104   } else {
2105     pshufd(vtmp1, src2, 0xE);
2106     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2107   }
2108   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2109 }
2110 
2111 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2112   if (opcode == Op_AddReductionVI) {
2113     int vector_len = Assembler::AVX_256bit;
2114     vphaddw(vtmp2, src2, src2, vector_len);
2115     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2116   } else {
2117     vextracti128_high(vtmp2, src2);
2118     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2119   }
2120   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2121 }
2122 
2123 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2124   int vector_len = Assembler::AVX_256bit;
2125   vextracti64x4_high(vtmp1, src2);
2126   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2127   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2128 }
2129 
2130 #ifdef _LP64
2131 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   pshufd(vtmp2, src2, 0xE);
2133   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2134   movdq(vtmp1, src1);
2135   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2136   movdq(dst, vtmp1);
2137 }
2138 
2139 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   vextracti128_high(vtmp1, src2);
2141   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2142   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2143 }
2144 
2145 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2146   vextracti64x4_high(vtmp2, src2);
2147   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2148   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2149 }
2150 
2151 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2152   mov64(temp, -1L);
2153   bzhiq(temp, temp, len);
2154   kmovql(dst, temp);
2155 }
2156 #endif // _LP64
2157 
2158 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2159   reduce_operation_128(T_FLOAT, opcode, dst, src);
2160   pshufd(vtmp, src, 0x1);
2161   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2162 }
2163 
2164 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2165   reduce2F(opcode, dst, src, vtmp);
2166   pshufd(vtmp, src, 0x2);
2167   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2168   pshufd(vtmp, src, 0x3);
2169   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2170 }
2171 
2172 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2173   reduce4F(opcode, dst, src, vtmp2);
2174   vextractf128_high(vtmp2, src);
2175   reduce4F(opcode, dst, vtmp2, vtmp1);
2176 }
2177 
2178 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2179   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2180   vextracti64x4_high(vtmp1, src);
2181   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2182 }
2183 
2184 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2185   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2186   pshufd(vtmp, src, 0xE);
2187   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2188 }
2189 
2190 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2191   reduce2D(opcode, dst, src, vtmp2);
2192   vextractf128_high(vtmp2, src);
2193   reduce2D(opcode, dst, vtmp2, vtmp1);
2194 }
2195 
2196 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2197   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2198   vextracti64x4_high(vtmp1, src);
2199   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2200 }
2201 
2202 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2203   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2204 }
2205 
2206 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2207   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2208 }
2209 
2210 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2211                                  int vec_enc) {
2212   switch(elem_bt) {
2213     case T_INT:
2214     case T_FLOAT:
2215       vmaskmovps(dst, src, mask, vec_enc);
2216       break;
2217     case T_LONG:
2218     case T_DOUBLE:
2219       vmaskmovpd(dst, src, mask, vec_enc);
2220       break;
2221     default:
2222       fatal("Unsupported type %s", type2name(elem_bt));
2223       break;
2224   }
2225 }
2226 
2227 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2228                                  int vec_enc) {
2229   switch(elem_bt) {
2230     case T_INT:
2231     case T_FLOAT:
2232       vmaskmovps(dst, src, mask, vec_enc);
2233       break;
2234     case T_LONG:
2235     case T_DOUBLE:
2236       vmaskmovpd(dst, src, mask, vec_enc);
2237       break;
2238     default:
2239       fatal("Unsupported type %s", type2name(elem_bt));
2240       break;
2241   }
2242 }
2243 
2244 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2245                                           XMMRegister dst, XMMRegister src,
2246                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2247                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2248   const int permconst[] = {1, 14};
2249   XMMRegister wsrc = src;
2250   XMMRegister wdst = xmm_0;
2251   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2252 
2253   int vlen_enc = Assembler::AVX_128bit;
2254   if (vlen == 16) {
2255     vlen_enc = Assembler::AVX_256bit;
2256   }
2257 
2258   for (int i = log2(vlen) - 1; i >=0; i--) {
2259     if (i == 0 && !is_dst_valid) {
2260       wdst = dst;
2261     }
2262     if (i == 3) {
2263       vextracti64x4_high(wtmp, wsrc);
2264     } else if (i == 2) {
2265       vextracti128_high(wtmp, wsrc);
2266     } else { // i = [0,1]
2267       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2268     }
2269     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2270     wsrc = wdst;
2271     vlen_enc = Assembler::AVX_128bit;
2272   }
2273   if (is_dst_valid) {
2274     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2275   }
2276 }
2277 
2278 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2279                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2280                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2281   XMMRegister wsrc = src;
2282   XMMRegister wdst = xmm_0;
2283   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2284   int vlen_enc = Assembler::AVX_128bit;
2285   if (vlen == 8) {
2286     vlen_enc = Assembler::AVX_256bit;
2287   }
2288   for (int i = log2(vlen) - 1; i >=0; i--) {
2289     if (i == 0 && !is_dst_valid) {
2290       wdst = dst;
2291     }
2292     if (i == 1) {
2293       vextracti128_high(wtmp, wsrc);
2294     } else if (i == 2) {
2295       vextracti64x4_high(wtmp, wsrc);
2296     } else {
2297       assert(i == 0, "%d", i);
2298       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2299     }
2300     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2301     wsrc = wdst;
2302     vlen_enc = Assembler::AVX_128bit;
2303   }
2304   if (is_dst_valid) {
2305     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2306   }
2307 }
2308 
2309 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2310   switch (bt) {
2311     case T_BYTE:  pextrb(dst, src, idx); break;
2312     case T_SHORT: pextrw(dst, src, idx); break;
2313     case T_INT:   pextrd(dst, src, idx); break;
2314     case T_LONG:  pextrq(dst, src, idx); break;
2315 
2316     default:
2317       assert(false,"Should not reach here.");
2318       break;
2319   }
2320 }
2321 
2322 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2323   int esize =  type2aelembytes(typ);
2324   int elem_per_lane = 16/esize;
2325   int lane = elemindex / elem_per_lane;
2326   int eindex = elemindex % elem_per_lane;
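  // Worked example (illustrative): for T_SHORT and elemindex == 11, esize == 2 and
  // elem_per_lane == 8, so lane == 1 and eindex == 3; the element lives in 128-bit
  // lane 1, which is extracted with vextractf128 below.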
2327 
2328   if (lane >= 2) {
2329     assert(UseAVX > 2, "required");
2330     vextractf32x4(dst, src, lane & 3);
2331     return dst;
2332   } else if (lane > 0) {
2333     assert(UseAVX > 0, "required");
2334     vextractf128(dst, src, lane);
2335     return dst;
2336   } else {
2337     return src;
2338   }
2339 }
2340 
2341 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2342   if (typ == T_BYTE) {
2343     movsbl(dst, dst);
2344   } else if (typ == T_SHORT) {
2345     movswl(dst, dst);
2346   }
2347 }
2348 
2349 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2350   int esize =  type2aelembytes(typ);
2351   int elem_per_lane = 16/esize;
2352   int eindex = elemindex % elem_per_lane;
2353   assert(is_integral_type(typ),"required");
2354 
2355   if (eindex == 0) {
2356     if (typ == T_LONG) {
2357       movq(dst, src);
2358     } else {
2359       movdl(dst, src);
2360       movsxl(typ, dst);
2361     }
2362   } else {
2363     extract(typ, dst, src, eindex);
2364     movsxl(typ, dst);
2365   }
2366 }
2367 
2368 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2369   int esize =  type2aelembytes(typ);
2370   int elem_per_lane = 16/esize;
2371   int eindex = elemindex % elem_per_lane;
2372   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2373 
2374   if (eindex == 0) {
2375     movq(dst, src);
2376   } else {
2377     if (typ == T_FLOAT) {
2378       if (UseAVX == 0) {
2379         movdqu(dst, src);
2380         shufps(dst, dst, eindex);
2381       } else {
2382         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2383       }
2384     } else {
2385       if (UseAVX == 0) {
2386         movdqu(dst, src);
2387         psrldq(dst, eindex*esize);
2388       } else {
2389         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2390       }
2391       movq(dst, dst);
2392     }
2393   }
2394   // Zero upper bits
2395   if (typ == T_FLOAT) {
2396     if (UseAVX == 0) {
2397       assert(vtmp != xnoreg, "required.");
2398       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2399       pand(dst, vtmp);
2400     } else {
2401       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2402     }
2403   }
2404 }
2405 
2406 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2407   switch(typ) {
2408     case T_BYTE:
2409     case T_BOOLEAN:
2410       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2411       break;
2412     case T_SHORT:
2413     case T_CHAR:
2414       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2415       break;
2416     case T_INT:
2417     case T_FLOAT:
2418       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2419       break;
2420     case T_LONG:
2421     case T_DOUBLE:
2422       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2423       break;
2424     default:
2425       assert(false,"Should not reach here.");
2426       break;
2427   }
2428 }
2429 
2430 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2431   assert(rscratch != noreg || always_reachable(src2), "missing");
2432 
2433   switch(typ) {
2434     case T_BOOLEAN:
2435     case T_BYTE:
2436       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2437       break;
2438     case T_CHAR:
2439     case T_SHORT:
2440       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2441       break;
2442     case T_INT:
2443     case T_FLOAT:
2444       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2445       break;
2446     case T_LONG:
2447     case T_DOUBLE:
2448       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2449       break;
2450     default:
2451       assert(false,"Should not reach here.");
2452       break;
2453   }
2454 }
2455 
2456 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2457   switch(typ) {
2458     case T_BYTE:
2459       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2460       break;
2461     case T_SHORT:
2462       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2463       break;
2464     case T_INT:
2465     case T_FLOAT:
2466       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2467       break;
2468     case T_LONG:
2469     case T_DOUBLE:
2470       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2471       break;
2472     default:
2473       assert(false,"Should not reach here.");
2474       break;
2475   }
2476 }
2477 
2478 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2479   assert(vlen_in_bytes <= 32, "");
2480   int esize = type2aelembytes(bt);
2481   if (vlen_in_bytes == 32) {
2482     assert(vtmp == xnoreg, "required.");
2483     if (esize >= 4) {
2484       vtestps(src1, src2, AVX_256bit);
2485     } else {
2486       vptest(src1, src2, AVX_256bit);
2487     }
2488     return;
2489   }
2490   if (vlen_in_bytes < 16) {
2491     // Duplicate the lower part to fill the whole register;
2492     // no need to do so for src2.
2493     assert(vtmp != xnoreg, "required");
2494     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2495     pshufd(vtmp, src1, shuffle_imm);
2496   } else {
2497     assert(vtmp == xnoreg, "required");
2498     vtmp = src1;
2499   }
2500   if (esize >= 4 && VM_Version::supports_avx()) {
2501     vtestps(vtmp, src2, AVX_128bit);
2502   } else {
2503     ptest(vtmp, src2);
2504   }
2505 }
2506 
2507 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2508   assert(UseAVX >= 2, "required");
2509 #ifdef ASSERT
2510   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2511   bool is_bw_supported = VM_Version::supports_avx512bw();
2512   if (is_bw && !is_bw_supported) {
2513     assert(vlen_enc != Assembler::AVX_512bit, "required");
2514     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2515            "XMM register should be 0-15");
2516   }
2517 #endif // ASSERT
2518   switch (elem_bt) {
2519     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2520     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2521     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2522     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2523     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2524     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2525     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2526   }
2527 }
2528 
2529 #ifdef _LP64
2530 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2531   assert(UseAVX >= 2, "required");
2532   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2533   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2534   if ((UseAVX > 2) &&
2535       (!is_bw || VM_Version::supports_avx512bw()) &&
2536       (!is_vl || VM_Version::supports_avx512vl())) {
2537     switch (elem_bt) {
2538       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2539       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2540       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2541       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2542       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2543     }
2544   } else {
2545     assert(vlen_enc != Assembler::AVX_512bit, "required");
2546     assert((dst->encoding() < 16),"XMM register should be 0-15");
2547     switch (elem_bt) {
2548       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2549       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2550       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2551       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2552       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2553       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2554       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2555     }
2556   }
2557 }
2558 #endif
2559 
2560 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2561   switch (to_elem_bt) {
2562     case T_SHORT:
2563       vpmovsxbw(dst, src, vlen_enc);
2564       break;
2565     case T_INT:
2566       vpmovsxbd(dst, src, vlen_enc);
2567       break;
2568     case T_FLOAT:
2569       vpmovsxbd(dst, src, vlen_enc);
2570       vcvtdq2ps(dst, dst, vlen_enc);
2571       break;
2572     case T_LONG:
2573       vpmovsxbq(dst, src, vlen_enc);
2574       break;
2575     case T_DOUBLE: {
2576       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2577       vpmovsxbd(dst, src, mid_vlen_enc);
2578       vcvtdq2pd(dst, dst, vlen_enc);
2579       break;
2580     }
2581     default:
2582       fatal("Unsupported type %s", type2name(to_elem_bt));
2583       break;
2584   }
2585 }
2586 
2587 //-------------------------------------------------------------------------------------------
2588 
2589 // IndexOf for constant substrings with size >= 8 chars
2590 // which don't need to be loaded through the stack.
2591 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2592                                          Register cnt1, Register cnt2,
2593                                          int int_cnt2,  Register result,
2594                                          XMMRegister vec, Register tmp,
2595                                          int ae) {
2596   ShortBranchVerifier sbv(this);
2597   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2598   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2599 
2600   // This method uses the pcmpestri instruction with bound registers
2601   //   inputs:
2602   //     xmm - substring
2603   //     rax - substring length (elements count)
2604   //     mem - scanned string
2605   //     rdx - string length (elements count)
2606   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2607   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2608   //   outputs:
2609   //     rcx - matched index in string
2610   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2611   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2612   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2613   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2614   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
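  // Worked example (illustrative): for ae == StrIntrinsicNode::LL both strings are Latin-1,
  // so mode == 0x0c (substring search over unsigned bytes), stride == 16 elements per
  // 16-byte vector, and both scales are times_1. For UU the data is UTF-16, so mode == 0x0d,
  // stride == 8, and both scales are times_2.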
2615 
2616   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2617         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2618         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2619 
2620   // Note, inline_string_indexOf() generates checks:
2621   // if (substr.count > string.count) return -1;
2622   // if (substr.count == 0) return 0;
2623   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2624 
2625   // Load substring.
2626   if (ae == StrIntrinsicNode::UL) {
2627     pmovzxbw(vec, Address(str2, 0));
2628   } else {
2629     movdqu(vec, Address(str2, 0));
2630   }
2631   movl(cnt2, int_cnt2);
2632   movptr(result, str1); // string addr
2633 
2634   if (int_cnt2 > stride) {
2635     jmpb(SCAN_TO_SUBSTR);
2636 
2637     // Reload substr for rescan; this code
2638     // is executed only for large substrings (> 8 chars).
2639     bind(RELOAD_SUBSTR);
2640     if (ae == StrIntrinsicNode::UL) {
2641       pmovzxbw(vec, Address(str2, 0));
2642     } else {
2643       movdqu(vec, Address(str2, 0));
2644     }
2645     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2646 
2647     bind(RELOAD_STR);
2648     // We came here after the beginning of the substring was
2649     // matched but the rest of it was not, so we need to search
2650     // again. Start from the next element after the previous match.
2651 
2652     // cnt2 is the number of remaining substring elements and
2653     // cnt1 is the number of remaining string elements when the compare failed.
2654     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2655     subl(cnt1, cnt2);
2656     addl(cnt1, int_cnt2);
2657     movl(cnt2, int_cnt2); // Now restore cnt2
2658 
2659     decrementl(cnt1);     // Shift to next element
2660     cmpl(cnt1, cnt2);
2661     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2662 
2663     addptr(result, (1<<scale1));
2664 
2665   } // (int_cnt2 > 8)
2666 
2667   // Scan string for start of substr in 16-byte vectors
2668   bind(SCAN_TO_SUBSTR);
2669   pcmpestri(vec, Address(result, 0), mode);
2670   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2671   subl(cnt1, stride);
2672   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2673   cmpl(cnt1, cnt2);
2674   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2675   addptr(result, 16);
2676   jmpb(SCAN_TO_SUBSTR);
2677 
2678   // Found a potential substr
2679   bind(FOUND_CANDIDATE);
2680   // Matched whole vector if first element matched (tmp(rcx) == 0).
2681   if (int_cnt2 == stride) {
2682     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2683   } else { // int_cnt2 > 8
2684     jccb(Assembler::overflow, FOUND_SUBSTR);
2685   }
2686   // After pcmpestri tmp(rcx) contains matched element index
2687   // Compute start addr of substr
2688   lea(result, Address(result, tmp, scale1));
2689 
2690   // Make sure string is still long enough
2691   subl(cnt1, tmp);
2692   cmpl(cnt1, cnt2);
2693   if (int_cnt2 == stride) {
2694     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2695   } else { // int_cnt2 > 8
2696     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2697   }
2698   // Left less than substring.
2699 
2700   bind(RET_NOT_FOUND);
2701   movl(result, -1);
2702   jmp(EXIT);
2703 
2704   if (int_cnt2 > stride) {
2705     // This code is optimized for the case when whole substring
2706     // is matched if its head is matched.
2707     bind(MATCH_SUBSTR_HEAD);
2708     pcmpestri(vec, Address(result, 0), mode);
2709     // Reload only the string if it does not match
2710     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2711 
2712     Label CONT_SCAN_SUBSTR;
2713     // Compare the rest of substring (> 8 chars).
2714     bind(FOUND_SUBSTR);
2715     // First 8 chars are already matched.
2716     negptr(cnt2);
2717     addptr(cnt2, stride);
2718 
2719     bind(SCAN_SUBSTR);
2720     subl(cnt1, stride);
2721     cmpl(cnt2, -stride); // Do not read beyond substring
2722     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2723     // Back-up strings to avoid reading beyond substring:
2724     // cnt1 = cnt1 - cnt2 + 8
2725     addl(cnt1, cnt2); // cnt2 is negative
2726     addl(cnt1, stride);
2727     movl(cnt2, stride); negptr(cnt2);
2728     bind(CONT_SCAN_SUBSTR);
2729     if (int_cnt2 < (int)G) {
2730       int tail_off1 = int_cnt2<<scale1;
2731       int tail_off2 = int_cnt2<<scale2;
2732       if (ae == StrIntrinsicNode::UL) {
2733         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2734       } else {
2735         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2736       }
2737       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2738     } else {
2739       // calculate index in register to avoid integer overflow (int_cnt2*2)
2740       movl(tmp, int_cnt2);
2741       addptr(tmp, cnt2);
2742       if (ae == StrIntrinsicNode::UL) {
2743         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2744       } else {
2745         movdqu(vec, Address(str2, tmp, scale2, 0));
2746       }
2747       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2748     }
2749     // Need to reload string pointers if we did not match the whole vector
2750     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2751     addptr(cnt2, stride);
2752     jcc(Assembler::negative, SCAN_SUBSTR);
2753     // Fall through if found full substring
2754 
2755   } // (int_cnt2 > 8)
2756 
2757   bind(RET_FOUND);
2758   // Found result if we matched full small substring.
2759   // Compute substr offset
2760   subptr(result, str1);
2761   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2762     shrl(result, 1); // index
2763   }
2764   bind(EXIT);
2765 
2766 } // string_indexofC8
2767 
2768 // Small strings are loaded through the stack if they cross a page boundary.
2769 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2770                                        Register cnt1, Register cnt2,
2771                                        int int_cnt2,  Register result,
2772                                        XMMRegister vec, Register tmp,
2773                                        int ae) {
2774   ShortBranchVerifier sbv(this);
2775   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2776   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2777 
2778   //
2779   // int_cnt2 is the length of a small (< 8 chars) constant substring
2780   // or (-1) for a non-constant substring, in which case its length
2781   // is in the cnt2 register.
2782   //
2783   // Note, inline_string_indexOf() generates checks:
2784   // if (substr.count > string.count) return -1;
2785   // if (substr.count == 0) return 0;
2786   //
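       //
       // For reference, a Java-level sketch of the search performed here
       // (a simplification with illustrative names, not the actual library
       // source; the two checks above are assumed done by the caller):
       //
       //   static int indexOf(char[] string, int strCount, char[] substr, int subCount) {
       //     for (int i = 0; i <= strCount - subCount; i++) {
       //       int j = 0;
       //       while (j < subCount && string[i + j] == substr[j]) {
       //         j++;
       //       }
       //       if (j == subCount) {
       //         return i;    // substring starts at index i
       //       }
       //     }
       //     return -1;       // not found
       //   }
       //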
2787   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2788   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2789   // This method uses the pcmpestri instruction with bound registers
2790   //   inputs:
2791   //     xmm - substring
2792   //     rax - substring length (elements count)
2793   //     mem - scanned string
2794   //     rdx - string length (elements count)
2795   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2796   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2797   //   outputs:
2798   //     rcx - matched index in string
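       //
       // For example, the UU/UL imm8 value 0x0d used below decodes, per the
       // SSE4.2 PCMPESTRI definition, into: bits[1:0] = 01 (unsigned words),
       // bits[3:2] = 11 (equal ordered, i.e. substring search), bits[5:4] = 00
       // (positive polarity) and bit 6 = 0 (least significant index in rcx).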
2799   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2800   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2801   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2802   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2803 
2804   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2805         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2806         FOUND_CANDIDATE;
2807 
2808   { //========================================================
2809     // We don't know where these strings are located
2810     // and we can't read beyond them. Load them through the stack.
2811     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2812 
2813     movptr(tmp, rsp); // save old SP
2814 
2815     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2816       if (int_cnt2 == (1>>scale2)) { // One byte
2817         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2818         load_unsigned_byte(result, Address(str2, 0));
2819         movdl(vec, result); // move 32 bits
2820       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2821         // Not enough header space in 32-bit VM: 12+3 = 15.
2822         movl(result, Address(str2, -1));
2823         shrl(result, 8);
2824         movdl(vec, result); // move 32 bits
2825       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2826         load_unsigned_short(result, Address(str2, 0));
2827         movdl(vec, result); // move 32 bits
2828       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2829         movdl(vec, Address(str2, 0)); // move 32 bits
2830       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2831         movq(vec, Address(str2, 0));  // move 64 bits
2832       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2833         // Array header size is 12 bytes in 32-bit VM
2834         // + 6 bytes for 3 chars == 18 bytes,
2835         // enough space to load vec and shift.
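             // For example (LL, int_cnt2 == 3): the else branch below loads the
             // 16 bytes ending at str2+3 and shifts vec right by 13 bytes, so
             // the 3 substring bytes end up in the low lanes of vec.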
2836         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2837         if (ae == StrIntrinsicNode::UL) {
2838           int tail_off = int_cnt2-8;
2839           pmovzxbw(vec, Address(str2, tail_off));
2840           psrldq(vec, -2*tail_off);
2841         }
2842         else {
2843           int tail_off = int_cnt2*(1<<scale2);
2844           movdqu(vec, Address(str2, tail_off-16));
2845           psrldq(vec, 16-tail_off);
2846         }
2847       }
2848     } else { // not constant substring
2849       cmpl(cnt2, stride);
2850       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2851 
2852       // We can read beyond the string if str2+16 does not cross a page boundary
2853       // since heaps are aligned and mapped by pages.
2854       assert(os::vm_page_size() < (int)G, "default page should be small");
2855       movl(result, str2); // We need only low 32 bits
2856       andl(result, ((int)os::vm_page_size()-1));
2857       cmpl(result, ((int)os::vm_page_size()-16));
2858       jccb(Assembler::belowEqual, CHECK_STR);
2859 
2860       // Move small strings to the stack to allow loading 16 bytes into vec.
2861       subptr(rsp, 16);
2862       int stk_offset = wordSize-(1<<scale2);
2863       push(cnt2);
2864 
2865       bind(COPY_SUBSTR);
2866       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2867         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2868         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2869       } else if (ae == StrIntrinsicNode::UU) {
2870         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2871         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2872       }
2873       decrement(cnt2);
2874       jccb(Assembler::notZero, COPY_SUBSTR);
2875 
2876       pop(cnt2);
2877       movptr(str2, rsp);  // New substring address
2878     } // non constant
2879 
2880     bind(CHECK_STR);
2881     cmpl(cnt1, stride);
2882     jccb(Assembler::aboveEqual, BIG_STRINGS);
2883 
2884     // Check for crossing a page boundary.
2885     movl(result, str1); // We need only low 32 bits
2886     andl(result, ((int)os::vm_page_size()-1));
2887     cmpl(result, ((int)os::vm_page_size()-16));
2888     jccb(Assembler::belowEqual, BIG_STRINGS);
2889 
2890     subptr(rsp, 16);
2891     int stk_offset = -(1<<scale1);
2892     if (int_cnt2 < 0) { // not constant
2893       push(cnt2);
2894       stk_offset += wordSize;
2895     }
2896     movl(cnt2, cnt1);
2897 
2898     bind(COPY_STR);
2899     if (ae == StrIntrinsicNode::LL) {
2900       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2901       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2902     } else {
2903       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2904       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2905     }
2906     decrement(cnt2);
2907     jccb(Assembler::notZero, COPY_STR);
2908 
2909     if (int_cnt2 < 0) { // not constant
2910       pop(cnt2);
2911     }
2912     movptr(str1, rsp);  // New string address
2913 
2914     bind(BIG_STRINGS);
2915     // Load substring.
2916     if (int_cnt2 < 0) { // -1
2917       if (ae == StrIntrinsicNode::UL) {
2918         pmovzxbw(vec, Address(str2, 0));
2919       } else {
2920         movdqu(vec, Address(str2, 0));
2921       }
2922       push(cnt2);       // substr count
2923       push(str2);       // substr addr
2924       push(str1);       // string addr
2925     } else {
2926       // Small (< 8 chars) constant substrings are loaded already.
2927       movl(cnt2, int_cnt2);
2928     }
2929     push(tmp);  // original SP
2930 
2931   } // Finished loading
2932 
2933   //========================================================
2934   // Start search
2935   //
2936 
2937   movptr(result, str1); // string addr
2938 
2939   if (int_cnt2  < 0) {  // Only for non constant substring
2940     jmpb(SCAN_TO_SUBSTR);
2941 
2942     // SP saved at sp+0
2943     // String saved at sp+1*wordSize
2944     // Substr saved at sp+2*wordSize
2945     // Substr count saved at sp+3*wordSize
2946 
2947     // Reload substr for rescan; this code
2948     // is executed only for large substrings (> 8 chars)
2949     bind(RELOAD_SUBSTR);
2950     movptr(str2, Address(rsp, 2*wordSize));
2951     movl(cnt2, Address(rsp, 3*wordSize));
2952     if (ae == StrIntrinsicNode::UL) {
2953       pmovzxbw(vec, Address(str2, 0));
2954     } else {
2955       movdqu(vec, Address(str2, 0));
2956     }
2957     // We came here after the beginning of the substring was
2958     // matched but the rest of it was not, so we need to search
2959     // again. Start from the next element after the previous match.
2960     subptr(str1, result); // Restore counter
2961     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2962       shrl(str1, 1);
2963     }
2964     addl(cnt1, str1);
2965     decrementl(cnt1);   // Shift to next element
2966     cmpl(cnt1, cnt2);
2967     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2968 
2969     addptr(result, (1<<scale1));
2970   } // non constant
2971 
2972   // Scan string for start of substr in 16-byte vectors
2973   bind(SCAN_TO_SUBSTR);
2974   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2975   pcmpestri(vec, Address(result, 0), mode);
2976   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2977   subl(cnt1, stride);
2978   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2979   cmpl(cnt1, cnt2);
2980   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2981   addptr(result, 16);
2982 
2983   bind(ADJUST_STR);
2984   cmpl(cnt1, stride); // Do not read beyond string
2985   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2986   // Back-up string to avoid reading beyond string.
2987   lea(result, Address(result, cnt1, scale1, -16));
2988   movl(cnt1, stride);
2989   jmpb(SCAN_TO_SUBSTR);
2990 
2991   // Found a potential substr
2992   bind(FOUND_CANDIDATE);
2993   // After pcmpestri tmp(rcx) contains matched element index
2994 
2995   // Make sure string is still long enough
2996   subl(cnt1, tmp);
2997   cmpl(cnt1, cnt2);
2998   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2999   // Left less than substring.
3000 
3001   bind(RET_NOT_FOUND);
3002   movl(result, -1);
3003   jmp(CLEANUP);
3004 
3005   bind(FOUND_SUBSTR);
3006   // Compute start addr of substr
3007   lea(result, Address(result, tmp, scale1));
3008   if (int_cnt2 > 0) { // Constant substring
3009     // Repeat search for small substring (< 8 chars)
3010     // from new point without reloading substring.
3011     // Have to check that we don't read beyond string.
3012     cmpl(tmp, stride-int_cnt2);
3013     jccb(Assembler::greater, ADJUST_STR);
3014     // Fall through if matched whole substring.
3015   } else { // non constant
3016     assert(int_cnt2 == -1, "should be != 0");
3017 
3018     addl(tmp, cnt2);
3019     // Found result if we matched whole substring.
3020     cmpl(tmp, stride);
3021     jcc(Assembler::lessEqual, RET_FOUND);
3022 
3023     // Repeat search for small substring (<= 8 chars)
3024     // from new point 'str1' without reloading substring.
3025     cmpl(cnt2, stride);
3026     // Have to check that we don't read beyond string.
3027     jccb(Assembler::lessEqual, ADJUST_STR);
3028 
3029     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3030     // Compare the rest of substring (> 8 chars).
3031     movptr(str1, result);
3032 
3033     cmpl(tmp, cnt2);
3034     // First 8 chars are already matched.
3035     jccb(Assembler::equal, CHECK_NEXT);
3036 
3037     bind(SCAN_SUBSTR);
3038     pcmpestri(vec, Address(str1, 0), mode);
3039     // Need to reload string pointers if we did not match the whole vector
3040     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3041 
3042     bind(CHECK_NEXT);
3043     subl(cnt2, stride);
3044     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3045     addptr(str1, 16);
3046     if (ae == StrIntrinsicNode::UL) {
3047       addptr(str2, 8);
3048     } else {
3049       addptr(str2, 16);
3050     }
3051     subl(cnt1, stride);
3052     cmpl(cnt2, stride); // Do not read beyond substring
3053     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3054     // Back-up strings to avoid reading beyond substring.
3055 
3056     if (ae == StrIntrinsicNode::UL) {
3057       lea(str2, Address(str2, cnt2, scale2, -8));
3058       lea(str1, Address(str1, cnt2, scale1, -16));
3059     } else {
3060       lea(str2, Address(str2, cnt2, scale2, -16));
3061       lea(str1, Address(str1, cnt2, scale1, -16));
3062     }
3063     subl(cnt1, cnt2);
3064     movl(cnt2, stride);
3065     addl(cnt1, stride);
3066     bind(CONT_SCAN_SUBSTR);
3067     if (ae == StrIntrinsicNode::UL) {
3068       pmovzxbw(vec, Address(str2, 0));
3069     } else {
3070       movdqu(vec, Address(str2, 0));
3071     }
3072     jmp(SCAN_SUBSTR);
3073 
3074     bind(RET_FOUND_LONG);
3075     movptr(str1, Address(rsp, wordSize));
3076   } // non constant
3077 
3078   bind(RET_FOUND);
3079   // Compute substr offset
3080   subptr(result, str1);
3081   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3082     shrl(result, 1); // index
3083   }
3084   bind(CLEANUP);
3085   pop(rsp); // restore SP
3086 
3087 } // string_indexof
3088 
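     // Find the first occurrence of the char 'ch' among the cnt1 chars starting
     // at str1 and return its index, or -1 if it is absent. A scalar Java-level
     // sketch of the result (names are illustrative, not the library source):
     //
     //   static int indexOfChar(char[] value, int cnt, int ch) {
     //     for (int i = 0; i < cnt; i++) {
     //       if (value[i] == ch) {
     //         return i;
     //       }
     //     }
     //     return -1;
     //   }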
3089 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3090                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3091   ShortBranchVerifier sbv(this);
3092   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3093 
3094   int stride = 8;
3095 
3096   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3097         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3098         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3099         FOUND_SEQ_CHAR, DONE_LABEL;
3100 
3101   movptr(result, str1);
3102   if (UseAVX >= 2) {
3103     cmpl(cnt1, stride);
3104     jcc(Assembler::less, SCAN_TO_CHAR);
3105     cmpl(cnt1, 2*stride);
3106     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3107     movdl(vec1, ch);
3108     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3109     vpxor(vec2, vec2);
3110     movl(tmp, cnt1);
3111     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3112     andl(cnt1,0x0000000F);  //tail count (in chars)
3113 
3114     bind(SCAN_TO_16_CHAR_LOOP);
3115     vmovdqu(vec3, Address(result, 0));
3116     vpcmpeqw(vec3, vec3, vec1, 1);
3117     vptest(vec2, vec3);
3118     jcc(Assembler::carryClear, FOUND_CHAR);
3119     addptr(result, 32);
3120     subl(tmp, 2*stride);
3121     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3122     jmp(SCAN_TO_8_CHAR);
3123     bind(SCAN_TO_8_CHAR_INIT);
3124     movdl(vec1, ch);
3125     pshuflw(vec1, vec1, 0x00);
3126     pshufd(vec1, vec1, 0);
3127     pxor(vec2, vec2);
3128   }
3129   bind(SCAN_TO_8_CHAR);
3130   cmpl(cnt1, stride);
3131   jcc(Assembler::less, SCAN_TO_CHAR);
3132   if (UseAVX < 2) {
3133     movdl(vec1, ch);
3134     pshuflw(vec1, vec1, 0x00);
3135     pshufd(vec1, vec1, 0);
3136     pxor(vec2, vec2);
3137   }
3138   movl(tmp, cnt1);
3139   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3140   andl(cnt1,0x00000007);  //tail count (in chars)
3141 
3142   bind(SCAN_TO_8_CHAR_LOOP);
3143   movdqu(vec3, Address(result, 0));
3144   pcmpeqw(vec3, vec1);
3145   ptest(vec2, vec3);
3146   jcc(Assembler::carryClear, FOUND_CHAR);
3147   addptr(result, 16);
3148   subl(tmp, stride);
3149   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3150   bind(SCAN_TO_CHAR);
3151   testl(cnt1, cnt1);
3152   jcc(Assembler::zero, RET_NOT_FOUND);
3153   bind(SCAN_TO_CHAR_LOOP);
3154   load_unsigned_short(tmp, Address(result, 0));
3155   cmpl(ch, tmp);
3156   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3157   addptr(result, 2);
3158   subl(cnt1, 1);
3159   jccb(Assembler::zero, RET_NOT_FOUND);
3160   jmp(SCAN_TO_CHAR_LOOP);
3161 
3162   bind(RET_NOT_FOUND);
3163   movl(result, -1);
3164   jmpb(DONE_LABEL);
3165 
3166   bind(FOUND_CHAR);
3167   if (UseAVX >= 2) {
3168     vpmovmskb(tmp, vec3);
3169   } else {
3170     pmovmskb(tmp, vec3);
3171   }
3172   bsfl(ch, tmp);
3173   addptr(result, ch);
3174 
3175   bind(FOUND_SEQ_CHAR);
3176   subptr(result, str1);
3177   shrl(result, 1);
3178 
3179   bind(DONE_LABEL);
3180 } // string_indexof_char
3181 
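     // Latin-1 variant of the scan above: find the first occurrence of the byte
     // 'ch' among the cnt1 bytes starting at str1 and return its index, or -1 if
     // it is absent. A scalar Java-level sketch (names are illustrative):
     //
     //   static int indexOfByte(byte[] value, int cnt, int ch) {
     //     for (int i = 0; i < cnt; i++) {
     //       if ((value[i] & 0xff) == ch) {
     //         return i;
     //       }
     //     }
     //     return -1;
     //   }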
3182 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3183                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3184   ShortBranchVerifier sbv(this);
3185   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3186 
3187   int stride = 16;
3188 
3189   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3190         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3191         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3192         FOUND_SEQ_CHAR, DONE_LABEL;
3193 
3194   movptr(result, str1);
3195   if (UseAVX >= 2) {
3196     cmpl(cnt1, stride);
3197     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3198     cmpl(cnt1, stride*2);
3199     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3200     movdl(vec1, ch);
3201     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3202     vpxor(vec2, vec2);
3203     movl(tmp, cnt1);
3204     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3205     andl(cnt1,0x0000001F);  //tail count (in chars)
3206 
3207     bind(SCAN_TO_32_CHAR_LOOP);
3208     vmovdqu(vec3, Address(result, 0));
3209     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3210     vptest(vec2, vec3);
3211     jcc(Assembler::carryClear, FOUND_CHAR);
3212     addptr(result, 32);
3213     subl(tmp, stride*2);
3214     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3215     jmp(SCAN_TO_16_CHAR);
3216 
3217     bind(SCAN_TO_16_CHAR_INIT);
3218     movdl(vec1, ch);
3219     pxor(vec2, vec2);
3220     pshufb(vec1, vec2);
3221   }
3222 
3223   bind(SCAN_TO_16_CHAR);
3224   cmpl(cnt1, stride);
3225   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3226   if (UseAVX < 2) {
3227     movdl(vec1, ch);
3228     pxor(vec2, vec2);
3229     pshufb(vec1, vec2);
3230   }
3231   movl(tmp, cnt1);
3232   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3233   andl(cnt1,0x0000000F);  //tail count (in bytes)
3234 
3235   bind(SCAN_TO_16_CHAR_LOOP);
3236   movdqu(vec3, Address(result, 0));
3237   pcmpeqb(vec3, vec1);
3238   ptest(vec2, vec3);
3239   jcc(Assembler::carryClear, FOUND_CHAR);
3240   addptr(result, 16);
3241   subl(tmp, stride);
3242   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3243 
3244   bind(SCAN_TO_CHAR_INIT);
3245   testl(cnt1, cnt1);
3246   jcc(Assembler::zero, RET_NOT_FOUND);
3247   bind(SCAN_TO_CHAR_LOOP);
3248   load_unsigned_byte(tmp, Address(result, 0));
3249   cmpl(ch, tmp);
3250   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3251   addptr(result, 1);
3252   subl(cnt1, 1);
3253   jccb(Assembler::zero, RET_NOT_FOUND);
3254   jmp(SCAN_TO_CHAR_LOOP);
3255 
3256   bind(RET_NOT_FOUND);
3257   movl(result, -1);
3258   jmpb(DONE_LABEL);
3259 
3260   bind(FOUND_CHAR);
3261   if (UseAVX >= 2) {
3262     vpmovmskb(tmp, vec3);
3263   } else {
3264     pmovmskb(tmp, vec3);
3265   }
3266   bsfl(ch, tmp);
3267   addptr(result, ch);
3268 
3269   bind(FOUND_SEQ_CHAR);
3270   subptr(result, str1);
3271 
3272   bind(DONE_LABEL);
3273 } // stringL_indexof_char
3274 
3275 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3276   switch (eltype) {
3277   case T_BOOLEAN: return sizeof(jboolean);
3278   case T_BYTE:  return sizeof(jbyte);
3279   case T_SHORT: return sizeof(jshort);
3280   case T_CHAR:  return sizeof(jchar);
3281   case T_INT:   return sizeof(jint);
3282   default:
3283     ShouldNotReachHere();
3284     return -1;
3285   }
3286 }
3287 
3288 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3289   switch (eltype) {
3290   // T_BOOLEAN used as surrogate for unsigned byte
3291   case T_BOOLEAN: movzbl(dst, src);   break;
3292   case T_BYTE:    movsbl(dst, src);   break;
3293   case T_SHORT:   movswl(dst, src);   break;
3294   case T_CHAR:    movzwl(dst, src);   break;
3295   case T_INT:     movl(dst, src);     break;
3296   default:
3297     ShouldNotReachHere();
3298   }
3299 }
3300 
3301 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3302   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3303 }
3304 
3305 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3306   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3307 }
3308 
3309 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3310   const int vlen = Assembler::AVX_256bit;
3311   switch (eltype) {
3312   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3313   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3314   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3315   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3316   case T_INT:
3317     // do nothing
3318     break;
3319   default:
3320     ShouldNotReachHere();
3321   }
3322 }
3323 
3324 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3325                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3326                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3327                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3328                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3329                                         BasicType eltype) {
3330   ShortBranchVerifier sbv(this);
3331   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3332   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3333   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3334 
3335   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3336         SHORT_UNROLLED_LOOP_EXIT,
3337         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3338         UNROLLED_VECTOR_LOOP_BEGIN,
3339         END;
3340   switch (eltype) {
3341   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3342   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3343   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3344   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3345   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3346   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3347   }
3348 
3349   // Register "renaming" for readability of the code
3350   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3351                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3352                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3353 
3354   const int elsize = arrays_hashcode_elsize(eltype);
3355 
3356   /*
3357     if (cnt1 >= 2) {
3358       if (cnt1 >= 32) {
3359         UNROLLED VECTOR LOOP
3360       }
3361       UNROLLED SCALAR LOOP
3362     }
3363     SINGLE SCALAR
3364    */
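       /*
         Scalar reference for the hash computed here (a sketch; it assumes the
         usual 31-based polynomial hash, 'a' and 'n' are illustrative names):

           int h = result;               // incoming initial value
           for (int i = 0; i < n; i++) {
             h = 31 * h + a[i];
           }
           return h;                     // == result * 31^n + sum of a[i] * 31^(n-1-i)

         The unrolled vector loop below keeps four 8-lane partial sums, folds in
         vnext (presumably 31^32) once per 32-element iteration, and finally
         weights each lane with its power of 31 before reducing into result.
        */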
3365 
3366   cmpl(cnt1, 32);
3367   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3368 
3369   // cnt1 >= 32 && generate_vectorized_loop
3370   xorl(index, index);
3371 
3372   // vresult = IntVector.zero(I256);
3373   for (int idx = 0; idx < 4; idx++) {
3374     vpxor(vresult[idx], vresult[idx]);
3375   }
3376   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3377   Register bound = tmp2;
3378   Register next = tmp3;
3379   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3380   movl(next, Address(tmp2, 0));
3381   movdl(vnext, next);
3382   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3383 
3384   // index = 0;
3385   // bound = cnt1 & ~(32 - 1);
3386   movl(bound, cnt1);
3387   andl(bound, ~(32 - 1));
3388   // for (; index < bound; index += 32) {
3389   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3390   // result *= next;
3391   imull(result, next);
3392   // loop fission to front-load the cost of fetching from memory; OOO execution
3393   // can then hopefully do a better job of prefetching
3394   for (int idx = 0; idx < 4; idx++) {
3395     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3396   }
3397   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3398   for (int idx = 0; idx < 4; idx++) {
3399     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3400     arrays_hashcode_elvcast(vtmp[idx], eltype);
3401     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3402   }
3403   // index += 32;
3404   addl(index, 32);
3405   // index < bound;
3406   cmpl(index, bound);
3407   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3408   // }
3409 
3410   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3411   subl(cnt1, bound);
3412   // release bound
3413 
3414   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3415   for (int idx = 0; idx < 4; idx++) {
3416     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3417     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3418     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3419   }
3420   // result += vresult.reduceLanes(ADD);
3421   for (int idx = 0; idx < 4; idx++) {
3422     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3423   }
3424 
3425   // } else if (cnt1 < 32) {
3426 
3427   bind(SHORT_UNROLLED_BEGIN);
3428   // int i = 1;
3429   movl(index, 1);
3430   cmpl(index, cnt1);
3431   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3432 
3433   // for (; i < cnt1 ; i += 2) {
3434   bind(SHORT_UNROLLED_LOOP_BEGIN);
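       // Two elements per iteration:
       //   result = 31*31*result + 31*a[index-1] + a[index],
       // with 31*x computed below as (x << 5) - x and 961 == 31*31.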
3435   movl(tmp3, 961);
3436   imull(result, tmp3);
3437   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3438   movl(tmp3, tmp2);
3439   shll(tmp3, 5);
3440   subl(tmp3, tmp2);
3441   addl(result, tmp3);
3442   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3443   addl(result, tmp3);
3444   addl(index, 2);
3445   cmpl(index, cnt1);
3446   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3447 
3448   // }
3449   // if (i >= cnt1) {
3450   bind(SHORT_UNROLLED_LOOP_EXIT);
3451   jccb(Assembler::greater, END);
3452   movl(tmp2, result);
3453   shll(result, 5);
3454   subl(result, tmp2);
3455   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3456   addl(result, tmp3);
3457   // }
3458   bind(END);
3459 
3460   BLOCK_COMMENT("} // arrays_hashcode");
3461 
3462 } // arrays_hashcode
3463 
3464 // helper function for string_compare
3465 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3466                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3467                                            Address::ScaleFactor scale2, Register index, int ae) {
3468   if (ae == StrIntrinsicNode::LL) {
3469     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3470     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3471   } else if (ae == StrIntrinsicNode::UU) {
3472     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3473     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3474   } else {
3475     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3476     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3477   }
3478 }
3479 
3480 // Compare strings, used for char[] and byte[].
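     // A scalar Java-level sketch of the value computed here (simplified; names
     // are illustrative, and the vectorized prefix scan is not shown):
     //
     //   static int compare(char[] str1, int cnt1, char[] str2, int cnt2) {
     //     int min = Math.min(cnt1, cnt2);
     //     for (int i = 0; i < min; i++) {
     //       if (str1[i] != str2[i]) {
     //         return str1[i] - str2[i];   // first mismatching element decides
     //       }
     //     }
     //     return cnt1 - cnt2;             // equal prefix: length difference decides
     //   }
     //
     // For the UL case the arguments appear to be passed swapped (str1 is always
     // the Latin-1 string in the mixed case), hence the final negl(result).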
3481 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3482                                        Register cnt1, Register cnt2, Register result,
3483                                        XMMRegister vec1, int ae, KRegister mask) {
3484   ShortBranchVerifier sbv(this);
3485   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3486   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3487   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3488   int stride2x2 = 0x40;
3489   Address::ScaleFactor scale = Address::no_scale;
3490   Address::ScaleFactor scale1 = Address::no_scale;
3491   Address::ScaleFactor scale2 = Address::no_scale;
3492 
3493   if (ae != StrIntrinsicNode::LL) {
3494     stride2x2 = 0x20;
3495   }
3496 
3497   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3498     shrl(cnt2, 1);
3499   }
3500   // Compute the minimum of the string lengths and the
3501   // difference of the string lengths (stack).
3502   // Do the conditional move stuff
3503   movl(result, cnt1);
3504   subl(cnt1, cnt2);
3505   push(cnt1);
3506   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3507 
3508   // Is the minimum length zero?
3509   testl(cnt2, cnt2);
3510   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3511   if (ae == StrIntrinsicNode::LL) {
3512     // Load first bytes
3513     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3514     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3515   } else if (ae == StrIntrinsicNode::UU) {
3516     // Load first characters
3517     load_unsigned_short(result, Address(str1, 0));
3518     load_unsigned_short(cnt1, Address(str2, 0));
3519   } else {
3520     load_unsigned_byte(result, Address(str1, 0));
3521     load_unsigned_short(cnt1, Address(str2, 0));
3522   }
3523   subl(result, cnt1);
3524   jcc(Assembler::notZero,  POP_LABEL);
3525 
3526   if (ae == StrIntrinsicNode::UU) {
3527     // Divide length by 2 to get number of chars
3528     shrl(cnt2, 1);
3529   }
3530   cmpl(cnt2, 1);
3531   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3532 
3533   // Check if the strings start at the same location and setup scale and stride
3534   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3535     cmpptr(str1, str2);
3536     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3537     if (ae == StrIntrinsicNode::LL) {
3538       scale = Address::times_1;
3539       stride = 16;
3540     } else {
3541       scale = Address::times_2;
3542       stride = 8;
3543     }
3544   } else {
3545     scale1 = Address::times_1;
3546     scale2 = Address::times_2;
3547     // scale not used
3548     stride = 8;
3549   }
3550 
3551   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3552     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3553     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3554     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3555     Label COMPARE_TAIL_LONG;
3556     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3557 
3558     int pcmpmask = 0x19;
3559     if (ae == StrIntrinsicNode::LL) {
3560       pcmpmask &= ~0x01;
3561     }
3562 
3563     // Set up to compare 16-char (32-byte) vectors;
3564     // start from the first character again because it has an aligned address.
3565     if (ae == StrIntrinsicNode::LL) {
3566       stride2 = 32;
3567     } else {
3568       stride2 = 16;
3569     }
3570     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3571       adr_stride = stride << scale;
3572     } else {
3573       adr_stride1 = 8;  //stride << scale1;
3574       adr_stride2 = 16; //stride << scale2;
3575     }
3576 
3577     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3578     // rax and rdx are used by pcmpestri as element counters
3579     movl(result, cnt2);
3580     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3581     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3582 
3583     // fast path : compare first 2 8-char vectors.
3584     bind(COMPARE_16_CHARS);
3585     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3586       movdqu(vec1, Address(str1, 0));
3587     } else {
3588       pmovzxbw(vec1, Address(str1, 0));
3589     }
3590     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3591     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3592 
3593     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3594       movdqu(vec1, Address(str1, adr_stride));
3595       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3596     } else {
3597       pmovzxbw(vec1, Address(str1, adr_stride1));
3598       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3599     }
3600     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3601     addl(cnt1, stride);
3602 
3603     // Compare the characters at index in cnt1
3604     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3605     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3606     subl(result, cnt2);
3607     jmp(POP_LABEL);
3608 
3609     // Setup the registers to start vector comparison loop
3610     bind(COMPARE_WIDE_VECTORS);
3611     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3612       lea(str1, Address(str1, result, scale));
3613       lea(str2, Address(str2, result, scale));
3614     } else {
3615       lea(str1, Address(str1, result, scale1));
3616       lea(str2, Address(str2, result, scale2));
3617     }
3618     subl(result, stride2);
3619     subl(cnt2, stride2);
3620     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3621     negptr(result);
3622 
3623     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3624     bind(COMPARE_WIDE_VECTORS_LOOP);
3625 
3626 #ifdef _LP64
3627     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3628       cmpl(cnt2, stride2x2);
3629       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3630       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3631       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3632 
3633       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3634       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3635         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3636         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3637       } else {
3638         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3639         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3640       }
3641       kortestql(mask, mask);
3642       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3643       addptr(result, stride2x2);  // update since we already compared at this addr
3644       subl(cnt2, stride2x2);      // and sub the size too
3645       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3646 
3647       vpxor(vec1, vec1);
3648       jmpb(COMPARE_WIDE_TAIL);
3649     }//if (VM_Version::supports_avx512vlbw())
3650 #endif // _LP64
3651 
3652 
3653     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3654     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3655       vmovdqu(vec1, Address(str1, result, scale));
3656       vpxor(vec1, Address(str2, result, scale));
3657     } else {
3658       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3659       vpxor(vec1, Address(str2, result, scale2));
3660     }
3661     vptest(vec1, vec1);
3662     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3663     addptr(result, stride2);
3664     subl(cnt2, stride2);
3665     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3666     // clean upper bits of YMM registers
3667     vpxor(vec1, vec1);
3668 
3669     // compare wide vectors tail
3670     bind(COMPARE_WIDE_TAIL);
3671     testptr(result, result);
3672     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3673 
3674     movl(result, stride2);
3675     movl(cnt2, result);
3676     negptr(result);
3677     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3678 
3679     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3680     bind(VECTOR_NOT_EQUAL);
3681     // clean upper bits of YMM registers
3682     vpxor(vec1, vec1);
3683     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3684       lea(str1, Address(str1, result, scale));
3685       lea(str2, Address(str2, result, scale));
3686     } else {
3687       lea(str1, Address(str1, result, scale1));
3688       lea(str2, Address(str2, result, scale2));
3689     }
3690     jmp(COMPARE_16_CHARS);
3691 
3692     // Compare tail chars, length between 1 and 15 chars
3693     bind(COMPARE_TAIL_LONG);
3694     movl(cnt2, result);
3695     cmpl(cnt2, stride);
3696     jcc(Assembler::less, COMPARE_SMALL_STR);
3697 
3698     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3699       movdqu(vec1, Address(str1, 0));
3700     } else {
3701       pmovzxbw(vec1, Address(str1, 0));
3702     }
3703     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3704     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3705     subptr(cnt2, stride);
3706     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3707     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3708       lea(str1, Address(str1, result, scale));
3709       lea(str2, Address(str2, result, scale));
3710     } else {
3711       lea(str1, Address(str1, result, scale1));
3712       lea(str2, Address(str2, result, scale2));
3713     }
3714     negptr(cnt2);
3715     jmpb(WHILE_HEAD_LABEL);
3716 
3717     bind(COMPARE_SMALL_STR);
3718   } else if (UseSSE42Intrinsics) {
3719     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3720     int pcmpmask = 0x19;
3721     // Set up to compare 8-char (16-byte) vectors;
3722     // start from the first character again because it has an aligned address.
3723     movl(result, cnt2);
3724     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3725     if (ae == StrIntrinsicNode::LL) {
3726       pcmpmask &= ~0x01;
3727     }
3728     jcc(Assembler::zero, COMPARE_TAIL);
3729     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3730       lea(str1, Address(str1, result, scale));
3731       lea(str2, Address(str2, result, scale));
3732     } else {
3733       lea(str1, Address(str1, result, scale1));
3734       lea(str2, Address(str2, result, scale2));
3735     }
3736     negptr(result);
3737 
3738     // pcmpestri
3739     //   inputs:
3740     //     vec1 - substring
3741     //     rax - negative string length (elements count)
3742     //     mem - scanned string
3743     //     rdx - string length (elements count)
3744     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3745     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3746     //   outputs:
3747     //     rcx - first mismatched element index
3748     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3749 
3750     bind(COMPARE_WIDE_VECTORS);
3751     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3752       movdqu(vec1, Address(str1, result, scale));
3753       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3754     } else {
3755       pmovzxbw(vec1, Address(str1, result, scale1));
3756       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3757     }
3758     // After pcmpestri cnt1(rcx) contains mismatched element index
3759 
3760     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3761     addptr(result, stride);
3762     subptr(cnt2, stride);
3763     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3764 
3765     // compare wide vectors tail
3766     testptr(result, result);
3767     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3768 
3769     movl(cnt2, stride);
3770     movl(result, stride);
3771     negptr(result);
3772     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3773       movdqu(vec1, Address(str1, result, scale));
3774       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3775     } else {
3776       pmovzxbw(vec1, Address(str1, result, scale1));
3777       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3778     }
3779     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3780 
3781     // Mismatched characters in the vectors
3782     bind(VECTOR_NOT_EQUAL);
3783     addptr(cnt1, result);
3784     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3785     subl(result, cnt2);
3786     jmpb(POP_LABEL);
3787 
3788     bind(COMPARE_TAIL); // limit is zero
3789     movl(cnt2, result);
3790     // Fallthru to tail compare
3791   }
3792   // Shift str2 and str1 to the end of the arrays, negate min
3793   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3794     lea(str1, Address(str1, cnt2, scale));
3795     lea(str2, Address(str2, cnt2, scale));
3796   } else {
3797     lea(str1, Address(str1, cnt2, scale1));
3798     lea(str2, Address(str2, cnt2, scale2));
3799   }
3800   decrementl(cnt2);  // first character was compared already
3801   negptr(cnt2);
3802 
3803   // Compare the rest of the elements
3804   bind(WHILE_HEAD_LABEL);
3805   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3806   subl(result, cnt1);
3807   jccb(Assembler::notZero, POP_LABEL);
3808   increment(cnt2);
3809   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3810 
3811   // Strings are equal up to min length.  Return the length difference.
3812   bind(LENGTH_DIFF_LABEL);
3813   pop(result);
3814   if (ae == StrIntrinsicNode::UU) {
3815     // Divide diff by 2 to get number of chars
3816     sarl(result, 1);
3817   }
3818   jmpb(DONE_LABEL);
3819 
3820 #ifdef _LP64
3821   if (VM_Version::supports_avx512vlbw()) {
3822 
3823     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3824 
3825     kmovql(cnt1, mask);
3826     notq(cnt1);
3827     bsfq(cnt2, cnt1);
3828     if (ae != StrIntrinsicNode::LL) {
3829       // Divide diff by 2 to get number of chars
3830       sarl(cnt2, 1);
3831     }
3832     addq(result, cnt2);
3833     if (ae == StrIntrinsicNode::LL) {
3834       load_unsigned_byte(cnt1, Address(str2, result));
3835       load_unsigned_byte(result, Address(str1, result));
3836     } else if (ae == StrIntrinsicNode::UU) {
3837       load_unsigned_short(cnt1, Address(str2, result, scale));
3838       load_unsigned_short(result, Address(str1, result, scale));
3839     } else {
3840       load_unsigned_short(cnt1, Address(str2, result, scale2));
3841       load_unsigned_byte(result, Address(str1, result, scale1));
3842     }
3843     subl(result, cnt1);
3844     jmpb(POP_LABEL);
3845   }//if (VM_Version::supports_avx512vlbw())
3846 #endif // _LP64
3847 
3848   // Discard the stored length difference
3849   bind(POP_LABEL);
3850   pop(cnt1);
3851 
3852   // That's it
3853   bind(DONE_LABEL);
3854   if(ae == StrIntrinsicNode::UL) {
3855     negl(result);
3856   }
3857 
3858 }
3859 
3860 // Search for Non-ASCII character (Negative byte value) in a byte array,
3861 // return the index of the first such character, otherwise the length
3862 // of the array segment searched.
3863 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3864 //   @IntrinsicCandidate
3865 //   public static int countPositives(byte[] ba, int off, int len) {
3866 //     for (int i = off; i < off + len; i++) {
3867 //       if (ba[i] < 0) {
3868 //         return i - off;
3869 //       }
3870 //     }
3871 //     return len;
3872 //   }
3873 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3874   Register result, Register tmp1,
3875   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3876   // rsi: byte array
3877   // rcx: len
3878   // rax: result
3879   ShortBranchVerifier sbv(this);
3880   assert_different_registers(ary1, len, result, tmp1);
3881   assert_different_registers(vec1, vec2);
3882   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3883 
3884   movl(result, len); // copy
3885   // len == 0
3886   testl(len, len);
3887   jcc(Assembler::zero, DONE);
3888 
3889   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3890     VM_Version::supports_avx512vlbw() &&
3891     VM_Version::supports_bmi2()) {
3892 
3893     Label test_64_loop, test_tail, BREAK_LOOP;
3894     movl(tmp1, len);
3895     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3896 
3897     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3898     andl(len,  0xffffffc0); // vector count (in chars)
3899     jccb(Assembler::zero, test_tail);
3900 
3901     lea(ary1, Address(ary1, len, Address::times_1));
3902     negptr(len);
3903 
3904     bind(test_64_loop);
3905     // Check whether our 64 elements of size byte contain negatives
3906     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3907     kortestql(mask1, mask1);
3908     jcc(Assembler::notZero, BREAK_LOOP);
3909 
3910     addptr(len, 64);
3911     jccb(Assembler::notZero, test_64_loop);
3912 
3913     bind(test_tail);
3914     // bail out when there is nothing to be done
3915     testl(tmp1, -1);
3916     jcc(Assembler::zero, DONE);
3917 
3918 
3919     // check the tail for absence of negatives
3920     // ~(~0 << len) applied up to two times (for the 32-bit scenario)
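         // e.g., tmp1 == 5: ~(~0 << 5) == 0b11111, a mask selecting the 5 tail bytes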
3921 #ifdef _LP64
3922     {
3923       Register tmp3_aliased = len;
3924       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3925       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3926       notq(tmp3_aliased);
3927       kmovql(mask2, tmp3_aliased);
3928     }
3929 #else
3930     Label k_init;
3931     jmp(k_init);
3932 
3933     // We cannot read 64 bits from a general-purpose register, thus we move the
3934     // data required to compose 64 1's to the instruction stream.
3935     // We emit a 64-byte-wide series of elements 0..63 which is later used
3936     // as a compare target with the tail count contained in the tmp1 register.
3937     // The result is a k register having tmp1 consecutive 1 bits set,
3938     // counting from the least significant bit.
3939     address tmp = pc();
3940     emit_int64(0x0706050403020100);
3941     emit_int64(0x0F0E0D0C0B0A0908);
3942     emit_int64(0x1716151413121110);
3943     emit_int64(0x1F1E1D1C1B1A1918);
3944     emit_int64(0x2726252423222120);
3945     emit_int64(0x2F2E2D2C2B2A2928);
3946     emit_int64(0x3736353433323130);
3947     emit_int64(0x3F3E3D3C3B3A3938);
3948 
3949     bind(k_init);
3950     lea(len, InternalAddress(tmp));
3951     // create mask to test for negative byte inside a vector
3952     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3953     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3954 
3955 #endif
3956     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3957     ktestq(mask1, mask2);
3958     jcc(Assembler::zero, DONE);
3959 
3960     // do a full check for negative bytes in the tail
3961     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
3962                      // ary1 already pointing to the right place
3963     jmpb(TAIL_START);
3964 
3965     bind(BREAK_LOOP);
3966     // At least one byte in the last 64-byte block was negative.
3967     // Set up to look at the last 64 bytes as if they were a tail
3968     lea(ary1, Address(ary1, len, Address::times_1));
3969     addptr(result, len);
3970     // Ignore the very last byte: if all others are positive,
3971     // it must be negative, so we can skip right to the 2+1 byte
3972     // end comparison at this point
3973     orl(result, 63);
3974     movl(len, 63);
3975     // Fallthru to tail compare
3976   } else {
3977 
3978     if (UseAVX >= 2 && UseSSE >= 2) {
3979       // With AVX2, use 32-byte vector compare
3980       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3981 
3982       // Compare 32-byte vectors
3983       testl(len, 0xffffffe0);   // vector count (in bytes)
3984       jccb(Assembler::zero, TAIL_START);
3985 
3986       andl(len, 0xffffffe0);
3987       lea(ary1, Address(ary1, len, Address::times_1));
3988       negptr(len);
3989 
3990       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in the vector
3991       movdl(vec2, tmp1);
3992       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3993 
3994       bind(COMPARE_WIDE_VECTORS);
3995       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3996       vptest(vec1, vec2);
3997       jccb(Assembler::notZero, BREAK_LOOP);
3998       addptr(len, 32);
3999       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4000 
4001       testl(result, 0x0000001f);   // any bytes remaining?
4002       jcc(Assembler::zero, DONE);
4003 
4004       // Quick test using the already prepared vector mask
4005       movl(len, result);
4006       andl(len, 0x0000001f);
4007       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4008       vptest(vec1, vec2);
4009       jcc(Assembler::zero, DONE);
4010       // There are zeros, jump to the tail to determine exactly where
4011       jmpb(TAIL_START);
4012 
4013       bind(BREAK_LOOP);
4014       // At least one byte in the last 32-byte vector is negative.
4015       // Set up to look at the last 32 bytes as if they were a tail
4016       lea(ary1, Address(ary1, len, Address::times_1));
4017       addptr(result, len);
4018       // Ignore the very last byte: if all others are positive,
4019       // it must be negative, so we can skip right to the 2+1 byte
4020       // end comparison at this point
4021       orl(result, 31);
4022       movl(len, 31);
4023       // Fallthru to tail compare
4024     } else if (UseSSE42Intrinsics) {
4025       // With SSE4.2, use double quad vector compare
4026       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4027 
4028       // Compare 16-byte vectors
4029       testl(len, 0xfffffff0);   // vector count (in bytes)
4030       jcc(Assembler::zero, TAIL_START);
4031 
4032       andl(len, 0xfffffff0);
4033       lea(ary1, Address(ary1, len, Address::times_1));
4034       negptr(len);
4035 
4036       movl(tmp1, 0x80808080);
4037       movdl(vec2, tmp1);
4038       pshufd(vec2, vec2, 0);
4039 
4040       bind(COMPARE_WIDE_VECTORS);
4041       movdqu(vec1, Address(ary1, len, Address::times_1));
4042       ptest(vec1, vec2);
4043       jccb(Assembler::notZero, BREAK_LOOP);
4044       addptr(len, 16);
4045       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4046 
4047       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4048       jcc(Assembler::zero, DONE);
4049 
4050       // Quick test using the already prepared vector mask
4051       movl(len, result);
4052       andl(len, 0x0000000f);   // tail count (in bytes)
4053       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4054       ptest(vec1, vec2);
4055       jcc(Assembler::zero, DONE);
4056       jmpb(TAIL_START);
4057 
4058       bind(BREAK_LOOP);
4059       // At least one byte in the last 16-byte vector is negative.
4060       // Set up and look at the last 16 bytes as if they were a tail
4061       lea(ary1, Address(ary1, len, Address::times_1));
4062       addptr(result, len);
4063       // Ignore the very last byte: if all others are positive,
4064       // it must be negative, so we can skip right to the 2+1 byte
4065       // end comparison at this point
4066       orl(result, 15);
4067       movl(len, 15);
4068       // Fallthru to tail compare
4069     }
4070   }
4071 
4072   bind(TAIL_START);
4073   // Compare 4-byte vectors
4074   andl(len, 0xfffffffc); // vector count (in bytes)
4075   jccb(Assembler::zero, COMPARE_CHAR);
4076 
4077   lea(ary1, Address(ary1, len, Address::times_1));
4078   negptr(len);
4079 
4080   bind(COMPARE_VECTORS);
4081   movl(tmp1, Address(ary1, len, Address::times_1));
4082   andl(tmp1, 0x80808080);
4083   jccb(Assembler::notZero, TAIL_ADJUST);
4084   addptr(len, 4);
4085   jccb(Assembler::notZero, COMPARE_VECTORS);
4086 
4087   // Compare trailing char (final 2-3 bytes), if any
4088   bind(COMPARE_CHAR);
4089 
4090   testl(result, 0x2);   // tail  char
4091   jccb(Assembler::zero, COMPARE_BYTE);
4092   load_unsigned_short(tmp1, Address(ary1, 0));
4093   andl(tmp1, 0x00008080);
4094   jccb(Assembler::notZero, CHAR_ADJUST);
4095   lea(ary1, Address(ary1, 2));
4096 
4097   bind(COMPARE_BYTE);
4098   testl(result, 0x1);   // tail  byte
4099   jccb(Assembler::zero, DONE);
4100   load_unsigned_byte(tmp1, Address(ary1, 0));
4101   testl(tmp1, 0x00000080);
4102   jccb(Assembler::zero, DONE);
4103   subptr(result, 1);
4104   jmpb(DONE);
4105 
4106   bind(TAIL_ADJUST);
4107   // There are negative bytes in the last 4-byte block.
4108   // Adjust result and check the next three bytes
4109   addptr(result, len);
4110   orl(result, 3);
4111   lea(ary1, Address(ary1, len, Address::times_1));
4112   jmpb(COMPARE_CHAR);
4113 
4114   bind(CHAR_ADJUST);
4115   // We are looking at a char + optional byte tail, and found that one
4116   // of the bytes in the char is negative. Adjust the result, check the
4117   // first byte and readjust if needed.
4118   andl(result, 0xfffffffc);
4119   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4120   jccb(Assembler::notZero, DONE);
4121   addptr(result, 1);
4122 
4123   // That's it
4124   bind(DONE);
4125   if (UseAVX >= 2 && UseSSE >= 2) {
4126     // clean upper bits of YMM registers
4127     vpxor(vec1, vec1);
4128     vpxor(vec2, vec2);
4129   }
4130 }
4131 
4132 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
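     // A scalar Java-level sketch of the array-equality case (simplified; names
     // are illustrative, and the expand_ary2 mode, where the bytes of ary2 appear
     // to be zero-extended to match a char-sized ary1, is not shown):
     //
     //   static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null || a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }
     //
     // When is_array_equ is false, the identity/null/length checks are skipped and
     // only 'limit' elements starting at ary1/ary2 are compared (substring case).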
4133 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4134                                       Register limit, Register result, Register chr,
4135                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4136                                       KRegister mask, bool expand_ary2) {
4137   // for expand_ary2, limit is the (smaller) size of the second array.
4138   ShortBranchVerifier sbv(this);
4139   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4140 
4141   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4142          "Expansion only implemented for AVX2");
4143 
4144   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4145   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4146 
4147   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4148   int scaleIncr = expand_ary2 ? 8 : 16;
4149 
4150   if (is_array_equ) {
4151     // Check the input args
4152     cmpoop(ary1, ary2);
4153     jcc(Assembler::equal, TRUE_LABEL);
4154 
4155     // Need additional checks for arrays_equals.
4156     testptr(ary1, ary1);
4157     jcc(Assembler::zero, FALSE_LABEL);
4158     testptr(ary2, ary2);
4159     jcc(Assembler::zero, FALSE_LABEL);
4160 
4161     // Check the lengths
4162     movl(limit, Address(ary1, length_offset));
4163     cmpl(limit, Address(ary2, length_offset));
4164     jcc(Assembler::notEqual, FALSE_LABEL);
4165   }
4166 
4167   // count == 0
4168   testl(limit, limit);
4169   jcc(Assembler::zero, TRUE_LABEL);
4170 
4171   if (is_array_equ) {
4172     // Load array address
4173     lea(ary1, Address(ary1, base_offset));
4174     lea(ary2, Address(ary2, base_offset));
4175   }
4176 
4177   if (is_array_equ && is_char) {
4178     // arrays_equals when used for char[].
4179     shll(limit, 1);      // byte count != 0
4180   }
4181   movl(result, limit); // copy
4182 
4183   if (UseAVX >= 2) {
4184     // With AVX2, use 32-byte vector compare
4185     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4186 
4187     // Compare 32-byte vectors
4188     if (expand_ary2) {
4189       andl(result, 0x0000000f);  //   tail count (in bytes)
4190       andl(limit, 0xfffffff0);   // vector count (in bytes)
4191       jcc(Assembler::zero, COMPARE_TAIL);
4192     } else {
4193       andl(result, 0x0000001f);  //   tail count (in bytes)
4194       andl(limit, 0xffffffe0);   // vector count (in bytes)
4195       jcc(Assembler::zero, COMPARE_TAIL_16);
4196     }
4197 
4198     lea(ary1, Address(ary1, limit, scaleFactor));
4199     lea(ary2, Address(ary2, limit, Address::times_1));
4200     negptr(limit);
4201 
4202 #ifdef _LP64
4203     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4204       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4205 
4206       cmpl(limit, -64);
4207       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4208 
4209       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4210 
4211       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4212       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4213       kortestql(mask, mask);
4214       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4215       addptr(limit, 64);  // update since we already compared at this addr
4216       cmpl(limit, -64);
4217       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4218 
4219       // At this point we may still need to compare -limit+result bytes.
4220       // We could execute the next two instructions and just continue via the non-wide path:
4221       //  cmpl(limit, 0);
4222       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4223       // But since we stopped at the points ary{1,2}+limit which are
4224       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4225       // (|limit| <= 32 and result < 32),
4226       // we may just compare the last 64 bytes.
4227       //
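           // Worked example (illustration, assuming two 100-byte arrays on this path): the
           // 32-byte masking above gives a vector count of 96 and a tail of 4, so limit
           // starts at -96. The loop above compares bytes [0, 64) and exits with limit == -32,
           // leaving 32 + 4 = 36 bytes unchecked; re-comparing the final 64 bytes, i.e. [36, 100),
           // covers them while harmlessly re-checking 28 bytes that already matched.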
4228       addptr(result, -64);   // it is safe, because we just came from this area
4229       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4230       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4231       kortestql(mask, mask);
4232       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4233 
4234       jmp(TRUE_LABEL);
4235 
4236       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4237 
4238     }//if (VM_Version::supports_avx512vlbw())
4239 #endif //_LP64
4240     bind(COMPARE_WIDE_VECTORS);
4241     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4242     if (expand_ary2) {
4243       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4244     } else {
4245       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4246     }
4247     vpxor(vec1, vec2);
4248 
4249     vptest(vec1, vec1);
4250     jcc(Assembler::notZero, FALSE_LABEL);
4251     addptr(limit, scaleIncr * 2);
4252     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4253 
4254     testl(result, result);
4255     jcc(Assembler::zero, TRUE_LABEL);
4256 
4257     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4258     if (expand_ary2) {
4259       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4260     } else {
4261       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4262     }
4263     vpxor(vec1, vec2);
4264 
4265     vptest(vec1, vec1);
4266     jcc(Assembler::notZero, FALSE_LABEL);
4267     jmp(TRUE_LABEL);
4268 
4269     bind(COMPARE_TAIL_16); // limit is zero
4270     movl(limit, result);
4271 
4272     // Compare 16-byte chunks
4273     andl(result, 0x0000000f);  //   tail count (in bytes)
4274     andl(limit, 0xfffffff0);   // vector count (in bytes)
4275     jcc(Assembler::zero, COMPARE_TAIL);
4276 
4277     lea(ary1, Address(ary1, limit, scaleFactor));
4278     lea(ary2, Address(ary2, limit, Address::times_1));
4279     negptr(limit);
4280 
4281     bind(COMPARE_WIDE_VECTORS_16);
4282     movdqu(vec1, Address(ary1, limit, scaleFactor));
4283     if (expand_ary2) {
4284       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4285     } else {
4286       movdqu(vec2, Address(ary2, limit, Address::times_1));
4287     }
4288     pxor(vec1, vec2);
4289 
4290     ptest(vec1, vec1);
4291     jcc(Assembler::notZero, FALSE_LABEL);
4292     addptr(limit, scaleIncr);
4293     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4294 
4295     bind(COMPARE_TAIL); // limit is zero
4296     movl(limit, result);
4297     // Fallthru to tail compare
4298   } else if (UseSSE42Intrinsics) {
4299     // With SSE4.2, use double quad vector compare
4300     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4301 
4302     // Compare 16-byte vectors
4303     andl(result, 0x0000000f);  //   tail count (in bytes)
4304     andl(limit, 0xfffffff0);   // vector count (in bytes)
4305     jcc(Assembler::zero, COMPARE_TAIL);
4306 
4307     lea(ary1, Address(ary1, limit, Address::times_1));
4308     lea(ary2, Address(ary2, limit, Address::times_1));
4309     negptr(limit);
4310 
4311     bind(COMPARE_WIDE_VECTORS);
4312     movdqu(vec1, Address(ary1, limit, Address::times_1));
4313     movdqu(vec2, Address(ary2, limit, Address::times_1));
4314     pxor(vec1, vec2);
4315 
4316     ptest(vec1, vec1);
4317     jcc(Assembler::notZero, FALSE_LABEL);
4318     addptr(limit, 16);
4319     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4320 
4321     testl(result, result);
4322     jcc(Assembler::zero, TRUE_LABEL);
4323 
4324     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4325     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4326     pxor(vec1, vec2);
4327 
4328     ptest(vec1, vec1);
4329     jccb(Assembler::notZero, FALSE_LABEL);
4330     jmpb(TRUE_LABEL);
4331 
4332     bind(COMPARE_TAIL); // limit is zero
4333     movl(limit, result);
4334     // Fallthru to tail compare
4335   }
4336 
4337   // Compare 4-byte vectors
4338   if (expand_ary2) {
4339     testl(result, result);
4340     jccb(Assembler::zero, TRUE_LABEL);
4341   } else {
4342     andl(limit, 0xfffffffc); // vector count (in bytes)
4343     jccb(Assembler::zero, COMPARE_CHAR);
4344   }
4345 
4346   lea(ary1, Address(ary1, limit, scaleFactor));
4347   lea(ary2, Address(ary2, limit, Address::times_1));
4348   negptr(limit);
4349 
4350   bind(COMPARE_VECTORS);
4351   if (expand_ary2) {
4352     // There are no "vector" operations for bytes to shorts
4353     movzbl(chr, Address(ary2, limit, Address::times_1));
4354     cmpw(Address(ary1, limit, Address::times_2), chr);
4355     jccb(Assembler::notEqual, FALSE_LABEL);
4356     addptr(limit, 1);
4357     jcc(Assembler::notZero, COMPARE_VECTORS);
4358     jmp(TRUE_LABEL);
4359   } else {
4360     movl(chr, Address(ary1, limit, Address::times_1));
4361     cmpl(chr, Address(ary2, limit, Address::times_1));
4362     jccb(Assembler::notEqual, FALSE_LABEL);
4363     addptr(limit, 4);
4364     jcc(Assembler::notZero, COMPARE_VECTORS);
4365   }
4366 
4367   // Compare trailing char (final 2 bytes), if any
4368   bind(COMPARE_CHAR);
4369   testl(result, 0x2);   // tail  char
4370   jccb(Assembler::zero, COMPARE_BYTE);
4371   load_unsigned_short(chr, Address(ary1, 0));
4372   load_unsigned_short(limit, Address(ary2, 0));
4373   cmpl(chr, limit);
4374   jccb(Assembler::notEqual, FALSE_LABEL);
4375 
4376   if (is_array_equ && is_char) {
4377     bind(COMPARE_BYTE);
4378   } else {
4379     lea(ary1, Address(ary1, 2));
4380     lea(ary2, Address(ary2, 2));
4381 
4382     bind(COMPARE_BYTE);
4383     testl(result, 0x1);   // tail  byte
4384     jccb(Assembler::zero, TRUE_LABEL);
4385     load_unsigned_byte(chr, Address(ary1, 0));
4386     load_unsigned_byte(limit, Address(ary2, 0));
4387     cmpl(chr, limit);
4388     jccb(Assembler::notEqual, FALSE_LABEL);
4389   }
4390   bind(TRUE_LABEL);
4391   movl(result, 1);   // return true
4392   jmpb(DONE);
4393 
4394   bind(FALSE_LABEL);
4395   xorl(result, result); // return false
4396 
4397   // That's it
4398   bind(DONE);
4399   if (UseAVX >= 2) {
4400     // clean upper bits of YMM registers
4401     vpxor(vec1, vec1);
4402     vpxor(vec2, vec2);
4403   }
4404 }
4405 
4406 #ifdef _LP64
4407 
4408 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4409 #define __ masm.
4410   Register dst = stub.data<0>();
4411   XMMRegister src = stub.data<1>();
4412   address target = stub.data<2>();
4413   __ bind(stub.entry());
4414   __ subptr(rsp, 8);
4415   __ movdbl(Address(rsp), src);
4416   __ call(RuntimeAddress(target));
4417   __ pop(dst);
4418   __ jmp(stub.continuation());
4419 #undef __
4420 }
4421 
4422 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4423   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4424   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4425 
4426   address slowpath_target;
4427   if (dst_bt == T_INT) {
4428     if (src_bt == T_FLOAT) {
4429       cvttss2sil(dst, src);
4430       cmpl(dst, 0x80000000);
4431       slowpath_target = StubRoutines::x86::f2i_fixup();
4432     } else {
4433       cvttsd2sil(dst, src);
4434       cmpl(dst, 0x80000000);
4435       slowpath_target = StubRoutines::x86::d2i_fixup();
4436     }
4437   } else {
4438     if (src_bt == T_FLOAT) {
4439       cvttss2siq(dst, src);
4440       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4441       slowpath_target = StubRoutines::x86::f2l_fixup();
4442     } else {
4443       cvttsd2siq(dst, src);
4444       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4445       slowpath_target = StubRoutines::x86::d2l_fixup();
4446     }
4447   }
4448 
4449   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4450   jcc(Assembler::equal, stub->entry());
4451   bind(stub->continuation());
4452 }
4453 
4454 #endif // _LP64
4455 
4456 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4457                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4458   switch(ideal_opc) {
4459     case Op_LShiftVS:
4460       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4461     case Op_LShiftVI:
4462       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4463     case Op_LShiftVL:
4464       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4465     case Op_RShiftVS:
4466       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4467     case Op_RShiftVI:
4468       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4469     case Op_RShiftVL:
4470       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4471     case Op_URShiftVS:
4472       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4473     case Op_URShiftVI:
4474       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4475     case Op_URShiftVL:
4476       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4477     case Op_RotateRightV:
4478       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4479     case Op_RotateLeftV:
4480       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4481     default:
4482       fatal("Unsupported masked operation"); break;
4483   }
4484 }
4485 
4486 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4487                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4488                                     bool is_varshift) {
4489   switch (ideal_opc) {
4490     case Op_AddVB:
4491       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4492     case Op_AddVS:
4493       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4494     case Op_AddVI:
4495       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4496     case Op_AddVL:
4497       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4498     case Op_AddVF:
4499       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4500     case Op_AddVD:
4501       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4502     case Op_SubVB:
4503       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4504     case Op_SubVS:
4505       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4506     case Op_SubVI:
4507       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4508     case Op_SubVL:
4509       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4510     case Op_SubVF:
4511       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4512     case Op_SubVD:
4513       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4514     case Op_MulVS:
4515       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4516     case Op_MulVI:
4517       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4518     case Op_MulVL:
4519       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4520     case Op_MulVF:
4521       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4522     case Op_MulVD:
4523       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4524     case Op_DivVF:
4525       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4526     case Op_DivVD:
4527       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4528     case Op_SqrtVF:
4529       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4530     case Op_SqrtVD:
4531       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4532     case Op_AbsVB:
4533       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4534     case Op_AbsVS:
4535       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4536     case Op_AbsVI:
4537       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4538     case Op_AbsVL:
4539       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4540     case Op_FmaVF:
4541       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4542     case Op_FmaVD:
4543       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4544     case Op_VectorRearrange:
4545       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4546     case Op_LShiftVS:
4547       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4548     case Op_LShiftVI:
4549       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4550     case Op_LShiftVL:
4551       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4552     case Op_RShiftVS:
4553       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4554     case Op_RShiftVI:
4555       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4556     case Op_RShiftVL:
4557       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4558     case Op_URShiftVS:
4559       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4560     case Op_URShiftVI:
4561       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4562     case Op_URShiftVL:
4563       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4564     case Op_RotateLeftV:
4565       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4566     case Op_RotateRightV:
4567       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4568     case Op_MaxV:
4569       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4570     case Op_MinV:
4571       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4572     case Op_XorV:
4573       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4574     case Op_OrV:
4575       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4576     case Op_AndV:
4577       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4578     default:
4579       fatal("Unsupported masked operation"); break;
4580   }
4581 }
4582 
4583 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4584                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4585   switch (ideal_opc) {
4586     case Op_AddVB:
4587       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4588     case Op_AddVS:
4589       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4590     case Op_AddVI:
4591       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4592     case Op_AddVL:
4593       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4594     case Op_AddVF:
4595       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4596     case Op_AddVD:
4597       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4598     case Op_SubVB:
4599       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4600     case Op_SubVS:
4601       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4602     case Op_SubVI:
4603       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4604     case Op_SubVL:
4605       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4606     case Op_SubVF:
4607       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4608     case Op_SubVD:
4609       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4610     case Op_MulVS:
4611       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4612     case Op_MulVI:
4613       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4614     case Op_MulVL:
4615       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4616     case Op_MulVF:
4617       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4618     case Op_MulVD:
4619       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4620     case Op_DivVF:
4621       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4622     case Op_DivVD:
4623       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4624     case Op_FmaVF:
4625       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4626     case Op_FmaVD:
4627       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4628     case Op_MaxV:
4629       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4630     case Op_MinV:
4631       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4632     case Op_XorV:
4633       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4634     case Op_OrV:
4635       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4636     case Op_AndV:
4637       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4638     default:
4639       fatal("Unsupported masked operation"); break;
4640   }
4641 }
4642 
4643 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4644                                   KRegister src1, KRegister src2) {
4645   BasicType etype = T_ILLEGAL;
4646   switch(mask_len) {
4647     case 2:
4648     case 4:
4649     case 8:  etype = T_BYTE; break;
4650     case 16: etype = T_SHORT; break;
4651     case 32: etype = T_INT; break;
4652     case 64: etype = T_LONG; break;
4653     default: fatal("Unsupported type"); break;
4654   }
4655   assert(etype != T_ILLEGAL, "");
4656   switch(ideal_opc) {
4657     case Op_AndVMask:
4658       kand(etype, dst, src1, src2); break;
4659     case Op_OrVMask:
4660       kor(etype, dst, src1, src2); break;
4661     case Op_XorVMask:
4662       kxor(etype, dst, src1, src2); break;
4663     default:
4664       fatal("Unsupported masked operation"); break;
4665   }
4666 }
4667 
4668 /*
4669  * The following routine handles special floating-point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4670  * If src is NaN, the result is 0.
4671  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4672  * the result is equal to the value of Integer.MIN_VALUE.
4673  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4674  * the result is equal to the value of Integer.MAX_VALUE.
4675  */
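     /*
      * A minimal scalar sketch of the semantics described above (illustration only,
      * not emitted code; the helper name is hypothetical):
      *
      *   static int cast_float_to_int(float src) {
      *     if (src != src)              return 0;          // NaN
      *     if (src <= (float)min_jint)  return min_jint;   // -Inf or too small
      *     if (src >= (float)max_jint)  return max_jint;   // +Inf or too large
      *     return (int)src;                                // ordinary truncation
      *   }
      *
      * The vector routines below apply the same fix-up to every lane whose raw
      * truncating-cast result is 0x80000000.
      */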
4676 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4677                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4678                                                                    Register rscratch, AddressLiteral float_sign_flip,
4679                                                                    int vec_enc) {
4680   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4681   Label done;
4682   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4683   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4684   vptest(xtmp2, xtmp2, vec_enc);
4685   jccb(Assembler::equal, done);
4686 
4687   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4688   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4689 
4690   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4691   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4692   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4693 
4694   // Recompute the mask for remaining special value.
4695   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4696   // Extract SRC values corresponding to TRUE mask lanes.
4697   vpand(xtmp4, xtmp2, src, vec_enc);
4698   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4699   // values are set.
4700   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4701 
4702   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4703   bind(done);
4704 }
4705 
4706 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4707                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4708                                                                     Register rscratch, AddressLiteral float_sign_flip,
4709                                                                     int vec_enc) {
4710   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4711   Label done;
4712   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4713   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4714   kortestwl(ktmp1, ktmp1);
4715   jccb(Assembler::equal, done);
4716 
4717   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4718   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4719   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4720 
4721   kxorwl(ktmp1, ktmp1, ktmp2);
4722   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4723   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4724   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4725   bind(done);
4726 }
4727 
4728 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4729                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4730                                                                      Register rscratch, AddressLiteral double_sign_flip,
4731                                                                      int vec_enc) {
4732   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4733 
4734   Label done;
4735   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4736   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4737   kortestwl(ktmp1, ktmp1);
4738   jccb(Assembler::equal, done);
4739 
4740   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4741   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4742   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4743 
4744   kxorwl(ktmp1, ktmp1, ktmp2);
4745   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4746   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4747   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4748   bind(done);
4749 }
4750 
4751 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4752                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4753                                                                      Register rscratch, AddressLiteral float_sign_flip,
4754                                                                      int vec_enc) {
4755   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4756   Label done;
4757   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4758   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4759   kortestwl(ktmp1, ktmp1);
4760   jccb(Assembler::equal, done);
4761 
4762   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4763   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4764   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4765 
4766   kxorwl(ktmp1, ktmp1, ktmp2);
4767   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4768   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4769   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4770   bind(done);
4771 }
4772 
4773 /*
4774  * The following routine handles special floating-point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4775  * If src is NaN, the result is 0.
4776  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4777  * the result is equal to the value of Long.MIN_VALUE.
4778  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4779  * the result is equal to the value of Long.MAX_VALUE.
4780  */
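     /*
      * For illustration, representative inputs and results under the rule above:
      *   (long) Double.NaN               ->  0L
      *   (long) Double.NEGATIVE_INFINITY ->  Long.MIN_VALUE (0x8000000000000000L)
      *   (long) 1.0e30                   ->  Long.MAX_VALUE (0x7fffffffffffffffL)
      *   (long) 42.75                    ->  42L  (ordinary truncation, no fix-up)
      */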
4781 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4782                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4783                                                                       Register rscratch, AddressLiteral double_sign_flip,
4784                                                                       int vec_enc) {
4785   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4786 
4787   Label done;
4788   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4789   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4790   kortestwl(ktmp1, ktmp1);
4791   jccb(Assembler::equal, done);
4792 
4793   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4794   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4795   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4796 
4797   kxorwl(ktmp1, ktmp1, ktmp2);
4798   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4799   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4800   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4801   bind(done);
4802 }
4803 
4804 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4805                                                              XMMRegister xtmp, int index, int vec_enc) {
4806    assert(vec_enc < Assembler::AVX_512bit, "");
4807    if (vec_enc == Assembler::AVX_256bit) {
4808      vextractf128_high(xtmp, src);
4809      vshufps(dst, src, xtmp, index, vec_enc);
4810    } else {
4811      vshufps(dst, src, zero, index, vec_enc);
4812    }
4813 }
4814 
4815 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4816                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4817                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4818   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4819 
4820   Label done;
4821   // Compare the destination lanes with float_sign_flip
4822   // value to get mask for all special values.
4823   movdqu(xtmp1, float_sign_flip, rscratch);
4824   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4825   ptest(xtmp2, xtmp2);
4826   jccb(Assembler::equal, done);
4827 
4828   // Flip float_sign_flip to get max integer value.
4829   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4830   pxor(xtmp1, xtmp4);
4831 
4832   // Set destination lanes corresponding to unordered source lanes to zero.
4833   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4834   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4835 
4836   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4837   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4838   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4839 
4840   // Recompute the mask for remaining special value.
4841   pxor(xtmp2, xtmp3);
4842   // Extract mask corresponding to non-negative source lanes.
4843   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4844 
4845   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4846   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4847   pand(xtmp3, xtmp2);
4848 
4849   // Replace destination lanes holding the special value (0x80000000) with max int
4850   // if the corresponding source lane holds a +ve value.
4851   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4852   bind(done);
4853 }
4854 
4855 
4856 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4857                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4858   switch(to_elem_bt) {
4859     case T_SHORT:
4860       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4861       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4862       vpackusdw(dst, dst, zero, vec_enc);
4863       if (vec_enc == Assembler::AVX_256bit) {
4864         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4865       }
4866       break;
4867     case  T_BYTE:
4868       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4869       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4870       vpackusdw(dst, dst, zero, vec_enc);
4871       if (vec_enc == Assembler::AVX_256bit) {
4872         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4873       }
4874       vpackuswb(dst, dst, zero, vec_enc);
4875       break;
4876     default: assert(false, "%s", type2name(to_elem_bt));
4877   }
4878 }
4879 
4880 /*
4881  * Algorithm for vector D2L and F2I conversions:
4882  * a) Perform vector D2L/F2I cast.
4883  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value,
4884  *    since 0x80000000 signifies that the source value could be any of the special
4885  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4886  * c) Set the destination to zero if the source is a NaN value.
4887  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
4888  */
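     /*
      * A per-lane scalar sketch of steps b)-d) (illustration only, not emitted code;
      * variable names are for exposition):
      *
      *   int lane = raw_cvtt_result;            // step a) truncating vector cast
      *   if (lane == (int)0x80000000) {         // step b) fix up only special lanes
      *     if (src != src)       lane = 0;              // step c) NaN -> 0
      *     else if (src > 0.0f)  lane = 0x7fffffff;     // step d) +ve special -> MaxInt
      *     // otherwise keep 0x80000000 (MinInt) for -Inf and too-small sources
      *   }
      */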
4889 
4890 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4891                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4892                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4893   int to_elem_sz = type2aelembytes(to_elem_bt);
4894   assert(to_elem_sz <= 4, "");
4895   vcvttps2dq(dst, src, vec_enc);
4896   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4897   if (to_elem_sz < 4) {
4898     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4899     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4900   }
4901 }
4902 
4903 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4904                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4905                                             Register rscratch, int vec_enc) {
4906   int to_elem_sz = type2aelembytes(to_elem_bt);
4907   assert(to_elem_sz <= 4, "");
4908   vcvttps2dq(dst, src, vec_enc);
4909   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4910   switch(to_elem_bt) {
4911     case T_INT:
4912       break;
4913     case T_SHORT:
4914       evpmovdw(dst, dst, vec_enc);
4915       break;
4916     case T_BYTE:
4917       evpmovdb(dst, dst, vec_enc);
4918       break;
4919     default: assert(false, "%s", type2name(to_elem_bt));
4920   }
4921 }
4922 
4923 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4924                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4925                                             Register rscratch, int vec_enc) {
4926   evcvttps2qq(dst, src, vec_enc);
4927   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4928 }
4929 
4930 // Handling for downcasting from double to integer or sub-word types on AVX2.
4931 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4932                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4933                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4934   int to_elem_sz = type2aelembytes(to_elem_bt);
4935   assert(to_elem_sz < 8, "");
4936   vcvttpd2dq(dst, src, vec_enc);
4937   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4938                                               float_sign_flip, vec_enc);
4939   if (to_elem_sz < 4) {
4940     // xtmp4 holds all zero lanes.
4941     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4942   }
4943 }
4944 
4945 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4946                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4947                                             KRegister ktmp2, AddressLiteral sign_flip,
4948                                             Register rscratch, int vec_enc) {
4949   if (VM_Version::supports_avx512dq()) {
4950     evcvttpd2qq(dst, src, vec_enc);
4951     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4952     switch(to_elem_bt) {
4953       case T_LONG:
4954         break;
4955       case T_INT:
4956         evpmovsqd(dst, dst, vec_enc);
4957         break;
4958       case T_SHORT:
4959         evpmovsqd(dst, dst, vec_enc);
4960         evpmovdw(dst, dst, vec_enc);
4961         break;
4962       case T_BYTE:
4963         evpmovsqd(dst, dst, vec_enc);
4964         evpmovdb(dst, dst, vec_enc);
4965         break;
4966       default: assert(false, "%s", type2name(to_elem_bt));
4967     }
4968   } else {
4969     assert(type2aelembytes(to_elem_bt) <= 4, "");
4970     vcvttpd2dq(dst, src, vec_enc);
4971     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4972     switch(to_elem_bt) {
4973       case T_INT:
4974         break;
4975       case T_SHORT:
4976         evpmovdw(dst, dst, vec_enc);
4977         break;
4978       case T_BYTE:
4979         evpmovdb(dst, dst, vec_enc);
4980         break;
4981       default: assert(false, "%s", type2name(to_elem_bt));
4982     }
4983   }
4984 }
4985 
4986 #ifdef _LP64
4987 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4988                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4989                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4990   // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
4991   // and restore the original MXCSR.RC mode after that.
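       // For illustration: round(2.5) = floor(3.0) = 3 and round(-2.5) = floor(-2.0) = -2,
       // matching the floor(x + 0.5) definition of Math.round.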
4992   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4993 
4994   mov64(tmp, julong_cast(0.5L));
4995   evpbroadcastq(xtmp1, tmp, vec_enc);
4996   vaddpd(xtmp1, src , xtmp1, vec_enc);
4997   evcvtpd2qq(dst, xtmp1, vec_enc);
4998   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4999                                                 double_sign_flip, vec_enc);
5000 
5001   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5002 }
5003 
5004 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5005                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5006                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5007   // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
5008   // and restore the original MXCSR.RC mode after that.
5009   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5010 
5011   movl(tmp, jint_cast(0.5));
5012   movq(xtmp1, tmp);
5013   vbroadcastss(xtmp1, xtmp1, vec_enc);
5014   vaddps(xtmp1, src , xtmp1, vec_enc);
5015   vcvtps2dq(dst, xtmp1, vec_enc);
5016   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5017                                               float_sign_flip, vec_enc);
5018 
5019   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5020 }
5021 
5022 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5023                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5024                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5025   // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
5026   // and restore the original MXCSR.RC mode after that.
5027   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5028 
5029   movl(tmp, jint_cast(0.5));
5030   movq(xtmp1, tmp);
5031   vbroadcastss(xtmp1, xtmp1, vec_enc);
5032   vaddps(xtmp1, src , xtmp1, vec_enc);
5033   vcvtps2dq(dst, xtmp1, vec_enc);
5034   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5035 
5036   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5037 }
5038 #endif // _LP64
5039 
5040 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5041                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5042   switch (from_elem_bt) {
5043     case T_BYTE:
5044       switch (to_elem_bt) {
5045         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5046         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5047         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5048         default: ShouldNotReachHere();
5049       }
5050       break;
5051     case T_SHORT:
5052       switch (to_elem_bt) {
5053         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5054         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5055         default: ShouldNotReachHere();
5056       }
5057       break;
5058     case T_INT:
5059       assert(to_elem_bt == T_LONG, "");
5060       vpmovzxdq(dst, src, vlen_enc);
5061       break;
5062     default:
5063       ShouldNotReachHere();
5064   }
5065 }
5066 
5067 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5068                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5069   switch (from_elem_bt) {
5070     case T_BYTE:
5071       switch (to_elem_bt) {
5072         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5073         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5074         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5075         default: ShouldNotReachHere();
5076       }
5077       break;
5078     case T_SHORT:
5079       switch (to_elem_bt) {
5080         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5081         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5082         default: ShouldNotReachHere();
5083       }
5084       break;
5085     case T_INT:
5086       assert(to_elem_bt == T_LONG, "");
5087       vpmovsxdq(dst, src, vlen_enc);
5088       break;
5089     default:
5090       ShouldNotReachHere();
5091   }
5092 }
5093 
5094 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5095                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5096   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5097   assert(vlen_enc != AVX_512bit, "");
5098 
5099   int dst_bt_size = type2aelembytes(dst_bt);
5100   int src_bt_size = type2aelembytes(src_bt);
5101   if (dst_bt_size > src_bt_size) {
5102     switch (dst_bt_size / src_bt_size) {
5103       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5104       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5105       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5106       default: ShouldNotReachHere();
5107     }
5108   } else {
5109     assert(dst_bt_size < src_bt_size, "");
5110     switch (src_bt_size / dst_bt_size) {
5111       case 2: {
5112         if (vlen_enc == AVX_128bit) {
5113           vpacksswb(dst, src, src, vlen_enc);
5114         } else {
5115           vpacksswb(dst, src, src, vlen_enc);
5116           vpermq(dst, dst, 0x08, vlen_enc);
5117         }
5118         break;
5119       }
5120       case 4: {
5121         if (vlen_enc == AVX_128bit) {
5122           vpackssdw(dst, src, src, vlen_enc);
5123           vpacksswb(dst, dst, dst, vlen_enc);
5124         } else {
5125           vpackssdw(dst, src, src, vlen_enc);
5126           vpermq(dst, dst, 0x08, vlen_enc);
5127           vpacksswb(dst, dst, dst, AVX_128bit);
5128         }
5129         break;
5130       }
5131       case 8: {
5132         if (vlen_enc == AVX_128bit) {
5133           vpshufd(dst, src, 0x08, vlen_enc);
5134           vpackssdw(dst, dst, dst, vlen_enc);
5135           vpacksswb(dst, dst, dst, vlen_enc);
5136         } else {
5137           vpshufd(dst, src, 0x08, vlen_enc);
5138           vpermq(dst, dst, 0x08, vlen_enc);
5139           vpackssdw(dst, dst, dst, AVX_128bit);
5140           vpacksswb(dst, dst, dst, AVX_128bit);
5141         }
5142         break;
5143       }
5144       default: ShouldNotReachHere();
5145     }
5146   }
5147 }
5148 
5149 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5150                                    bool merge, BasicType bt, int vlen_enc) {
5151   if (bt == T_INT) {
5152     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5153   } else {
5154     assert(bt == T_LONG, "");
5155     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5156   }
5157 }
5158 
5159 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5160                                    bool merge, BasicType bt, int vlen_enc) {
5161   if (bt == T_INT) {
5162     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5163   } else {
5164     assert(bt == T_LONG, "");
5165     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5166   }
5167 }
5168 
5169 #ifdef _LP64
5170 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5171                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5172                                                int vec_enc) {
5173   int index = 0;
5174   int vindex = 0;
5175   mov64(rtmp1, 0x0101010101010101L);
5176   pdepq(rtmp1, src, rtmp1);
5177   if (mask_len > 8) {
5178     movq(rtmp2, src);
5179     vpxor(xtmp, xtmp, xtmp, vec_enc);
5180     movq(xtmp, rtmp1);
5181   }
5182   movq(dst, rtmp1);
5183 
5184   mask_len -= 8;
5185   while (mask_len > 0) {
5186     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5187     index++;
5188     if ((index % 2) == 0) {
5189       pxor(xtmp, xtmp);
5190     }
5191     mov64(rtmp1, 0x0101010101010101L);
5192     shrq(rtmp2, 8);
5193     pdepq(rtmp1, rtmp2, rtmp1);
5194     pinsrq(xtmp, rtmp1, index % 2);
5195     vindex = index / 2;
5196     if (vindex) {
5197       // Write the entire 16-byte vector only when both 64-bit
5198       // lanes have been updated, to save redundant instructions.
5199       if (index % 2) {
5200         vinsertf128(dst, dst, xtmp, vindex);
5201       }
5202     } else {
5203       vmovdqu(dst, xtmp);
5204     }
5205     mask_len -= 8;
5206   }
5207 }
5208 
5209 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5210   switch(opc) {
5211     case Op_VectorMaskTrueCount:
5212       popcntq(dst, tmp);
5213       break;
5214     case Op_VectorMaskLastTrue:
5215       if (VM_Version::supports_lzcnt()) {
5216         lzcntq(tmp, tmp);
5217         movl(dst, 63);
5218         subl(dst, tmp);
5219       } else {
5220         movl(dst, -1);
5221         bsrq(tmp, tmp);
5222         cmov32(Assembler::notZero, dst, tmp);
5223       }
5224       break;
5225     case Op_VectorMaskFirstTrue:
5226       if (VM_Version::supports_bmi1()) {
5227         if (masklen < 32) {
5228           orl(tmp, 1 << masklen);
5229           tzcntl(dst, tmp);
5230         } else if (masklen == 32) {
5231           tzcntl(dst, tmp);
5232         } else {
5233           assert(masklen == 64, "");
5234           tzcntq(dst, tmp);
5235         }
5236       } else {
5237         if (masklen < 32) {
5238           orl(tmp, 1 << masklen);
5239           bsfl(dst, tmp);
5240         } else {
5241           assert(masklen == 32 || masklen == 64, "");
5242           movl(dst, masklen);
5243           if (masklen == 32)  {
5244             bsfl(tmp, tmp);
5245           } else {
5246             bsfq(tmp, tmp);
5247           }
5248           cmov32(Assembler::notZero, dst, tmp);
5249         }
5250       }
5251       break;
5252     case Op_VectorMaskToLong:
5253       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5254       break;
5255     default: assert(false, "Unhandled mask operation");
5256   }
5257 }
5258 
5259 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5260                                               int masklen, int masksize, int vec_enc) {
5261   assert(VM_Version::supports_popcnt(), "");
5262 
5263   if (VM_Version::supports_avx512bw()) {
5264     kmovql(tmp, mask);
5265   } else {
5266     assert(masklen <= 16, "");
5267     kmovwl(tmp, mask);
5268   }
5269 
5270   // A mask generated by partial vector comparison/replicate/mask manipulation
5271   // operations needs to be clipped.
5272   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5273     andq(tmp, (1 << masklen) - 1);
5274   }
5275 
5276   vector_mask_operation_helper(opc, dst, tmp, masklen);
5277 }
5278 
5279 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5280                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5281   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5282          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5283   assert(VM_Version::supports_popcnt(), "");
5284 
5285   bool need_clip = false;
5286   switch(bt) {
5287     case T_BOOLEAN:
5288       // While masks of other types contain lane values of 0 or -1, boolean masks contain lane values of 0 or 1
5289       vpxor(xtmp, xtmp, xtmp, vec_enc);
5290       vpsubb(xtmp, xtmp, mask, vec_enc);
5291       vpmovmskb(tmp, xtmp, vec_enc);
5292       need_clip = masklen < 16;
5293       break;
5294     case T_BYTE:
5295       vpmovmskb(tmp, mask, vec_enc);
5296       need_clip = masklen < 16;
5297       break;
5298     case T_SHORT:
5299       vpacksswb(xtmp, mask, mask, vec_enc);
5300       if (masklen >= 16) {
5301         vpermpd(xtmp, xtmp, 8, vec_enc);
5302       }
5303       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5304       need_clip = masklen < 16;
5305       break;
5306     case T_INT:
5307     case T_FLOAT:
5308       vmovmskps(tmp, mask, vec_enc);
5309       need_clip = masklen < 4;
5310       break;
5311     case T_LONG:
5312     case T_DOUBLE:
5313       vmovmskpd(tmp, mask, vec_enc);
5314       need_clip = masklen < 2;
5315       break;
5316     default: assert(false, "Unhandled type, %s", type2name(bt));
5317   }
5318 
5319   // A mask generated by partial vector comparison/replicate/mask manipulation
5320   // operations needs to be clipped.
5321   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5322     // need_clip implies masklen < 32
5323     andq(tmp, (1 << masklen) - 1);
5324   }
5325 
5326   vector_mask_operation_helper(opc, dst, tmp, masklen);
5327 }
5328 
5329 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5330                                              Register rtmp2, int mask_len) {
5331   kmov(rtmp1, src);
5332   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5333   mov64(rtmp2, -1L);
5334   pextq(rtmp2, rtmp2, rtmp1);
5335   kmov(dst, rtmp2);
5336 }
5337 
5338 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5339                                                     XMMRegister mask, Register rtmp, Register rscratch,
5340                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5341                                                     int vec_enc) {
5342   assert(type2aelembytes(bt) >= 4, "");
5343   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5344   address compress_perm_table = nullptr;
5345   address expand_perm_table = nullptr;
5346   if (type2aelembytes(bt) == 8) {
5347     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5348     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5349     vmovmskpd(rtmp, mask, vec_enc);
5350   } else {
5351     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5352     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5353     vmovmskps(rtmp, mask, vec_enc);
5354   }
5355   shlq(rtmp, 5); // for 32 byte permute row.
5356   if (opcode == Op_CompressV) {
5357     lea(rscratch, ExternalAddress(compress_perm_table));
5358   } else {
5359     lea(rscratch, ExternalAddress(expand_perm_table));
5360   }
5361   addptr(rtmp, rscratch);
5362   vmovdqu(permv, Address(rtmp));
5363   vpermps(dst, permv, src, Assembler::AVX_256bit);
5364   vpxor(xtmp, xtmp, xtmp, vec_enc);
5365   // Blend the result with a zero vector using the permute mask: each column entry
5366   // in a permute table row contains either a valid permute index or a -1 (default)
5367   // value, so the row can also be used as a blending mask after
5368   // compressing/expanding the source vector lanes.
5369   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5370 }
5371 
5372 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5373                                                bool merge, BasicType bt, int vec_enc) {
5374   if (opcode == Op_CompressV) {
5375     switch(bt) {
5376     case T_BYTE:
5377       evpcompressb(dst, mask, src, merge, vec_enc);
5378       break;
5379     case T_CHAR:
5380     case T_SHORT:
5381       evpcompressw(dst, mask, src, merge, vec_enc);
5382       break;
5383     case T_INT:
5384       evpcompressd(dst, mask, src, merge, vec_enc);
5385       break;
5386     case T_FLOAT:
5387       evcompressps(dst, mask, src, merge, vec_enc);
5388       break;
5389     case T_LONG:
5390       evpcompressq(dst, mask, src, merge, vec_enc);
5391       break;
5392     case T_DOUBLE:
5393       evcompresspd(dst, mask, src, merge, vec_enc);
5394       break;
5395     default:
5396       fatal("Unsupported type %s", type2name(bt));
5397       break;
5398     }
5399   } else {
5400     assert(opcode == Op_ExpandV, "");
5401     switch(bt) {
5402     case T_BYTE:
5403       evpexpandb(dst, mask, src, merge, vec_enc);
5404       break;
5405     case T_CHAR:
5406     case T_SHORT:
5407       evpexpandw(dst, mask, src, merge, vec_enc);
5408       break;
5409     case T_INT:
5410       evpexpandd(dst, mask, src, merge, vec_enc);
5411       break;
5412     case T_FLOAT:
5413       evexpandps(dst, mask, src, merge, vec_enc);
5414       break;
5415     case T_LONG:
5416       evpexpandq(dst, mask, src, merge, vec_enc);
5417       break;
5418     case T_DOUBLE:
5419       evexpandpd(dst, mask, src, merge, vec_enc);
5420       break;
5421     default:
5422       fatal("Unsupported type %s", type2name(bt));
5423       break;
5424     }
5425   }
5426 }
5427 #endif
5428 
5429 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5430                                            KRegister ktmp1, int vec_enc) {
5431   if (opcode == Op_SignumVD) {
5432     vsubpd(dst, zero, one, vec_enc);
5433     // if src < 0 ? -1 : 1
5434     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5435     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5436     // if src == NaN, -0.0 or 0.0 return src.
5437     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5438     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5439   } else {
5440     assert(opcode == Op_SignumVF, "");
5441     vsubps(dst, zero, one, vec_enc);
5442     // if src < 0 ? -1 : 1
5443     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5444     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5445     // if src == NaN, -0.0 or 0.0 return src.
5446     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5447     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5448   }
5449 }
5450 
5451 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5452                                           XMMRegister xtmp1, int vec_enc) {
5453   if (opcode == Op_SignumVD) {
5454     vsubpd(dst, zero, one, vec_enc);
5455     // if src < 0 ? -1 : 1
5456     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5457     // if src == NaN, -0.0 or 0.0 return src.
5458     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5459     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5460   } else {
5461     assert(opcode == Op_SignumVF, "");
5462     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5464     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5466     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5467     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5468   }
5469 }
5470 
5471 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5472   if (VM_Version::supports_avx512bw()) {
5473     if (mask_len > 32) {
5474       kmovql(dst, src);
5475     } else {
5476       kmovdl(dst, src);
5477       if (mask_len != 32) {
5478         kshiftrdl(dst, dst, 32 - mask_len);
5479       }
5480     }
5481   } else {
5482     assert(mask_len <= 16, "");
5483     kmovwl(dst, src);
5484     if (mask_len != 16) {
5485       kshiftrwl(dst, dst, 16 - mask_len);
5486     }
5487   }
5488 }
5489 
5490 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5491   int lane_size = type2aelembytes(bt);
5492   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5493   if ((is_LP64 || lane_size < 8) &&
5494       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5495        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5496     movptr(rtmp, imm32);
5497     switch(lane_size) {
5498       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5499       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5500       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5501       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5504     }
5505   } else {
5506     movptr(rtmp, imm32);
5507     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5508     switch(lane_size) {
5509       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5510       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5511       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5512       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5515     }
5516   }
5517 }
5518 
5519 //
// The following is a lookup table based popcount computation algorithm:
5521 //       Index   Bit set count
5522 //     [ 0000 ->   0,
5523 //       0001 ->   1,
5524 //       0010 ->   1,
5525 //       0011 ->   2,
5526 //       0100 ->   1,
5527 //       0101 ->   2,
5528 //       0110 ->   2,
5529 //       0111 ->   3,
5530 //       1000 ->   1,
5531 //       1001 ->   2,
//       1010 ->   2,
5533 //       1011 ->   3,
5534 //       1100 ->   2,
5535 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5537 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5538 //     shuffle indices for lookup table access.
5539 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5541 //     shuffle indices for lookup table access.
5542 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5543 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5544 //     count of all the bytes of a quadword.
5545 //  f. Perform step e. for upper 128bit vector lane.
5546 //  g. Pack the bitset count of quadwords back to double word.
5547 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
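//
//  For reference, a scalar sketch of steps a-d for a single byte (illustrative only;
//  the vector code performs the table lookups with vpshufb):
//
//    static inline uint8_t popcount_byte(uint8_t b) {
//      static const uint8_t lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//      return lut[b & 0x0F] + lut[b >> 4];
//    }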
5548 
5549 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5550                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5551   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5552   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5553   vpsrlw(dst, src, 4, vec_enc);
5554   vpand(dst, dst, xtmp1, vec_enc);
5555   vpand(xtmp1, src, xtmp1, vec_enc);
5556   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5557   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5558   vpshufb(dst, xtmp2, dst, vec_enc);
5559   vpaddb(dst, dst, xtmp1, vec_enc);
5560 }
5561 
5562 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5563                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5564   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code corresponds to steps e, f, g and h of the above algorithm.
5566   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5567   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5568   vpsadbw(dst, dst, xtmp2, vec_enc);
5569   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5570   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5571   vpackuswb(dst, xtmp1, dst, vec_enc);
5572 }
5573 
5574 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5575                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5576   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5577   // Add the popcount of upper and lower bytes of word.
5578   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5579   vpsrlw(dst, xtmp1, 8, vec_enc);
5580   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5581   vpaddw(dst, dst, xtmp1, vec_enc);
5582 }
5583 
5584 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5585                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5586   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5587   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5588   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5589 }
5590 
5591 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5592                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5593   switch(bt) {
5594     case T_LONG:
5595       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5596       break;
5597     case T_INT:
5598       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5599       break;
5600     case T_CHAR:
5601     case T_SHORT:
5602       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5603       break;
5604     case T_BYTE:
5605     case T_BOOLEAN:
5606       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5607       break;
5608     default:
5609       fatal("Unsupported type %s", type2name(bt));
5610       break;
5611   }
5612 }
5613 
5614 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5615                                                       KRegister mask, bool merge, int vec_enc) {
5616   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5617   switch(bt) {
5618     case T_LONG:
5619       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5620       evpopcntq(dst, mask, src, merge, vec_enc);
5621       break;
5622     case T_INT:
5623       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5624       evpopcntd(dst, mask, src, merge, vec_enc);
5625       break;
5626     case T_CHAR:
5627     case T_SHORT:
5628       assert(VM_Version::supports_avx512_bitalg(), "");
5629       evpopcntw(dst, mask, src, merge, vec_enc);
5630       break;
5631     case T_BYTE:
5632     case T_BOOLEAN:
5633       assert(VM_Version::supports_avx512_bitalg(), "");
5634       evpopcntb(dst, mask, src, merge, vec_enc);
5635       break;
5636     default:
5637       fatal("Unsupported type %s", type2name(bt));
5638       break;
5639   }
5640 }
5641 
5642 #ifndef _LP64
5643 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5644   assert(VM_Version::supports_avx512bw(), "");
5645   kmovdl(tmp, src);
5646   kunpckdql(dst, tmp, tmp);
5647 }
5648 #endif
5649 
// The bit reversal algorithm first reverses the bits of each byte, followed by a
// byte level reversal for multi-byte primitive types (short/int/long). The
// algorithm uses a lookup table to obtain the reverse bit sequence corresponding
// to each 4 bit value; the reverse bit sequence of a byte is then formed by
// swapping the reversed upper and lower nibbles of that byte.
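//
// For reference, a scalar sketch of the per-byte step (illustrative only; the vector
// code performs the nibble lookups with vpshufb against vector_reverse_bit_lut):
//
//   static inline uint8_t reverse_bits_in_byte(uint8_t b) {
//     static const uint8_t rev_nibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//                                            0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
//     return (uint8_t)((rev_nibble[b & 0x0F] << 4) | rev_nibble[b >> 4]);
//   }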
5656 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5657                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5658   if (VM_Version::supports_avx512vlbw()) {
5659 
5660     // Get the reverse bit sequence of lower nibble of each byte.
5661     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5662     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5663     evpandq(dst, xtmp2, src, vec_enc);
5664     vpshufb(dst, xtmp1, dst, vec_enc);
5665     vpsllq(dst, dst, 4, vec_enc);
5666 
5667     // Get the reverse bit sequence of upper nibble of each byte.
5668     vpandn(xtmp2, xtmp2, src, vec_enc);
5669     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5670     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5671 
5672     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5673     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5674     evporq(xtmp2, dst, xtmp2, vec_enc);
5675     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5676 
  } else if (vec_enc == Assembler::AVX_512bit) {
5678     // Shift based bit reversal.
5679     assert(bt == T_LONG || bt == T_INT, "");
5680 
5681     // Swap lower and upper nibble of each byte.
5682     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5683 
5684     // Swap two least and most significant bits of each nibble.
5685     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5686 
5687     // Swap adjacent pair of bits.
5688     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5689     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5690 
5691     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5692     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5693   } else {
5694     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5695     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5696 
5697     // Get the reverse bit sequence of lower nibble of each byte.
5698     vpand(dst, xtmp2, src, vec_enc);
5699     vpshufb(dst, xtmp1, dst, vec_enc);
5700     vpsllq(dst, dst, 4, vec_enc);
5701 
5702     // Get the reverse bit sequence of upper nibble of each byte.
5703     vpandn(xtmp2, xtmp2, src, vec_enc);
5704     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5705     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5706 
5707     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5708     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5709     vpor(xtmp2, dst, xtmp2, vec_enc);
5710     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5711   }
5712 }
5713 
5714 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5715                                                 XMMRegister xtmp, Register rscratch) {
5716   assert(VM_Version::supports_gfni(), "");
5717   assert(rscratch != noreg || always_reachable(mask), "missing");
5718 
  // Galois field instruction based bit reversal, as per the following algorithm:
5720   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5721   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5722   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5723   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5724 }
5725 
5726 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5727                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5728   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5729   evpandq(dst, xtmp1, src, vec_enc);
5730   vpsllq(dst, dst, nbits, vec_enc);
5731   vpandn(xtmp1, xtmp1, src, vec_enc);
5732   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5733   evporq(dst, dst, xtmp1, vec_enc);
5734 }
5735 
5736 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5737                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5738   // Shift based bit reversal.
5739   assert(VM_Version::supports_evex(), "");
5740   switch(bt) {
5741     case T_LONG:
5742       // Swap upper and lower double word of each quad word.
5743       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5744       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5745       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5746       break;
5747     case T_INT:
5748       // Swap upper and lower word of each double word.
5749       evprord(xtmp1, k0, src, 16, true, vec_enc);
5750       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5751       break;
5752     case T_CHAR:
5753     case T_SHORT:
5754       // Swap upper and lower byte of each word.
5755       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5756       break;
5757     case T_BYTE:
5758       evmovdquq(dst, k0, src, true, vec_enc);
5759       break;
5760     default:
5761       fatal("Unsupported type %s", type2name(bt));
5762       break;
5763   }
5764 }
5765 
5766 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5767   if (bt == T_BYTE) {
5768     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5769       evmovdquq(dst, k0, src, true, vec_enc);
5770     } else {
5771       vmovdqu(dst, src);
5772     }
5773     return;
5774   }
5775   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5776   // pre-computed shuffle indices.
5777   switch(bt) {
5778     case T_LONG:
5779       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5780       break;
5781     case T_INT:
5782       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5783       break;
5784     case T_CHAR:
5785     case T_SHORT:
5786       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5787       break;
5788     default:
5789       fatal("Unsupported type %s", type2name(bt));
5790       break;
5791   }
5792   vpshufb(dst, src, dst, vec_enc);
5793 }
5794 
5795 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5796                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5797                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5798   assert(is_integral_type(bt), "");
5799   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5800   assert(VM_Version::supports_avx512cd(), "");
5801   switch(bt) {
5802     case T_LONG:
5803       evplzcntq(dst, ktmp, src, merge, vec_enc);
5804       break;
5805     case T_INT:
5806       evplzcntd(dst, ktmp, src, merge, vec_enc);
5807       break;
5808     case T_SHORT:
5809       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5810       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5811       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5812       vpunpckhwd(dst, xtmp1, src, vec_enc);
5813       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5814       vpackusdw(dst, xtmp2, dst, vec_enc);
5815       break;
5816     case T_BYTE:
5817       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5818       // accessing the lookup table.
5819       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5820       // accessing the lookup table.
5821       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
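      // Scalar sketch (illustrative): clz8(b) = ((b >> 4) == 0) ? lut[b >> 4] + lut[b & 0x0F] : lut[b >> 4],
      // where lut[n] is the leading zero count of the 4 bit value n (so lut[0] == 4).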
5822       assert(VM_Version::supports_avx512bw(), "");
5823       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5824       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5825       vpand(xtmp2, dst, src, vec_enc);
5826       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5827       vpsrlw(xtmp3, src, 4, vec_enc);
5828       vpand(xtmp3, dst, xtmp3, vec_enc);
5829       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5830       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5831       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5832       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5833       break;
5834     default:
5835       fatal("Unsupported type %s", type2name(bt));
5836       break;
5837   }
5838 }
5839 
5840 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5841                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5842   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5843   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5844   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5845   // accessing the lookup table.
5846   vpand(dst, xtmp2, src, vec_enc);
5847   vpshufb(dst, xtmp1, dst, vec_enc);
5848   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5849   // accessing the lookup table.
5850   vpsrlw(xtmp3, src, 4, vec_enc);
5851   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5852   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5853   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5854   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5855   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5856   vpaddb(dst, dst, xtmp2, vec_enc);
5857   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5858 }
5859 
5860 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5861                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5862   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5863   // Add zero counts of lower byte and upper byte of a word if
5864   // upper byte holds a zero value.
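  // i.e. clz16(w) = ((w >> 8) == 0) ? 8 + clz8(w & 0xFF) : clz8(w >> 8) (illustrative).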
5865   vpsrlw(xtmp3, src, 8, vec_enc);
5866   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5867   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5868   vpsllw(xtmp2, dst, 8, vec_enc);
5869   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5870   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5871   vpsrlw(dst, dst, 8, vec_enc);
5872 }
5873 
5874 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5875                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized
  // 1.x form, the biased exponent can be used to compute the leading zero count:
  //   LZCNT = 31 - (biased_exp - 127), computed below as 32 - ((biased_exp - 127) + 1)
  // Special handling is needed for zero, max_int and negative source values.
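  //
  // Scalar sketch of the idea (illustrative only; exact for positive lane values whose
  // float conversion does not round up across a power of two):
  //   float    f    = (float) x;
  //   uint32_t bits;  memcpy(&bits, &f, sizeof(bits));
  //   int      exp   = (bits >> 23) & 0xFF;       // biased exponent
  //   int      lzcnt = 32 - ((exp - 127) + 1);    // == 31 - (exp - 127)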
5881 
5882   // Broadcast 0xFF
5883   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5884   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5885 
5886   // Extract biased exponent.
5887   vcvtdq2ps(dst, src, vec_enc);
5888   vpsrld(dst, dst, 23, vec_enc);
5889   vpand(dst, dst, xtmp1, vec_enc);
5890 
5891   // Broadcast 127.
5892   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5893   // Exponent = biased_exp - 127
5894   vpsubd(dst, dst, xtmp1, vec_enc);
5895 
  // Exponent = Exponent + 1
5897   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5898   vpaddd(dst, dst, xtmp3, vec_enc);
5899 
5900   // Replace -ve exponent with zero, exponent is -ve when src
5901   // lane contains a zero value.
5902   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5903   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5904 
5905   // Rematerialize broadcast 32.
5906   vpslld(xtmp1, xtmp3, 5, vec_enc);
5907   // Exponent is 32 if corresponding source lane contains max_int value.
5908   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5909   // LZCNT = 32 - exponent
5910   vpsubd(dst, xtmp1, dst, vec_enc);
5911 
5912   // Replace LZCNT with a value 1 if corresponding source lane
5913   // contains max_int value.
5914   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5915 
5916   // Replace biased_exp with 0 if source lane value is less than zero.
5917   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5918   vblendvps(dst, dst, xtmp2, src, vec_enc);
5919 }
5920 
5921 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5922                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5923   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5924   // Add zero counts of lower word and upper word of a double word if
5925   // upper word holds a zero value.
5926   vpsrld(xtmp3, src, 16, vec_enc);
5927   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5928   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5929   vpslld(xtmp2, dst, 16, vec_enc);
5930   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5931   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5932   vpsrld(dst, dst, 16, vec_enc);
5933   // Add zero counts of lower doubleword and upper doubleword of a
5934   // quadword if upper doubleword holds a zero value.
5935   vpsrlq(xtmp3, src, 32, vec_enc);
5936   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5937   vpsllq(xtmp2, dst, 32, vec_enc);
5938   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5939   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5940   vpsrlq(dst, dst, 32, vec_enc);
5941 }
5942 
5943 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5944                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5945                                                        Register rtmp, int vec_enc) {
5946   assert(is_integral_type(bt), "unexpected type");
5947   assert(vec_enc < Assembler::AVX_512bit, "");
5948   switch(bt) {
5949     case T_LONG:
5950       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5951       break;
5952     case T_INT:
5953       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5954       break;
5955     case T_SHORT:
5956       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5957       break;
5958     case T_BYTE:
5959       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5960       break;
5961     default:
5962       fatal("Unsupported type %s", type2name(bt));
5963       break;
5964   }
5965 }
5966 
5967 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5968   switch(bt) {
5969     case T_BYTE:
5970       vpsubb(dst, src1, src2, vec_enc);
5971       break;
5972     case T_SHORT:
5973       vpsubw(dst, src1, src2, vec_enc);
5974       break;
5975     case T_INT:
5976       vpsubd(dst, src1, src2, vec_enc);
5977       break;
5978     case T_LONG:
5979       vpsubq(dst, src1, src2, vec_enc);
5980       break;
5981     default:
5982       fatal("Unsupported type %s", type2name(bt));
5983       break;
5984   }
5985 }
5986 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
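// e.g. for an 8 bit lane with x = 0b00001000: (x - 1) & ~x = 0b00000111, CLZ of that
// value is 5, and CTZ = 8 - 5 = 3, matching the three trailing zeros of x.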
5991 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5992                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5993                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5994   assert(is_integral_type(bt), "");
5995   // xtmp = -1
5996   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5997   // xtmp = xtmp + src
5998   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5999   // xtmp = xtmp & ~src
6000   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6001   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6002   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6003   vpsub(bt, dst, xtmp4, dst, vec_enc);
6004 }
6005 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
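// e.g. for an 8 bit lane with x = 0b00001000: -x = 0b11111000, x | -x = 0b11111000,
// POPC of that value is 5, and CTZ = 8 - 5 = 3.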
6008 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6009                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6010   assert(is_integral_type(bt), "");
6011   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6013   // xtmp = 0 - src
6014   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6015   // xtmp = xtmp | src
6016   vpor(xtmp3, xtmp3, src, vec_enc);
6017   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6018   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6019   vpsub(bt, dst, xtmp1, dst, vec_enc);
6020 }
6021 
6022 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6023   Label done;
6024   Label neg_divisor_fastpath;
6025   cmpl(divisor, 0);
6026   jccb(Assembler::less, neg_divisor_fastpath);
6027   xorl(rdx, rdx);
6028   divl(divisor);
6029   jmpb(done);
6030   bind(neg_divisor_fastpath);
6031   // Fastpath for divisor < 0:
6032   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6033   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
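  // Rationale (illustrative): a negative divisor is >= 2^31 when viewed as unsigned,
  // so the unsigned quotient can only be 0 or 1, and it is 1 exactly when
  // dividend >= divisor (unsigned); the expression below computes that bit without
  // performing a division:
  //   uint32_t q = (dividend & ~(dividend - divisor)) >> 31;   // 0 or 1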
6034   movl(rdx, rax);
6035   subl(rdx, divisor);
6036   if (VM_Version::supports_bmi1()) {
6037     andnl(rax, rdx, rax);
6038   } else {
6039     notl(rdx);
6040     andl(rax, rdx);
6041   }
6042   shrl(rax, 31);
6043   bind(done);
6044 }
6045 
6046 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6047   Label done;
6048   Label neg_divisor_fastpath;
6049   cmpl(divisor, 0);
6050   jccb(Assembler::less, neg_divisor_fastpath);
6051   xorl(rdx, rdx);
6052   divl(divisor);
6053   jmpb(done);
6054   bind(neg_divisor_fastpath);
6055   // Fastpath when divisor < 0:
6056   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6057   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
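  // Rationale (illustrative): the arithmetic shift below turns the 0/1 quotient bit
  // into an all-zeros/all-ones mask, so (mask & divisor) equals quotient * divisor
  // and the remainder is dividend - quotient * divisor.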
6058   movl(rdx, rax);
6059   subl(rax, divisor);
6060   if (VM_Version::supports_bmi1()) {
6061     andnl(rax, rax, rdx);
6062   } else {
6063     notl(rax);
6064     andl(rax, rdx);
6065   }
6066   sarl(rax, 31);
6067   andl(rax, divisor);
6068   subl(rdx, rax);
6069   bind(done);
6070 }
6071 
6072 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6073   Label done;
6074   Label neg_divisor_fastpath;
6075 
6076   cmpl(divisor, 0);
6077   jccb(Assembler::less, neg_divisor_fastpath);
6078   xorl(rdx, rdx);
6079   divl(divisor);
6080   jmpb(done);
6081   bind(neg_divisor_fastpath);
6082   // Fastpath for divisor < 0:
6083   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6084   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6085   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6086   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6087   movl(rdx, rax);
6088   subl(rax, divisor);
6089   if (VM_Version::supports_bmi1()) {
6090     andnl(rax, rax, rdx);
6091   } else {
6092     notl(rax);
6093     andl(rax, rdx);
6094   }
6095   movl(tmp, rax);
6096   shrl(rax, 31); // quotient
6097   sarl(tmp, 31);
6098   andl(tmp, divisor);
6099   subl(rdx, tmp); // remainder
6100   bind(done);
6101 }
6102 
6103 #ifdef _LP64
6104 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6105                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
6108     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
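    // The 8x8 bit matrix constant 0x8040201008040201 maps bit i of every byte to
    // bit (7 - i), so the affine transform below reverses the bit order within each
    // byte, as described in the article above.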
6109     mov64(rtmp, 0x8040201008040201L);
6110     movq(xtmp1, src);
6111     movq(xtmp2, rtmp);
6112     gf2p8affineqb(xtmp1, xtmp2, 0);
6113     movq(dst, xtmp1);
6114   } else {
6115     // Swap even and odd numbered bits.
6116     movl(rtmp, src);
6117     andl(rtmp, 0x55555555);
6118     shll(rtmp, 1);
6119     movl(dst, src);
6120     andl(dst, 0xAAAAAAAA);
6121     shrl(dst, 1);
6122     orl(dst, rtmp);
6123 
6124     // Swap LSB and MSB 2 bits of each nibble.
6125     movl(rtmp, dst);
6126     andl(rtmp, 0x33333333);
6127     shll(rtmp, 2);
6128     andl(dst, 0xCCCCCCCC);
6129     shrl(dst, 2);
6130     orl(dst, rtmp);
6131 
6132     // Swap LSB and MSB 4 bits of each byte.
6133     movl(rtmp, dst);
6134     andl(rtmp, 0x0F0F0F0F);
6135     shll(rtmp, 4);
6136     andl(dst, 0xF0F0F0F0);
6137     shrl(dst, 4);
6138     orl(dst, rtmp);
6139   }
6140   bswapl(dst);
6141 }
6142 
6143 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6144                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
6147     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6148     mov64(rtmp1, 0x8040201008040201L);
6149     movq(xtmp1, src);
6150     movq(xtmp2, rtmp1);
6151     gf2p8affineqb(xtmp1, xtmp2, 0);
6152     movq(dst, xtmp1);
6153   } else {
6154     // Swap even and odd numbered bits.
6155     movq(rtmp1, src);
6156     mov64(rtmp2, 0x5555555555555555L);
6157     andq(rtmp1, rtmp2);
6158     shlq(rtmp1, 1);
6159     movq(dst, src);
6160     notq(rtmp2);
6161     andq(dst, rtmp2);
6162     shrq(dst, 1);
6163     orq(dst, rtmp1);
6164 
6165     // Swap LSB and MSB 2 bits of each nibble.
6166     movq(rtmp1, dst);
6167     mov64(rtmp2, 0x3333333333333333L);
6168     andq(rtmp1, rtmp2);
6169     shlq(rtmp1, 2);
6170     notq(rtmp2);
6171     andq(dst, rtmp2);
6172     shrq(dst, 2);
6173     orq(dst, rtmp1);
6174 
6175     // Swap LSB and MSB 4 bits of each byte.
6176     movq(rtmp1, dst);
6177     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6178     andq(rtmp1, rtmp2);
6179     shlq(rtmp1, 4);
6180     notq(rtmp2);
6181     andq(dst, rtmp2);
6182     shrq(dst, 4);
6183     orq(dst, rtmp1);
6184   }
6185   bswapq(dst);
6186 }
6187 
6188 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6189   Label done;
6190   Label neg_divisor_fastpath;
6191   cmpq(divisor, 0);
6192   jccb(Assembler::less, neg_divisor_fastpath);
6193   xorl(rdx, rdx);
6194   divq(divisor);
6195   jmpb(done);
6196   bind(neg_divisor_fastpath);
6197   // Fastpath for divisor < 0:
6198   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6199   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6200   movq(rdx, rax);
6201   subq(rdx, divisor);
6202   if (VM_Version::supports_bmi1()) {
6203     andnq(rax, rdx, rax);
6204   } else {
6205     notq(rdx);
6206     andq(rax, rdx);
6207   }
6208   shrq(rax, 63);
6209   bind(done);
6210 }
6211 
6212 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6213   Label done;
6214   Label neg_divisor_fastpath;
6215   cmpq(divisor, 0);
6216   jccb(Assembler::less, neg_divisor_fastpath);
6217   xorq(rdx, rdx);
6218   divq(divisor);
6219   jmp(done);
6220   bind(neg_divisor_fastpath);
6221   // Fastpath when divisor < 0:
6222   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6223   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6224   movq(rdx, rax);
6225   subq(rax, divisor);
6226   if (VM_Version::supports_bmi1()) {
6227     andnq(rax, rax, rdx);
6228   } else {
6229     notq(rax);
6230     andq(rax, rdx);
6231   }
6232   sarq(rax, 63);
6233   andq(rax, divisor);
6234   subq(rdx, rax);
6235   bind(done);
6236 }
6237 
6238 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6239   Label done;
6240   Label neg_divisor_fastpath;
6241   cmpq(divisor, 0);
6242   jccb(Assembler::less, neg_divisor_fastpath);
6243   xorq(rdx, rdx);
6244   divq(divisor);
6245   jmp(done);
6246   bind(neg_divisor_fastpath);
6247   // Fastpath for divisor < 0:
6248   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6249   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6250   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6251   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6252   movq(rdx, rax);
6253   subq(rax, divisor);
6254   if (VM_Version::supports_bmi1()) {
6255     andnq(rax, rax, rdx);
6256   } else {
6257     notq(rax);
6258     andq(rax, rdx);
6259   }
6260   movq(tmp, rax);
6261   shrq(rax, 63); // quotient
6262   sarq(tmp, 63);
6263   andq(tmp, divisor);
6264   subq(rdx, tmp); // remainder
6265   bind(done);
6266 }
6267 #endif
6268 
6269 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6270                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6271                                         int vlen_enc) {
6272   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are normalized
  // to the index range 0-15. As a result, shuffle indices which differ only by a
  // multiple of 16 select the same relative position within a 128 bit lane, e.g.
  // shuffle indices 16, 32 and 48 all select the 0th byte of their respective
  // 128 bit lanes.
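  // e.g. shuffle index 37: 37 & 0x0F == 5 and 37 / 16 == 2, so it refers to byte 5
  // of the third 128 bit lane of src (illustrative).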
6279   movl(rtmp, 16);
6280   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6281 
6282   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6283   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6284   // original shuffle indices and move the shuffled lanes corresponding to true
6285   // mask to destination vector.
6286   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6287   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6288   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6289 
6290   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6291   // and broadcasting second 128 bit lane.
6292   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6293   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6294   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6295   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6296   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6297 
6298   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6299   // and broadcasting third 128 bit lane.
6300   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6301   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6302   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6303   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6304   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6305 
6306   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6308   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6309   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6310   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6311   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6312   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6313 }
6314 
6315 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6316                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6317   if (vlen_enc == AVX_128bit) {
6318     vpermilps(dst, src, shuffle, vlen_enc);
6319   } else if (bt == T_INT) {
6320     vpermd(dst, shuffle, src, vlen_enc);
6321   } else {
6322     assert(bt == T_FLOAT, "");
6323     vpermps(dst, shuffle, src, vlen_enc);
6324   }
6325 }