1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  69   // We require that their callers must bang for them.  But be careful, because
  70   // some VM calls (such as call site linkage) can use several kilobytes of
  71   // stack.  But the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
  76     // We always push rbp so that, on return to the interpreter, rbp will be
  77     // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
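       // In both branches the caller's rbp has been saved immediately below the
       // return address; with PreserveFramePointer, rbp now points at that save
       // slot so the caller's frame can still be reached through it.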
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 137 #ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
 187 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 192 // if the processor uses simple bimodal branch predictors keyed by EIP,
 193 // since the helper routines would be called from multiple synchronization
 194 // sites.
 195 //
 196 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
 200 // Done correctly, the only time we'd need to cross to native code would be
 201 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 202 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 203 // (b) issue explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 216 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
 228 //    object is locked by the calling thread but the waitlist is empty,
 229 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
 244 //    FailureLabel.
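     //
     // Roughly, this is how C2 consumes the ZF protocol today (a sketch of the
     // code shape around cmpFastLock/cmpFastUnlock, not the exact emitted
     // instructions):
     //
     //   fast_lock(obj, box, ...)   // leaves ZF == 1 on success, ZF == 0 on failure
     //   jne   slow_path            // ZF == 0: call the runtime monitorenter helper
     //   ...                        // ZF == 1: lock acquired, fall through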
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
 249 // rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 281     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 282     jcc(Assembler::notZero, DONE_LABEL);
 283   }
 284 
 285   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 286   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 287   jcc(Assembler::notZero, IsInflated);
 288 
 289   if (LockingMode == LM_MONITOR) {
 290     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 291     testptr(objReg, objReg);
 292   } else {
 293     assert(LockingMode == LM_LEGACY, "must be");
 294     // Attempt stack-locking ...
 295     orptr (tmpReg, markWord::unlocked_value);
 296     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 297     lock();
 298     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 299     jcc(Assembler::equal, COUNT);           // Success
 300 
 301     // Recursive locking.
 302     // The object is stack-locked: markword contains stack pointer to BasicLock.
 303     // Locked by current thread if difference with current SP is less than one page.
 304     subptr(tmpReg, rsp);
 305     // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
 306     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
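         // On success tmpReg is now zero; storing it as the displaced header marks
         // a recursive stack-lock (fast_unlock checks for 0 and skips the unlocking CAS).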
 307     movptr(Address(boxReg, 0), tmpReg);
 308   }
 309   jmp(DONE_LABEL);
 310 
 311   bind(IsInflated);
 312   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 313 
 314 #ifndef _LP64
 315   // The object is inflated.
 316 
 317   // boxReg refers to the on-stack BasicLock in the current frame.
 318   // We'd like to write:
 319   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 320   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 321   // additional latency as we have another ST in the store buffer that must drain.
 322 
 323   // avoid ST-before-CAS
 324   // register juggle because we need tmpReg for cmpxchgptr below
 325   movptr(scrReg, boxReg);
 326   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 327 
 328   // Optimistic form: consider XORL tmpReg,tmpReg
 329   movptr(tmpReg, NULL_WORD);
 330 
 331   // Appears unlocked - try to swing _owner from null to non-null.
 332   // Ideally, I'd manifest "Self" with get_thread and then attempt
 333   // to CAS the register containing Self into m->Owner.
 334   // But we don't have enough registers, so instead we can either try to CAS
 335   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 336   // we later store "Self" into m->Owner.  Transiently storing a stack address
 337   // (rsp or the address of the box) into  m->owner is harmless.
 338   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 339   lock();
 340   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 341   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 342   // If we weren't able to swing _owner from null to the BasicLock
 343   // then take the slow path.
 344   jccb  (Assembler::notZero, NO_COUNT);
 345   // update _owner from BasicLock to thread
 346   get_thread (scrReg);                    // beware: clobbers ICCs
 347   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 348   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 349 
 350   // If the CAS fails we can either retry or pass control to the slow path.
 351   // We use the latter tactic.
 352   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 353   // If the CAS was successful ...
 354   //   Self has acquired the lock
 355   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 356   // Intentional fall-through into DONE_LABEL ...
 357 #else // _LP64
 358   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 359   movq(scrReg, tmpReg);
 360   xorq(tmpReg, tmpReg);
 361   lock();
 362   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 363   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 364   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 365   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 366   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 367   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 368 
 369   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 370   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 371   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 372   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 373 #endif // _LP64
 374   bind(DONE_LABEL);
 375 
 376   // ZFlag == 1 count in fast path
 377   // ZFlag == 0 count in slow path
 378   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 379 
 380   bind(COUNT);
 381   // Count monitors in fast path
 382   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 383 
 384   xorl(tmpReg, tmpReg); // Set ZF == 1
 385 
 386   bind(NO_COUNT);
 387 
 388   // At NO_COUNT the icc ZFlag is set as follows ...
 389   // fast_unlock uses the same protocol.
 390   // ZFlag == 1 -> Success
 391   // ZFlag == 0 -> Failure - force control through the slow path
 392 }
 393 
 394 // obj: object to unlock
 395 // box: box address (displaced header location), killed.  Must be EAX.
 396 // tmp: killed, cannot be obj nor box.
 397 //
 398 // Some commentary on balanced locking:
 399 //
 400 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 401 // Methods that don't have provably balanced locking are forced to run in the
 402 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 403 // The interpreter provides two properties:
 404 // I1:  At return-time the interpreter automatically and quietly unlocks any
 405 //      objects acquired by the current activation (frame).  Recall that the
 406 //      interpreter maintains an on-stack list of locks currently held by
 407 //      a frame.
 408 // I2:  If a method attempts to unlock an object that is not held by the
 409 //      frame, the interpreter throws IMSX.
 410 //
 411 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 412 // B() doesn't have provably balanced locking so it runs in the interpreter.
 413 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 414 // is still locked by A().
 415 //
 416 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 417 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 418 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 419 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 420 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 421 // could reasonably *avoid* checking owner in fast_unlock().
 422 // In the interest of performance we elide m->Owner==Self check in unlock.
 423 // A perfectly viable alternative is to elide the owner check except when
 424 // Xcheck:jni is enabled.
 425 
 426 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 427   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 428   assert(boxReg == rax, "");
 429   assert_different_registers(objReg, boxReg, tmpReg);
 430 
 431   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 432 
 433   if (LockingMode == LM_LEGACY) {
 434     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 435     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 436   }
 437   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 438   if (LockingMode != LM_MONITOR) {
 439     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 440     jcc(Assembler::zero, Stacked);
 441   }
 442 
 443   // It's inflated.
 444 
 445   // Despite our balanced locking property we still check that m->_owner == Self
 446   // as java routines or native JNI code called by this thread might
 447   // have released the lock.
 448   // Refer to the comments in synchronizer.cpp for how we might encode extra
 449   // state in _succ so we can avoid fetching EntryList|cxq.
 450   //
 451   // If there's no contention try a 1-0 exit.  That is, exit without
 452   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 453   // we detect and recover from the race that the 1-0 exit admits.
 454   //
 455   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 456   // before it STs null into _owner, releasing the lock.  Updates
 457   // to data protected by the critical section must be visible before
 458   // we drop the lock (and thus before any other thread could acquire
 459   // the lock and observe the fields protected by the lock).
 460 // IA32's memory-model is TSO, so STs are ordered with respect to
 461   // each other and there's no need for an explicit barrier (fence).
 462   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 463 #ifndef _LP64
 464   // Note that we could employ various encoding schemes to reduce
 465   // the number of loads below (currently 4) to just 2 or 3.
 466   // Refer to the comments in synchronizer.cpp.
 467   // In practice the chain of fetches doesn't seem to impact performance, however.
 468   xorptr(boxReg, boxReg);
 469   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 470   jccb  (Assembler::notZero, DONE_LABEL);
 471   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 472   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 473   jccb  (Assembler::notZero, DONE_LABEL);
 474   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 475   jmpb  (DONE_LABEL);
 476 #else // _LP64
 477   // It's inflated
 478   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 479 
 480   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 481   jccb(Assembler::equal, LNotRecursive);
 482 
 483   // Recursive inflated unlock
 484   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 485   jmpb(LSuccess);
 486 
 487   bind(LNotRecursive);
 488   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 489   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 490   jccb  (Assembler::notZero, CheckSucc);
 491   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 492   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 493   jmpb  (DONE_LABEL);
 494 
 495   // Try to avoid passing control into the slow_path ...
 496   bind  (CheckSucc);
 497 
 498   // The following optional optimization can be elided if necessary
 499   // Effectively: if (succ == null) goto slow path
 500   // The code reduces the window for a race, however,
 501   // and thus benefits performance.
 502   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 503   jccb  (Assembler::zero, LGoSlowPath);
 504 
 505   xorptr(boxReg, boxReg);
 506   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 507   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 508 
 509   // Memory barrier/fence
 510   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 511   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 512   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 513   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 514   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 515   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 516   lock(); addl(Address(rsp, 0), 0);
 517 
 518   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 519   jccb  (Assembler::notZero, LSuccess);
 520 
 521   // Rare inopportune interleaving - race.
 522   // The successor vanished in the small window above.
 523   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 524   // We need to ensure progress and succession.
 525   // Try to reacquire the lock.
 526   // If that fails then the new owner is responsible for succession and this
 527   // thread needs to take no further action and can exit via the fast path (success).
 528   // If the re-acquire succeeds then pass control into the slow path.
 529 // As implemented, this latter mode is horrible because we generate more
 530 // coherence traffic on the lock *and* artificially extend the critical section
 531 // length by virtue of passing control into the slow path.
 532 
 533   // box is really RAX -- the following CMPXCHG depends on that binding
 534   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 535   lock();
 536   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 537   // There's no successor so we tried to regrab the lock.
 538   // If that didn't work, then another thread grabbed the
 539   // lock so we're done (and exit was a success).
 540   jccb  (Assembler::notEqual, LSuccess);
 541   // Intentional fall-through into slow path
 542 
 543   bind  (LGoSlowPath);
 544   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 545   jmpb  (DONE_LABEL);
 546 
 547   bind  (LSuccess);
 548   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 549   jmpb  (DONE_LABEL);
 550 
 551 #endif
 552   if (LockingMode == LM_LEGACY) {
 553     bind  (Stacked);
 554     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 555     lock();
 556     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 557     // Intentional fall-thru into DONE_LABEL
 558   }
 559 
 560   bind(DONE_LABEL);
 561 
 562   // ZFlag == 1 count in fast path
 563   // ZFlag == 0 count in slow path
 564   jccb(Assembler::notZero, NO_COUNT);
 565 
 566   bind(COUNT);
 567   // Count monitors in fast path
 568 #ifndef _LP64
 569   get_thread(tmpReg);
 570   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 571 #else // _LP64
 572   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 573 #endif
 574 
 575   xorl(tmpReg, tmpReg); // Set ZF == 1
 576 
 577   bind(NO_COUNT);
 578 }
 579 
 580 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 581                                               Register t, Register thread) {
 582   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 583   assert(rax_reg == rax, "Used for CAS");
 584   assert_different_registers(obj, box, rax_reg, t, thread);
 585 
 586   // Handle inflated monitor.
 587   Label inflated;
 588   // Finish fast lock successfully. ZF value is irrelevant.
 589   Label locked;
 590   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 591   Label slow_path;
 592 
 593   if (UseObjectMonitorTable) {
 594     // Clear cache in case fast locking succeeds.
 595     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 596   }
 597 
 598   if (DiagnoseSyncOnValueBasedClasses != 0) {
 599     load_klass(rax_reg, obj, t);
 600     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 601     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 602     jcc(Assembler::notZero, slow_path);
 603   }
 604 
 605   const Register mark = t;
 606 
 607   { // Lightweight Lock
 608 
 609     Label push;
 610 
 611     const Register top = UseObjectMonitorTable ? rax_reg : box;
 612 
 613     // Load the mark.
 614     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 615 
 616     // Prefetch top.
 617     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 618 
 619     // Check for monitor (0b10).
 620     testptr(mark, markWord::monitor_value);
 621     jcc(Assembler::notZero, inflated);
 622 
 623     // Check if lock-stack is full.
 624     cmpl(top, LockStack::end_offset() - 1);
 625     jcc(Assembler::greater, slow_path);
 626 
 627     // Check if recursive.
 628     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 629     jccb(Assembler::equal, push);
 630 
 631     // Try to lock. Transition lock bits 0b01 => 0b00
 632     movptr(rax_reg, mark);
 633     orptr(rax_reg, markWord::unlocked_value);
 634     andptr(mark, ~(int32_t)markWord::unlocked_value);
 635     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 636     jcc(Assembler::notEqual, slow_path);
 637 
 638     if (UseObjectMonitorTable) {
 639       // Need to reload top, clobbered by CAS.
 640       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 641     }
 642     bind(push);
 643     // After successful lock, push object on lock-stack.
 644     movptr(Address(thread, top), obj);
 645     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 646     jmpb(locked);
 647   }
 648 
 649   { // Handle inflated monitor.
 650     bind(inflated);
 651 
 652     const Register monitor = t;
 653 
 654     if (!UseObjectMonitorTable) {
 655       assert(mark == monitor, "should be the same here");
 656     } else {
 657       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 658       // Fetch ObjectMonitor* from the cache or take the slow-path.
 659       Label monitor_found;
 660 
 661       // Load cache address
 662       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 663 
 664       const int num_unrolled = 2;
 665       for (int i = 0; i < num_unrolled; i++) {
 666         cmpptr(obj, Address(t));
 667         jccb(Assembler::equal, monitor_found);
 668         if (i + 1 != num_unrolled) {
 669           increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 670         }
 671       }
 672 
 673       // Loop after unrolling, advance iterator.
 674       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 675 
 676       Label loop;
 677 
 678       // Search for obj in cache.
 679       bind(loop);
 680 
 681       // Check for match.
 682       cmpptr(obj, Address(t));
 683       jccb(Assembler::equal, monitor_found);
 684 
 685       // Search until null encountered, guaranteed _null_sentinel at end.
 686       cmpptr(Address(t), 1);
 687       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 688       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 689       jmpb(loop);
 690 
 691       // Cache hit.
 692       bind(monitor_found);
 693       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 694     }
 695     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 696     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 697     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
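         // Without the table, `monitor` still carries the markWord::monitor_value
         // tag from the mark word, so the tag is folded into the displacement here
         // rather than stripped from the register.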
 698 
 699     Label monitor_locked;
 700     // Lock the monitor.
 701 
 702     // CAS owner (null => current thread).
 703     xorptr(rax_reg, rax_reg);
 704     lock(); cmpxchgptr(thread, owner_address);
 705     jccb(Assembler::equal, monitor_locked);
 706 
 707     // Check if recursive.
 708     cmpptr(thread, rax_reg);
 709     jccb(Assembler::notEqual, slow_path);
 710 
 711     // Recursive.
 712     increment(recursions_address);
 713 
 714     bind(monitor_locked);
 715     if (UseObjectMonitorTable) {
 716       // Cache the monitor for unlock
 717       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 718     }
 719   }
 720 
 721   bind(locked);
 722   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 723   // Set ZF = 1
 724   xorl(rax_reg, rax_reg);
 725 
 726 #ifdef ASSERT
 727   // Check that locked label is reached with ZF set.
 728   Label zf_correct;
 729   Label zf_bad_zero;
 730   jcc(Assembler::zero, zf_correct);
 731   jmp(zf_bad_zero);
 732 #endif
 733 
 734   bind(slow_path);
 735 #ifdef ASSERT
 736   // Check that slow_path label is reached with ZF not set.
 737   jcc(Assembler::notZero, zf_correct);
 738   stop("Fast Lock ZF != 0");
 739   bind(zf_bad_zero);
 740   stop("Fast Lock ZF != 1");
 741   bind(zf_correct);
 742 #endif
 743   // C2 uses the value of ZF to determine the continuation.
 744 }
 745 
 746 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 747   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 748   assert(reg_rax == rax, "Used for CAS");
 749   assert_different_registers(obj, reg_rax, t);
 750 
 751   // Handle inflated monitor.
 752   Label inflated, inflated_check_lock_stack;
 753   // Finish fast unlock successfully.  MUST jump with ZF == 1
 754   Label unlocked;
 755 
 756   // Assume success.
 757   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
 758 
 759   const Register mark = t;
 760   const Register monitor = t;
 761   const Register top = UseObjectMonitorTable ? t : reg_rax;
 762   const Register box = reg_rax;
 763 
 764   Label dummy;
 765   C2FastUnlockLightweightStub* stub = nullptr;
 766 
 767   if (!Compile::current()->output()->in_scratch_emit_size()) {
 768     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 769     Compile::current()->output()->add_stub(stub);
 770   }
 771 
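       // During scratch emission (when C2 only measures code size) no stub exists,
       // so the local `dummy` label stands in for the out-of-line stub targets below.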
 772   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 773   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
 774   Label& slow_path = stub == nullptr ? dummy : stub->slow_path();
 775 
 776   { // Lightweight Unlock
 777 
 778     // Load top.
 779     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 780 
 781     if (!UseObjectMonitorTable) {
 782       // Prefetch mark.
 783       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 784     }
 785 
 786     // Check if obj is top of lock-stack.
 787     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 788     // Top of lock stack was not obj. Must be monitor.
 789     jcc(Assembler::notEqual, inflated_check_lock_stack);
 790 
 791     // Pop lock-stack.
 792     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 793     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 794 
 795     // Check if recursive.
 796     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 797     jcc(Assembler::equal, unlocked);
 798 
 799     // We elide the monitor check, let the CAS fail instead.
 800 
 801     if (UseObjectMonitorTable) {
 802       // Load mark.
 803       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 804     }
 805 
 806     // Try to unlock. Transition lock bits 0b00 => 0b01
 807     movptr(reg_rax, mark);
 808     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 809     orptr(mark, markWord::unlocked_value);
 810     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 811     jcc(Assembler::notEqual, push_and_slow_path);
 812     jmp(unlocked);
 813   }
 814 
 815 
 816   { // Handle inflated monitor.
 817     bind(inflated_check_lock_stack);
 818 #ifdef ASSERT
 819     Label check_done;
 820     subl(top, oopSize);
 821     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 822     jcc(Assembler::below, check_done);
 823     cmpptr(obj, Address(thread, top));
 824     jccb(Assembler::notEqual, inflated_check_lock_stack);
 825     stop("Fast Unlock lock on stack");
 826     bind(check_done);
 827     if (UseObjectMonitorTable) {
 828       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 829     }
 830     testptr(mark, markWord::monitor_value);
 831     jccb(Assembler::notZero, inflated);
 832     stop("Fast Unlock not monitor");
 833 #endif
 834 
 835     bind(inflated);
 836 
 837     if (!UseObjectMonitorTable) {
 838       assert(mark == monitor, "should be the same here");
 839     } else {
 840       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 841       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 842       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 843       cmpptr(monitor, alignof(ObjectMonitor*));
 844       jcc(Assembler::below, slow_path);
 845     }
 846     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 847     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 848     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 849     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 850     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 851 
 852     Label recursive;
 853 
 854     // Check if recursive.
 855     cmpptr(recursions_address, 0);
 856     jccb(Assembler::notEqual, recursive);
 857 
 858     // Check if the entry lists are empty.
 859     movptr(reg_rax, cxq_address);
 860     orptr(reg_rax, EntryList_address);
 861     jcc(Assembler::notZero, check_successor);
 862 
 863     // Release lock.
 864     movptr(owner_address, NULL_WORD);
 865     jmpb(unlocked);
 866 
 867     // Recursive unlock.
 868     bind(recursive);
 869     decrement(recursions_address);
 870     xorl(t, t);
 871   }
 872 
 873   bind(unlocked);
 874   if (stub != nullptr) {
 875     bind(stub->unlocked_continuation());
 876   }
 877 
 878 #ifdef ASSERT
 879   // Check that unlocked label is reached with ZF set.
 880   Label zf_correct;
 881   jcc(Assembler::zero, zf_correct);
 882   stop("Fast Unlock ZF != 1");
 883 #endif
 884 
 885   if (stub != nullptr) {
 886     bind(stub->slow_path_continuation());
 887   }
 888 #ifdef ASSERT
 889   // Check that the slow_path_continuation() label is reached with ZF not set.
 890   jccb(Assembler::notZero, zf_correct);
 891   stop("Fast Unlock ZF != 0");
 892   bind(zf_correct);
 893 #endif
 894   // C2 uses the value of ZF to determine the continuation.
 895 }
 896 
 897 //-------------------------------------------------------------------------------------------
 898 // Generic instructions support for use in .ad files C2 code generation
 899 
 900 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 901   if (dst != src) {
 902     movdqu(dst, src);
 903   }
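       // Abs clears the IEEE-754 sign bit (AND with the sign mask); Neg flips it
       // (XOR with the sign-flip constant).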
 904   if (opcode == Op_AbsVD) {
 905     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 906   } else {
 907     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 908     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 909   }
 910 }
 911 
 912 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 913   if (opcode == Op_AbsVD) {
 914     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 915   } else {
 916     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 917     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 918   }
 919 }
 920 
 921 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 922   if (dst != src) {
 923     movdqu(dst, src);
 924   }
 925   if (opcode == Op_AbsVF) {
 926     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 927   } else {
 928     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 929     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 930   }
 931 }
 932 
 933 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 934   if (opcode == Op_AbsVF) {
 935     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 936   } else {
 937     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 938     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 939   }
 940 }
 941 
 942 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 943   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 944   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 945 
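       // T_LONG has no pminsq/pmaxsq below AVX-512, so the result is selected with
       // blendvpd, whose non-AVX (SSE4.1) form uses xmm0 as the implicit mask;
       // hence the tmp == xmm0 requirement in the T_LONG branches below.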
 946   if (opcode == Op_MinV) {
 947     if (elem_bt == T_BYTE) {
 948       pminsb(dst, src);
 949     } else if (elem_bt == T_SHORT) {
 950       pminsw(dst, src);
 951     } else if (elem_bt == T_INT) {
 952       pminsd(dst, src);
 953     } else {
 954       assert(elem_bt == T_LONG, "required");
 955       assert(tmp == xmm0, "required");
 956       assert_different_registers(dst, src, tmp);
 957       movdqu(xmm0, dst);
 958       pcmpgtq(xmm0, src);
 959       blendvpd(dst, src);  // xmm0 as mask
 960     }
 961   } else { // opcode == Op_MaxV
 962     if (elem_bt == T_BYTE) {
 963       pmaxsb(dst, src);
 964     } else if (elem_bt == T_SHORT) {
 965       pmaxsw(dst, src);
 966     } else if (elem_bt == T_INT) {
 967       pmaxsd(dst, src);
 968     } else {
 969       assert(elem_bt == T_LONG, "required");
 970       assert(tmp == xmm0, "required");
 971       assert_different_registers(dst, src, tmp);
 972       movdqu(xmm0, src);
 973       pcmpgtq(xmm0, dst);
 974       blendvpd(dst, src);  // xmm0 as mask
 975     }
 976   }
 977 }
 978 
 979 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 980                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 981                                  int vlen_enc) {
 982   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 983 
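       // vpminsq/vpmaxsq require AVX-512 (plus AVX-512VL for vectors narrower than
       // 512 bits); otherwise the long min/max is emulated below with a signed
       // compare followed by a blend.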
 984   if (opcode == Op_MinV) {
 985     if (elem_bt == T_BYTE) {
 986       vpminsb(dst, src1, src2, vlen_enc);
 987     } else if (elem_bt == T_SHORT) {
 988       vpminsw(dst, src1, src2, vlen_enc);
 989     } else if (elem_bt == T_INT) {
 990       vpminsd(dst, src1, src2, vlen_enc);
 991     } else {
 992       assert(elem_bt == T_LONG, "required");
 993       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 994         vpminsq(dst, src1, src2, vlen_enc);
 995       } else {
 996         assert_different_registers(dst, src1, src2);
 997         vpcmpgtq(dst, src1, src2, vlen_enc);
 998         vblendvpd(dst, src1, src2, dst, vlen_enc);
 999       }
1000     }
1001   } else { // opcode == Op_MaxV
1002     if (elem_bt == T_BYTE) {
1003       vpmaxsb(dst, src1, src2, vlen_enc);
1004     } else if (elem_bt == T_SHORT) {
1005       vpmaxsw(dst, src1, src2, vlen_enc);
1006     } else if (elem_bt == T_INT) {
1007       vpmaxsd(dst, src1, src2, vlen_enc);
1008     } else {
1009       assert(elem_bt == T_LONG, "required");
1010       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1011         vpmaxsq(dst, src1, src2, vlen_enc);
1012       } else {
1013         assert_different_registers(dst, src1, src2);
1014         vpcmpgtq(dst, src1, src2, vlen_enc);
1015         vblendvpd(dst, src2, src1, dst, vlen_enc);
1016       }
1017     }
1018   }
1019 }
1020 
1021 // Float/Double min max
1022 
1023 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1024                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1025                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1026                                    int vlen_enc) {
1027   assert(UseAVX > 0, "required");
1028   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1029          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1030   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1031   assert_different_registers(a, tmp, atmp, btmp);
1032   assert_different_registers(b, tmp, atmp, btmp);
1033 
1034   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1035   bool is_double_word = is_double_word_type(elem_bt);
1036 
1037   /* Note on 'non-obvious' assembly sequence:
1038    *
1039    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1040    * and Java on how they handle floats:
1041    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1042    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1043    *
1044    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1045    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1046    *                (only useful when signs differ, noop otherwise)
1047    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1048    *
1049    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1050    *   btmp = (b < +0.0) ? a : b
1051    *   atmp = (b < +0.0) ? b : a
1052    *   Tmp  = Max_Float(atmp , btmp)
1053    *   Res  = (atmp == NaN) ? atmp : Tmp
1054    */
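       // One concrete lane for Op_MaxV on floats (illustrative only): a = -0.0f,
       // b = +0.0f. b is not negative, so atmp = a = -0.0f and btmp = b = +0.0f;
       // vmaxps returns its second operand (+0.0f) on an equal compare, matching
       // Java's Math.max(-0.0f, +0.0f) == +0.0f.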
1055 
1056   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1057   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1058   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1059   XMMRegister mask;
1060 
1061   if (!is_double_word && is_min) {
1062     mask = a;
1063     vblend = &MacroAssembler::vblendvps;
1064     vmaxmin = &MacroAssembler::vminps;
1065     vcmp = &MacroAssembler::vcmpps;
1066   } else if (!is_double_word && !is_min) {
1067     mask = b;
1068     vblend = &MacroAssembler::vblendvps;
1069     vmaxmin = &MacroAssembler::vmaxps;
1070     vcmp = &MacroAssembler::vcmpps;
1071   } else if (is_double_word && is_min) {
1072     mask = a;
1073     vblend = &MacroAssembler::vblendvpd;
1074     vmaxmin = &MacroAssembler::vminpd;
1075     vcmp = &MacroAssembler::vcmppd;
1076   } else {
1077     assert(is_double_word && !is_min, "sanity");
1078     mask = b;
1079     vblend = &MacroAssembler::vblendvpd;
1080     vmaxmin = &MacroAssembler::vmaxpd;
1081     vcmp = &MacroAssembler::vcmppd;
1082   }
1083 
1084   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1085   XMMRegister maxmin, scratch;
1086   if (dst == btmp) {
1087     maxmin = btmp;
1088     scratch = tmp;
1089   } else {
1090     maxmin = tmp;
1091     scratch = btmp;
1092   }
1093 
1094   bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1095   if (precompute_mask && !is_double_word) {
1096     vpsrad(tmp, mask, 32, vlen_enc);
1097     mask = tmp;
1098   } else if (precompute_mask && is_double_word) {
1099     vpxor(tmp, tmp, tmp, vlen_enc);
1100     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1101     mask = tmp;
1102   }
1103 
1104   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1105   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1106   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1107   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1108   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1109 }
1110 
1111 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1112                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1113                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1114                                     int vlen_enc) {
1115   assert(UseAVX > 2, "required");
1116   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1117          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1118   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1119   assert_different_registers(dst, a, atmp, btmp);
1120   assert_different_registers(dst, b, atmp, btmp);
1121 
1122   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1123   bool is_double_word = is_double_word_type(elem_bt);
1124   bool merge = true;
1125 
1126   if (!is_double_word && is_min) {
1127     evpmovd2m(ktmp, a, vlen_enc);
1128     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1129     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1130     vminps(dst, atmp, btmp, vlen_enc);
1131     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1132     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1133   } else if (!is_double_word && !is_min) {
1134     evpmovd2m(ktmp, b, vlen_enc);
1135     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1136     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1137     vmaxps(dst, atmp, btmp, vlen_enc);
1138     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1139     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1140   } else if (is_double_word && is_min) {
1141     evpmovq2m(ktmp, a, vlen_enc);
1142     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1143     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1144     vminpd(dst, atmp, btmp, vlen_enc);
1145     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1146     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1147   } else {
1148     assert(is_double_word && !is_min, "sanity");
1149     evpmovq2m(ktmp, b, vlen_enc);
1150     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1151     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1152     vmaxpd(dst, atmp, btmp, vlen_enc);
1153     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1154     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1155   }
1156 }
1157 
1158 // Float/Double signum
1159 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1160   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1161 
1162   Label DONE_LABEL;
1163 
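       // ucomiss/ucomisd compare dst against zero: equality sets ZF, an unordered
       // (NaN) compare sets PF, and 'above' means CF == 0 and ZF == 0; the branches
       // below map those flags onto signum's special cases.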
1164   if (opcode == Op_SignumF) {
1165     assert(UseSSE > 0, "required");
1166     ucomiss(dst, zero);
1167     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1168     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1169     movflt(dst, one);
1170     jcc(Assembler::above, DONE_LABEL);
1171     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1172   } else if (opcode == Op_SignumD) {
1173     assert(UseSSE > 1, "required");
1174     ucomisd(dst, zero);
1175     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1176     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1177     movdbl(dst, one);
1178     jcc(Assembler::above, DONE_LABEL);
1179     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1180   }
1181 
1182   bind(DONE_LABEL);
1183 }
1184 
1185 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1186   if (sign) {
1187     pmovsxbw(dst, src);
1188   } else {
1189     pmovzxbw(dst, src);
1190   }
1191 }
1192 
1193 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1194   if (sign) {
1195     vpmovsxbw(dst, src, vector_len);
1196   } else {
1197     vpmovzxbw(dst, src, vector_len);
1198   }
1199 }
1200 
1201 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1202   if (sign) {
1203     vpmovsxbd(dst, src, vector_len);
1204   } else {
1205     vpmovzxbd(dst, src, vector_len);
1206   }
1207 }
1208 
1209 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1210   if (sign) {
1211     vpmovsxwd(dst, src, vector_len);
1212   } else {
1213     vpmovzxwd(dst, src, vector_len);
1214   }
1215 }
1216 
1217 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1218                                      int shift, int vector_len) {
1219   if (opcode == Op_RotateLeftV) {
1220     if (etype == T_INT) {
1221       evprold(dst, src, shift, vector_len);
1222     } else {
1223       assert(etype == T_LONG, "expected type T_LONG");
1224       evprolq(dst, src, shift, vector_len);
1225     }
1226   } else {
1227     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1228     if (etype == T_INT) {
1229       evprord(dst, src, shift, vector_len);
1230     } else {
1231       assert(etype == T_LONG, "expected type T_LONG");
1232       evprorq(dst, src, shift, vector_len);
1233     }
1234   }
1235 }
1236 
1237 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1238                                      XMMRegister shift, int vector_len) {
1239   if (opcode == Op_RotateLeftV) {
1240     if (etype == T_INT) {
1241       evprolvd(dst, src, shift, vector_len);
1242     } else {
1243       assert(etype == T_LONG, "expected type T_LONG");
1244       evprolvq(dst, src, shift, vector_len);
1245     }
1246   } else {
1247     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1248     if (etype == T_INT) {
1249       evprorvd(dst, src, shift, vector_len);
1250     } else {
1251       assert(etype == T_LONG, "expected type T_LONG");
1252       evprorvq(dst, src, shift, vector_len);
1253     }
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1258   if (opcode == Op_RShiftVI) {
1259     psrad(dst, shift);
1260   } else if (opcode == Op_LShiftVI) {
1261     pslld(dst, shift);
1262   } else {
1263     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1264     psrld(dst, shift);
1265   }
1266 }
1267 
1268 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1269   switch (opcode) {
1270     case Op_RShiftVI:  psrad(dst, shift); break;
1271     case Op_LShiftVI:  pslld(dst, shift); break;
1272     case Op_URShiftVI: psrld(dst, shift); break;
1273 
1274     default: assert(false, "%s", NodeClassNames[opcode]);
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1279   if (opcode == Op_RShiftVI) {
1280     vpsrad(dst, nds, shift, vector_len);
1281   } else if (opcode == Op_LShiftVI) {
1282     vpslld(dst, nds, shift, vector_len);
1283   } else {
1284     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1285     vpsrld(dst, nds, shift, vector_len);
1286   }
1287 }
1288 
1289 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1290   switch (opcode) {
1291     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1292     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1293     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1294 
1295     default: assert(false, "%s", NodeClassNames[opcode]);
1296   }
1297 }
1298 
1299 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1300   switch (opcode) {
1301     case Op_RShiftVB:  // fall-through
1302     case Op_RShiftVS:  psraw(dst, shift); break;
1303 
1304     case Op_LShiftVB:  // fall-through
1305     case Op_LShiftVS:  psllw(dst, shift);   break;
1306 
1307     case Op_URShiftVS: // fall-through
1308     case Op_URShiftVB: psrlw(dst, shift);  break;
1309 
1310     default: assert(false, "%s", NodeClassNames[opcode]);
1311   }
1312 }
1313 
1314 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1315   switch (opcode) {
1316     case Op_RShiftVB:  // fall-through
1317     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1318 
1319     case Op_LShiftVB:  // fall-through
1320     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1321 
1322     case Op_URShiftVS: // fall-through
1323     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1324 
1325     default: assert(false, "%s", NodeClassNames[opcode]);
1326   }
1327 }
1328 
1329 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1330   switch (opcode) {
1331     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1332     case Op_LShiftVL:  psllq(dst, shift); break;
1333     case Op_URShiftVL: psrlq(dst, shift); break;
1334 
1335     default: assert(false, "%s", NodeClassNames[opcode]);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1340   if (opcode == Op_RShiftVL) {
1341     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1342   } else if (opcode == Op_LShiftVL) {
1343     psllq(dst, shift);
1344   } else {
1345     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1346     psrlq(dst, shift);
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1351   switch (opcode) {
1352     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1353     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1354     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1355 
1356     default: assert(false, "%s", NodeClassNames[opcode]);
1357   }
1358 }
1359 
1360 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1361   if (opcode == Op_RShiftVL) {
1362     evpsraq(dst, nds, shift, vector_len);
1363   } else if (opcode == Op_LShiftVL) {
1364     vpsllq(dst, nds, shift, vector_len);
1365   } else {
1366     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1367     vpsrlq(dst, nds, shift, vector_len);
1368   }
1369 }
1370 
1371 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1372   switch (opcode) {
1373     case Op_RShiftVB:  // fall-through
1374     case Op_RShiftVS:  // fall-through
1375     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1376 
1377     case Op_LShiftVB:  // fall-through
1378     case Op_LShiftVS:  // fall-through
1379     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1380 
1381     case Op_URShiftVB: // fall-through
1382     case Op_URShiftVS: // fall-through
1383     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1384 
1385     default: assert(false, "%s", NodeClassNames[opcode]);
1386   }
1387 }
1388 
1389 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1390   switch (opcode) {
1391     case Op_RShiftVB:  // fall-through
1392     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1393 
1394     case Op_LShiftVB:  // fall-through
1395     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1396 
1397     case Op_URShiftVB: // fall-through
1398     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1399 
1400     default: assert(false, "%s", NodeClassNames[opcode]);
1401   }
1402 }
1403 
1404 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1405   assert(UseAVX >= 2, "required");
1406   switch (opcode) {
1407     case Op_RShiftVL: {
1408       if (UseAVX > 2) {
1409         assert(tmp == xnoreg, "not used");
1410         if (!VM_Version::supports_avx512vl()) {
1411           vlen_enc = Assembler::AVX_512bit;
1412         }
1413         evpsravq(dst, src, shift, vlen_enc);
1414       } else {
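             // No variable 64-bit arithmetic right shift below AVX-512, so emulate it:
             //   sra(x, s) == (srl(x, s) ^ m) - m, where m = sign_mask >>> s and
             // sign_mask is 0x8000000000000000 per lane; both the value and the sign
             // mask are shifted logically, then the xor/sub restores the sign bits.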
1415         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1416         vpsrlvq(dst, src, shift, vlen_enc);
1417         vpsrlvq(tmp, tmp, shift, vlen_enc);
1418         vpxor(dst, dst, tmp, vlen_enc);
1419         vpsubq(dst, dst, tmp, vlen_enc);
1420       }
1421       break;
1422     }
1423     case Op_LShiftVL: {
1424       assert(tmp == xnoreg, "not used");
1425       vpsllvq(dst, src, shift, vlen_enc);
1426       break;
1427     }
1428     case Op_URShiftVL: {
1429       assert(tmp == xnoreg, "not used");
1430       vpsrlvq(dst, src, shift, vlen_enc);
1431       break;
1432     }
1433     default: assert(false, "%s", NodeClassNames[opcode]);
1434   }
1435 }
1436 
1437 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1438 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1439   assert(opcode == Op_LShiftVB ||
1440          opcode == Op_RShiftVB ||
1441          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1442   bool sign = (opcode != Op_URShiftVB);
1443   assert(vector_len == 0, "required");
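       // Widen the 8 byte elements and their shift counts to dwords, do the
       // variable dword shift, mask the results back to byte range, and pack the
       // two 128-bit halves into word lanes of dst.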
1444   vextendbd(sign, dst, src, 1);
1445   vpmovzxbd(vtmp, shift, 1);
1446   varshiftd(opcode, dst, dst, vtmp, 1);
1447   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1448   vextracti128_high(vtmp, dst);
1449   vpackusdw(dst, dst, vtmp, 0);
1450 }
1451 
1452 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1453 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1454   assert(opcode == Op_LShiftVB ||
1455          opcode == Op_RShiftVB ||
1456          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1457   bool sign = (opcode != Op_URShiftVB);
1458   int ext_vector_len = vector_len + 1;
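       // Widen bytes and shift counts to words at twice the input vector length,
       // do the variable word shift, mask back to byte range, then pack the two
       // halves into byte lanes; results wider than 128 bits need a vpermq to fix
       // the lane interleave left by the pack.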
1459   vextendbw(sign, dst, src, ext_vector_len);
1460   vpmovzxbw(vtmp, shift, ext_vector_len);
1461   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1462   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1463   if (vector_len == 0) {
1464     vextracti128_high(vtmp, dst);
1465     vpackuswb(dst, dst, vtmp, vector_len);
1466   } else {
1467     vextracti64x4_high(vtmp, dst);
1468     vpackuswb(dst, dst, vtmp, vector_len);
1469     vpermq(dst, dst, 0xD8, vector_len);
1470   }
1471 }
1472 
1473 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1474   switch(typ) {
1475     case T_BYTE:
1476       pinsrb(dst, val, idx);
1477       break;
1478     case T_SHORT:
1479       pinsrw(dst, val, idx);
1480       break;
1481     case T_INT:
1482       pinsrd(dst, val, idx);
1483       break;
1484     case T_LONG:
1485       pinsrq(dst, val, idx);
1486       break;
1487     default:
1488       assert(false,"Should not reach here.");
1489       break;
1490   }
1491 }
1492 
1493 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1494   switch(typ) {
1495     case T_BYTE:
1496       vpinsrb(dst, src, val, idx);
1497       break;
1498     case T_SHORT:
1499       vpinsrw(dst, src, val, idx);
1500       break;
1501     case T_INT:
1502       vpinsrd(dst, src, val, idx);
1503       break;
1504     case T_LONG:
1505       vpinsrq(dst, src, val, idx);
1506       break;
1507     default:
1508       assert(false,"Should not reach here.");
1509       break;
1510   }
1511 }
1512 
1513 #ifdef _LP64
1514 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1515                                                 XMMRegister dst, Register base,
1516                                                 Register idx_base,
1517                                                 Register offset, Register mask,
1518                                                 Register mask_idx, Register rtmp,
1519                                                 int vlen_enc) {
1520   vpxor(dst, dst, dst, vlen_enc);
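       // Gather one 64-bit slice (4 shorts or 8 bytes) element by element; a lane
       // is loaded only if its bit in 'mask' (tested with btq) is set, otherwise it
       // keeps the zero written above.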
1521   if (elem_bt == T_SHORT) {
1522     for (int i = 0; i < 4; i++) {
1523       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1524       Label skip_load;
1525       btq(mask, mask_idx);
1526       jccb(Assembler::carryClear, skip_load);
1527       movl(rtmp, Address(idx_base, i * 4));
1528       if (offset != noreg) {
1529         addl(rtmp, offset);
1530       }
1531       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1532       bind(skip_load);
1533       incq(mask_idx);
1534     }
1535   } else {
1536     assert(elem_bt == T_BYTE, "");
1537     for (int i = 0; i < 8; i++) {
1538       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1539       Label skip_load;
1540       btq(mask, mask_idx);
1541       jccb(Assembler::carryClear, skip_load);
1542       movl(rtmp, Address(idx_base, i * 4));
1543       if (offset != noreg) {
1544         addl(rtmp, offset);
1545       }
1546       pinsrb(dst, Address(base, rtmp), i);
1547       bind(skip_load);
1548       incq(mask_idx);
1549     }
1550   }
1551 }
1552 #endif // _LP64
1553 
1554 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1555                                          Register base, Register idx_base,
1556                                          Register offset, Register rtmp,
1557                                          int vlen_enc) {
1558   vpxor(dst, dst, dst, vlen_enc);
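       // Gather one 64-bit slice (4 shorts or 8 bytes), one element at a time.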
1559   if (elem_bt == T_SHORT) {
1560     for (int i = 0; i < 4; i++) {
1561       // dst[i] = src[offset + idx_base[i]]
1562       movl(rtmp, Address(idx_base, i * 4));
1563       if (offset != noreg) {
1564         addl(rtmp, offset);
1565       }
1566       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1567     }
1568   } else {
1569     assert(elem_bt == T_BYTE, "");
1570     for (int i = 0; i < 8; i++) {
1571       // dst[i] = src[offset + idx_base[i]]
1572       movl(rtmp, Address(idx_base, i * 4));
1573       if (offset != noreg) {
1574         addl(rtmp, offset);
1575       }
1576       pinsrb(dst, Address(base, rtmp), i);
1577     }
1578   }
1579 }
1580 
1581 /*
1582  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1583  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1584  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1585  * permutation to place the slice into the appropriate vector lane
1586  * locations in the destination vector. The following pseudo code describes the
1587  * algorithm in detail:
1588  *
1589  * DST_VEC = ZERO_VEC
1590  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1591  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1592  * FOREACH_ITER:
1593  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1594  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1595  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1596  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1597  *
1598  * With each iteration, the doubleword permute indices (0, 1) corresponding
1599  * to the gathered quadword get right shifted by two lane positions.
1600  *
1601  */
1602 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1603                                         Register base, Register idx_base,
1604                                         Register offset, Register mask,
1605                                         XMMRegister xtmp1, XMMRegister xtmp2,
1606                                         XMMRegister temp_dst, Register rtmp,
1607                                         Register mask_idx, Register length,
1608                                         int vector_len, int vlen_enc) {
1609   Label GATHER8_LOOP;
1610   assert(is_subword_type(elem_ty), "");
1611   movl(length, vector_len);
1612   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1613   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
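       // Build TWO_VEC in xtmp2: all-ones is -1, 0 - (-1) = 1 per lane, then << 1 gives 2.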
1614   vallones(xtmp2, vlen_enc);
1615   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1616   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1617   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1618 
1619   bind(GATHER8_LOOP);
1620     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1621     if (mask == noreg) {
1622       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1623     } else {
1624       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1625     }
1626     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1627     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1628     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1629     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1630     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1631     vpor(dst, dst, temp_dst, vlen_enc);
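         // Advance to the next slice: idx_base holds int indices, so step 32 bytes
         // (8 indices) for T_BYTE or 16 bytes (4 indices) for T_SHORT, and decrement
         // the remaining element count by 8 or 4 accordingly.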
1632     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1633     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1634     jcc(Assembler::notEqual, GATHER8_LOOP);
1635 }
1636 
1637 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1638   switch(typ) {
1639     case T_INT:
1640       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1641       break;
1642     case T_FLOAT:
1643       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1644       break;
1645     case T_LONG:
1646       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1647       break;
1648     case T_DOUBLE:
1649       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1650       break;
1651     default:
1652       assert(false,"Should not reach here.");
1653       break;
1654   }
1655 }
1656 
1657 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1658   switch(typ) {
1659     case T_INT:
1660       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1661       break;
1662     case T_FLOAT:
1663       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1664       break;
1665     case T_LONG:
1666       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1667       break;
1668     case T_DOUBLE:
1669       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1670       break;
1671     default:
1672       assert(false,"Should not reach here.");
1673       break;
1674   }
1675 }
1676 
1677 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1678   switch(typ) {
1679     case T_INT:
1680       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1681       break;
1682     case T_FLOAT:
1683       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1684       break;
1685     case T_LONG:
1686       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1687       break;
1688     case T_DOUBLE:
1689       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1690       break;
1691     default:
1692       assert(false,"Should not reach here.");
1693       break;
1694   }
1695 }
1696 
1697 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
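       // The incoming mask holds 0 or 1 per byte; 0 - mask turns that into 0 or -1,
       // and the sign extension widens the all-zeros/all-ones pattern to the element size.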
1698   if (vlen_in_bytes <= 16) {
1699     pxor (dst, dst);
1700     psubb(dst, src);
1701     switch (elem_bt) {
1702       case T_BYTE:   /* nothing to do */ break;
1703       case T_SHORT:  pmovsxbw(dst, dst); break;
1704       case T_INT:    pmovsxbd(dst, dst); break;
1705       case T_FLOAT:  pmovsxbd(dst, dst); break;
1706       case T_LONG:   pmovsxbq(dst, dst); break;
1707       case T_DOUBLE: pmovsxbq(dst, dst); break;
1708 
1709       default: assert(false, "%s", type2name(elem_bt));
1710     }
1711   } else {
1712     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1713     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1714 
1715     vpxor (dst, dst, dst, vlen_enc);
1716     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1717 
1718     switch (elem_bt) {
1719       case T_BYTE:   /* nothing to do */            break;
1720       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1721       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1722       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1723       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1724       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1725 
1726       default: assert(false, "%s", type2name(elem_bt));
1727     }
1728   }
1729 }
1730 
1731 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1732   if (novlbwdq) {
1733     vpmovsxbd(xtmp, src, vlen_enc);
1734     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1735             Assembler::eq, true, vlen_enc, noreg);
1736   } else {
1737     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1738     vpsubb(xtmp, xtmp, src, vlen_enc);
1739     evpmovb2m(dst, xtmp, vlen_enc);
1740   }
1741 }
1742 
1743 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1744   switch (vlen_in_bytes) {
1745     case 4:  movdl(dst, src);   break;
1746     case 8:  movq(dst, src);    break;
1747     case 16: movdqu(dst, src);  break;
1748     case 32: vmovdqu(dst, src); break;
1749     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1750     default: ShouldNotReachHere();
1751   }
1752 }
1753 
1754 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1755   assert(rscratch != noreg || always_reachable(src), "missing");
1756 
1757   if (reachable(src)) {
1758     load_vector(dst, as_Address(src), vlen_in_bytes);
1759   } else {
1760     lea(rscratch, src);
1761     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1762   }
1763 }
1764 
1765 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1766   int vlen_enc = vector_length_encoding(vlen);
1767   if (VM_Version::supports_avx()) {
1768     if (bt == T_LONG) {
1769       if (VM_Version::supports_avx2()) {
1770         vpbroadcastq(dst, src, vlen_enc);
1771       } else {
1772         vmovddup(dst, src, vlen_enc);
1773       }
1774     } else if (bt == T_DOUBLE) {
1775       if (vlen_enc != Assembler::AVX_128bit) {
1776         vbroadcastsd(dst, src, vlen_enc, noreg);
1777       } else {
1778         vmovddup(dst, src, vlen_enc);
1779       }
1780     } else {
1781       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1782         vpbroadcastd(dst, src, vlen_enc);
1783       } else {
1784         vbroadcastss(dst, src, vlen_enc);
1785       }
1786     }
1787   } else if (VM_Version::supports_sse3()) {
1788     movddup(dst, src);
1789   } else {
1790     movq(dst, src);
1791     if (vlen == 16) {
1792       punpcklqdq(dst, dst);
1793     }
1794   }
1795 }
1796 
1797 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1798   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1799   int offset = exact_log2(type2aelembytes(bt)) << 6;
1800   if (is_floating_point_type(bt)) {
1801     offset += 128;
1802   }
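       // e.g. T_SHORT -> 64, T_LONG -> 192, T_FLOAT -> (2 << 6) + 128 = 256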
1803   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1804   load_vector(dst, addr, vlen_in_bytes);
1805 }
1806 
1807 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
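     // General scheme: repeatedly fold the upper half of the vector into the lower
     // half (extract/shuffle + the reduction op) until one element remains; the
     // integral variants then combine that element with the scalar value in src1.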
1808 
1809 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1810   int vector_len = Assembler::AVX_128bit;
1811 
1812   switch (opcode) {
1813     case Op_AndReductionV:  pand(dst, src); break;
1814     case Op_OrReductionV:   por (dst, src); break;
1815     case Op_XorReductionV:  pxor(dst, src); break;
1816     case Op_MinReductionV:
1817       switch (typ) {
1818         case T_BYTE:        pminsb(dst, src); break;
1819         case T_SHORT:       pminsw(dst, src); break;
1820         case T_INT:         pminsd(dst, src); break;
1821         case T_LONG:        assert(UseAVX > 2, "required");
1822                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1823         default:            assert(false, "wrong type");
1824       }
1825       break;
1826     case Op_MaxReductionV:
1827       switch (typ) {
1828         case T_BYTE:        pmaxsb(dst, src); break;
1829         case T_SHORT:       pmaxsw(dst, src); break;
1830         case T_INT:         pmaxsd(dst, src); break;
1831         case T_LONG:        assert(UseAVX > 2, "required");
1832                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1833         default:            assert(false, "wrong type");
1834       }
1835       break;
1836     case Op_AddReductionVF: addss(dst, src); break;
1837     case Op_AddReductionVD: addsd(dst, src); break;
1838     case Op_AddReductionVI:
1839       switch (typ) {
1840         case T_BYTE:        paddb(dst, src); break;
1841         case T_SHORT:       paddw(dst, src); break;
1842         case T_INT:         paddd(dst, src); break;
1843         default:            assert(false, "wrong type");
1844       }
1845       break;
1846     case Op_AddReductionVL: paddq(dst, src); break;
1847     case Op_MulReductionVF: mulss(dst, src); break;
1848     case Op_MulReductionVD: mulsd(dst, src); break;
1849     case Op_MulReductionVI:
1850       switch (typ) {
1851         case T_SHORT:       pmullw(dst, src); break;
1852         case T_INT:         pmulld(dst, src); break;
1853         default:            assert(false, "wrong type");
1854       }
1855       break;
1856     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1857                             evpmullq(dst, dst, src, vector_len); break;
1858     default:                assert(false, "wrong opcode");
1859   }
1860 }
1861 
1862 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1863   int vector_len = Assembler::AVX_256bit;
1864 
1865   switch (opcode) {
1866     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1867     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1868     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1869     case Op_MinReductionV:
1870       switch (typ) {
1871         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1872         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1873         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1874         case T_LONG:        assert(UseAVX > 2, "required");
1875                             vpminsq(dst, src1, src2, vector_len); break;
1876         default:            assert(false, "wrong type");
1877       }
1878       break;
1879     case Op_MaxReductionV:
1880       switch (typ) {
1881         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1882         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1883         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1884         case T_LONG:        assert(UseAVX > 2, "required");
1885                             vpmaxsq(dst, src1, src2, vector_len); break;
1886         default:            assert(false, "wrong type");
1887       }
1888       break;
1889     case Op_AddReductionVI:
1890       switch (typ) {
1891         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1892         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1893         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1894         default:            assert(false, "wrong type");
1895       }
1896       break;
1897     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1898     case Op_MulReductionVI:
1899       switch (typ) {
1900         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1901         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1902         default:            assert(false, "wrong type");
1903       }
1904       break;
1905     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1906     default:                assert(false, "wrong opcode");
1907   }
1908 }
1909 
1910 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1911                                   XMMRegister dst, XMMRegister src,
1912                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1913   switch (opcode) {
1914     case Op_AddReductionVF:
1915     case Op_MulReductionVF:
1916       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1917       break;
1918 
1919     case Op_AddReductionVD:
1920     case Op_MulReductionVD:
1921       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1922       break;
1923 
1924     default: assert(false, "wrong opcode");
1925   }
1926 }
1927 
1928 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1929                              Register dst, Register src1, XMMRegister src2,
1930                              XMMRegister vtmp1, XMMRegister vtmp2) {
1931   switch (vlen) {
1932     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1933     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1934     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1935     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1936 
1937     default: assert(false, "wrong vector length");
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1942                              Register dst, Register src1, XMMRegister src2,
1943                              XMMRegister vtmp1, XMMRegister vtmp2) {
1944   switch (vlen) {
1945     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949 
1950     default: assert(false, "wrong vector length");
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1955                              Register dst, Register src1, XMMRegister src2,
1956                              XMMRegister vtmp1, XMMRegister vtmp2) {
1957   switch (vlen) {
1958     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962 
1963     default: assert(false, "wrong vector length");
1964   }
1965 }
1966 
1967 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1968                              Register dst, Register src1, XMMRegister src2,
1969                              XMMRegister vtmp1, XMMRegister vtmp2) {
1970   switch (vlen) {
1971     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975 
1976     default: assert(false, "wrong vector length");
1977   }
1978 }
1979 
1980 #ifdef _LP64
1981 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1982                              Register dst, Register src1, XMMRegister src2,
1983                              XMMRegister vtmp1, XMMRegister vtmp2) {
1984   switch (vlen) {
1985     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988 
1989     default: assert(false, "wrong vector length");
1990   }
1991 }
1992 #endif // _LP64
1993 
1994 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   switch (vlen) {
1996     case 2:
1997       assert(vtmp2 == xnoreg, "");
1998       reduce2F(opcode, dst, src, vtmp1);
1999       break;
2000     case 4:
2001       assert(vtmp2 == xnoreg, "");
2002       reduce4F(opcode, dst, src, vtmp1);
2003       break;
2004     case 8:
2005       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2006       break;
2007     case 16:
2008       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2009       break;
2010     default: assert(false, "wrong vector length");
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2015   switch (vlen) {
2016     case 2:
2017       assert(vtmp2 == xnoreg, "");
2018       reduce2D(opcode, dst, src, vtmp1);
2019       break;
2020     case 4:
2021       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2022       break;
2023     case 8:
2024       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2025       break;
2026     default: assert(false, "wrong vector length");
2027   }
2028 }
2029 
2030 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
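       // For add, a horizontal add folds the two lanes directly; otherwise shuffle
       // lane 1 down and apply the op. Finally fold in the scalar value from src1.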
2031   if (opcode == Op_AddReductionVI) {
2032     if (vtmp1 != src2) {
2033       movdqu(vtmp1, src2);
2034     }
2035     phaddd(vtmp1, vtmp1);
2036   } else {
2037     pshufd(vtmp1, src2, 0x1);
2038     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2039   }
2040   movdl(vtmp2, src1);
2041   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2042   movdl(dst, vtmp1);
2043 }
2044 
2045 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2046   if (opcode == Op_AddReductionVI) {
2047     if (vtmp1 != src2) {
2048       movdqu(vtmp1, src2);
2049     }
2050     phaddd(vtmp1, src2);
2051     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2052   } else {
2053     pshufd(vtmp2, src2, 0xE);
2054     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2055     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2056   }
2057 }
2058 
2059 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060   if (opcode == Op_AddReductionVI) {
2061     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2062     vextracti128_high(vtmp2, vtmp1);
2063     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2064     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2065   } else {
2066     vextracti128_high(vtmp1, src2);
2067     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2068     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2069   }
2070 }
2071 
2072 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2073   vextracti64x4_high(vtmp2, src2);
2074   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2075   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2076 }
2077 
2078 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2079   pshufd(vtmp2, src2, 0x1);
2080   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2081   movdqu(vtmp1, vtmp2);
2082   psrldq(vtmp1, 2);
2083   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2084   movdqu(vtmp2, vtmp1);
2085   psrldq(vtmp2, 1);
2086   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2087   movdl(vtmp2, src1);
2088   pmovsxbd(vtmp1, vtmp1);
2089   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2090   pextrb(dst, vtmp1, 0x0);
2091   movsbl(dst, dst);
2092 }
2093 
2094 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2095   pshufd(vtmp1, src2, 0xE);
2096   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2097   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2098 }
2099 
2100 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2101   vextracti128_high(vtmp2, src2);
2102   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2103   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2104 }
2105 
2106 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   vextracti64x4_high(vtmp1, src2);
2108   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2109   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2110 }
2111 
2112 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2113   pmovsxbw(vtmp2, src2);
2114   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2115 }
2116 
2117 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2118   if (UseAVX > 1) {
2119     int vector_len = Assembler::AVX_256bit;
2120     vpmovsxbw(vtmp1, src2, vector_len);
2121     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2122   } else {
2123     pmovsxbw(vtmp2, src2);
2124     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
         // Fold in the upper 8 bytes: bring them down to the low half and sign-extend.
2125     pshufd(vtmp2, src2, 0xE);
2126     pmovsxbw(vtmp2, vtmp2);
2127     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2128   }
2129 }
2130 
2131 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2133     int vector_len = Assembler::AVX_512bit;
2134     vpmovsxbw(vtmp1, src2, vector_len);
2135     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2136   } else {
2137     assert(UseAVX >= 2,"Should not reach here.");
2138     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2139     vextracti128_high(vtmp2, src2);
2140     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2141   }
2142 }
2143 
2144 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2145   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2146   vextracti64x4_high(vtmp2, src2);
2147   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2148 }
2149 
2150 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2151   if (opcode == Op_AddReductionVI) {
2152     if (vtmp1 != src2) {
2153       movdqu(vtmp1, src2);
2154     }
2155     phaddw(vtmp1, vtmp1);
2156     phaddw(vtmp1, vtmp1);
2157   } else {
2158     pshufd(vtmp2, src2, 0x1);
2159     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2160     movdqu(vtmp1, vtmp2);
2161     psrldq(vtmp1, 2);
2162     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2163   }
2164   movdl(vtmp2, src1);
2165   pmovsxwd(vtmp1, vtmp1);
2166   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2167   pextrw(dst, vtmp1, 0x0);
2168   movswl(dst, dst);
2169 }
2170 
2171 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172   if (opcode == Op_AddReductionVI) {
2173     if (vtmp1 != src2) {
2174       movdqu(vtmp1, src2);
2175     }
2176     phaddw(vtmp1, src2);
2177   } else {
2178     pshufd(vtmp1, src2, 0xE);
2179     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2180   }
2181   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2182 }
2183 
2184 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2185   if (opcode == Op_AddReductionVI) {
2186     int vector_len = Assembler::AVX_256bit;
2187     vphaddw(vtmp2, src2, src2, vector_len);
2188     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2189   } else {
2190     vextracti128_high(vtmp2, src2);
2191     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2192   }
2193   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2194 }
2195 
2196 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2197   int vector_len = Assembler::AVX_256bit;
2198   vextracti64x4_high(vtmp1, src2);
2199   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2200   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2201 }
2202 
2203 #ifdef _LP64
2204 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2205   pshufd(vtmp2, src2, 0xE);
2206   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2207   movdq(vtmp1, src1);
2208   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2209   movdq(dst, vtmp1);
2210 }
2211 
2212 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2213   vextracti128_high(vtmp1, src2);
2214   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2215   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2216 }
2217 
2218 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2219   vextracti64x4_high(vtmp2, src2);
2220   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2221   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2222 }
2223 
2224 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
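       // Build a mask with the low 'len' bits set: BZHI clears all bits of -1 at
       // positions >= len, then the result is moved into the destination k-register.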
2225   mov64(temp, -1L);
2226   bzhiq(temp, temp, len);
2227   kmovql(dst, temp);
2228 }
2229 #endif // _LP64
2230 
2231 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2232   reduce_operation_128(T_FLOAT, opcode, dst, src);
2233   pshufd(vtmp, src, 0x1);
2234   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2235 }
2236 
2237 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2238   reduce2F(opcode, dst, src, vtmp);
2239   pshufd(vtmp, src, 0x2);
2240   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2241   pshufd(vtmp, src, 0x3);
2242   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2243 }
2244 
2245 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2246   reduce4F(opcode, dst, src, vtmp2);
2247   vextractf128_high(vtmp2, src);
2248   reduce4F(opcode, dst, vtmp2, vtmp1);
2249 }
2250 
2251 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2253   vextracti64x4_high(vtmp1, src);
2254   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2255 }
2256 
2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2258   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2259   pshufd(vtmp, src, 0xE);
2260   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2261 }
2262 
2263 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2264   reduce2D(opcode, dst, src, vtmp2);
2265   vextractf128_high(vtmp2, src);
2266   reduce2D(opcode, dst, vtmp2, vtmp1);
2267 }
2268 
2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2270   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2271   vextracti64x4_high(vtmp1, src);
2272   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2273 }
2274 
2275 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2276   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2277 }
2278 
2279 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2280   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2281 }
2282 
2283 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2284                                  int vec_enc) {
2285   switch(elem_bt) {
2286     case T_INT:
2287     case T_FLOAT:
2288       vmaskmovps(dst, src, mask, vec_enc);
2289       break;
2290     case T_LONG:
2291     case T_DOUBLE:
2292       vmaskmovpd(dst, src, mask, vec_enc);
2293       break;
2294     default:
2295       fatal("Unsupported type %s", type2name(elem_bt));
2296       break;
2297   }
2298 }
2299 
2300 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2301                                  int vec_enc) {
2302   switch(elem_bt) {
2303     case T_INT:
2304     case T_FLOAT:
2305       vmaskmovps(dst, src, mask, vec_enc);
2306       break;
2307     case T_LONG:
2308     case T_DOUBLE:
2309       vmaskmovpd(dst, src, mask, vec_enc);
2310       break;
2311     default:
2312       fatal("Unsupported type %s", type2name(elem_bt));
2313       break;
2314   }
2315 }
2316 
2317 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2318                                           XMMRegister dst, XMMRegister src,
2319                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2320                                           XMMRegister xmm_0, XMMRegister xmm_1) {
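       // Fold the vector in half log2(vlen) times: extract the upper 256/128 bits
       // for the wide steps, then permute within 128 bits (imm 14 brings elements
       // 2,3 down, imm 1 brings element 1 down) and min/max the two halves each time.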
2321   const int permconst[] = {1, 14};
2322   XMMRegister wsrc = src;
2323   XMMRegister wdst = xmm_0;
2324   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2325 
2326   int vlen_enc = Assembler::AVX_128bit;
2327   if (vlen == 16) {
2328     vlen_enc = Assembler::AVX_256bit;
2329   }
2330 
2331   for (int i = log2(vlen) - 1; i >=0; i--) {
2332     if (i == 0 && !is_dst_valid) {
2333       wdst = dst;
2334     }
2335     if (i == 3) {
2336       vextracti64x4_high(wtmp, wsrc);
2337     } else if (i == 2) {
2338       vextracti128_high(wtmp, wsrc);
2339     } else { // i = [0,1]
2340       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2341     }
2342     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2343     wsrc = wdst;
2344     vlen_enc = Assembler::AVX_128bit;
2345   }
2346   if (is_dst_valid) {
2347     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2348   }
2349 }
2350 
2351 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2352                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2353                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2354   XMMRegister wsrc = src;
2355   XMMRegister wdst = xmm_0;
2356   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2357   int vlen_enc = Assembler::AVX_128bit;
2358   if (vlen == 8) {
2359     vlen_enc = Assembler::AVX_256bit;
2360   }
2361   for (int i = log2(vlen) - 1; i >=0; i--) {
2362     if (i == 0 && !is_dst_valid) {
2363       wdst = dst;
2364     }
2365     if (i == 1) {
2366       vextracti128_high(wtmp, wsrc);
2367     } else if (i == 2) {
2368       vextracti64x4_high(wtmp, wsrc);
2369     } else {
2370       assert(i == 0, "%d", i);
2371       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2372     }
2373     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2374     wsrc = wdst;
2375     vlen_enc = Assembler::AVX_128bit;
2376   }
2377   if (is_dst_valid) {
2378     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2379   }
2380 }
2381 
2382 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2383   switch (bt) {
2384     case T_BYTE:  pextrb(dst, src, idx); break;
2385     case T_SHORT: pextrw(dst, src, idx); break;
2386     case T_INT:   pextrd(dst, src, idx); break;
2387     case T_LONG:  pextrq(dst, src, idx); break;
2388 
2389     default:
2390       assert(false,"Should not reach here.");
2391       break;
2392   }
2393 }
2394 
2395 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2396   int esize =  type2aelembytes(typ);
2397   int elem_per_lane = 16/esize;
2398   int lane = elemindex / elem_per_lane;
2399   int eindex = elemindex % elem_per_lane;
2400 
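       // Return the 128-bit lane holding 'elemindex': lanes >= 2 need AVX-512
       // vextractf32x4, lane 1 needs AVX vextractf128, and lane 0 is src itself.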
2401   if (lane >= 2) {
2402     assert(UseAVX > 2, "required");
2403     vextractf32x4(dst, src, lane & 3);
2404     return dst;
2405   } else if (lane > 0) {
2406     assert(UseAVX > 0, "required");
2407     vextractf128(dst, src, lane);
2408     return dst;
2409   } else {
2410     return src;
2411   }
2412 }
2413 
2414 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2415   if (typ == T_BYTE) {
2416     movsbl(dst, dst);
2417   } else if (typ == T_SHORT) {
2418     movswl(dst, dst);
2419   }
2420 }
2421 
2422 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2423   int esize =  type2aelembytes(typ);
2424   int elem_per_lane = 16/esize;
2425   int eindex = elemindex % elem_per_lane;
2426   assert(is_integral_type(typ),"required");
2427 
2428   if (eindex == 0) {
2429     if (typ == T_LONG) {
2430       movq(dst, src);
2431     } else {
2432       movdl(dst, src);
2433       movsxl(typ, dst);
2434     }
2435   } else {
2436     extract(typ, dst, src, eindex);
2437     movsxl(typ, dst);
2438   }
2439 }
2440 
2441 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2442   int esize =  type2aelembytes(typ);
2443   int elem_per_lane = 16/esize;
2444   int eindex = elemindex % elem_per_lane;
2445   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2446 
2447   if (eindex == 0) {
2448     movq(dst, src);
2449   } else {
2450     if (typ == T_FLOAT) {
2451       if (UseAVX == 0) {
2452         movdqu(dst, src);
2453         shufps(dst, dst, eindex);
2454       } else {
2455         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2456       }
2457     } else {
2458       if (UseAVX == 0) {
2459         movdqu(dst, src);
2460         psrldq(dst, eindex*esize);
2461       } else {
2462         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2463       }
2464       movq(dst, dst);
2465     }
2466   }
2467   // Zero upper bits
2468   if (typ == T_FLOAT) {
2469     if (UseAVX == 0) {
2470       assert(vtmp != xnoreg, "required.");
2471       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2472       pand(dst, vtmp);
2473     } else {
2474       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2475     }
2476   }
2477 }
2478 
2479 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2480   switch(typ) {
2481     case T_BYTE:
2482     case T_BOOLEAN:
2483       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2484       break;
2485     case T_SHORT:
2486     case T_CHAR:
2487       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2488       break;
2489     case T_INT:
2490     case T_FLOAT:
2491       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2492       break;
2493     case T_LONG:
2494     case T_DOUBLE:
2495       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2496       break;
2497     default:
2498       assert(false,"Should not reach here.");
2499       break;
2500   }
2501 }
2502 
2503 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2504   assert(rscratch != noreg || always_reachable(src2), "missing");
2505 
2506   switch(typ) {
2507     case T_BOOLEAN:
2508     case T_BYTE:
2509       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2510       break;
2511     case T_CHAR:
2512     case T_SHORT:
2513       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2514       break;
2515     case T_INT:
2516     case T_FLOAT:
2517       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2518       break;
2519     case T_LONG:
2520     case T_DOUBLE:
2521       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2522       break;
2523     default:
2524       assert(false,"Should not reach here.");
2525       break;
2526   }
2527 }
2528 
2529 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2530   switch(typ) {
2531     case T_BYTE:
2532       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2533       break;
2534     case T_SHORT:
2535       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2536       break;
2537     case T_INT:
2538     case T_FLOAT:
2539       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2540       break;
2541     case T_LONG:
2542     case T_DOUBLE:
2543       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2544       break;
2545     default:
2546       assert(false,"Should not reach here.");
2547       break;
2548   }
2549 }
2550 
2551 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2552   assert(vlen_in_bytes <= 32, "");
2553   int esize = type2aelembytes(bt);
2554   if (vlen_in_bytes == 32) {
2555     assert(vtmp == xnoreg, "required.");
2556     if (esize >= 4) {
2557       vtestps(src1, src2, AVX_256bit);
2558     } else {
2559       vptest(src1, src2, AVX_256bit);
2560     }
2561     return;
2562   }
2563   if (vlen_in_bytes < 16) {
2564     // Duplicate the lower part to fill the whole register;
2565     // there is no need to do so for src2.
2566     assert(vtmp != xnoreg, "required");
2567     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
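         // 0x00 broadcasts dword 0 of a 4-byte vector; 0x04 keeps dwords {0, 1} of an
         // 8-byte vector and fills the upper half with copies of dword 0.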
2568     pshufd(vtmp, src1, shuffle_imm);
2569   } else {
2570     assert(vtmp == xnoreg, "required");
2571     vtmp = src1;
2572   }
2573   if (esize >= 4 && VM_Version::supports_avx()) {
2574     vtestps(vtmp, src2, AVX_128bit);
2575   } else {
2576     ptest(vtmp, src2);
2577   }
2578 }
2579 
2580 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2581   assert(UseAVX >= 2, "required");
2582 #ifdef ASSERT
2583   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2584   bool is_bw_supported = VM_Version::supports_avx512bw();
2585   if (is_bw && !is_bw_supported) {
2586     assert(vlen_enc != Assembler::AVX_512bit, "required");
2587     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2588            "XMM register should be 0-15");
2589   }
2590 #endif // ASSERT
2591   switch (elem_bt) {
2592     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2593     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2594     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2595     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2596     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2597     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2598     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2599   }
2600 }
2601 
2602 #ifdef _LP64
2603 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2604   assert(UseAVX >= 2, "required");
2605   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2606   bool is_vl = vlen_enc != Assembler::AVX_512bit;
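       // Use the EVEX GPR-to-vector broadcast when the required AVX-512 features are
       // available (BW for byte/short, VL for sub-512-bit vectors); otherwise go
       // through an XMM register and use the AVX/AVX2 broadcasts (xmm0-xmm15 only).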
2607   if ((UseAVX > 2) &&
2608       (!is_bw || VM_Version::supports_avx512bw()) &&
2609       (!is_vl || VM_Version::supports_avx512vl())) {
2610     switch (elem_bt) {
2611       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2612       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2613       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2614       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2615       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2616     }
2617   } else {
2618     assert(vlen_enc != Assembler::AVX_512bit, "required");
2619     assert((dst->encoding() < 16),"XMM register should be 0-15");
2620     switch (elem_bt) {
2621       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2622       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2623       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2624       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2625       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2626       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2627       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2628     }
2629   }
2630 }
2631 #endif
2632 
2633 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2634   switch (to_elem_bt) {
2635     case T_SHORT:
2636       vpmovsxbw(dst, src, vlen_enc);
2637       break;
2638     case T_INT:
2639       vpmovsxbd(dst, src, vlen_enc);
2640       break;
2641     case T_FLOAT:
2642       vpmovsxbd(dst, src, vlen_enc);
2643       vcvtdq2ps(dst, dst, vlen_enc);
2644       break;
2645     case T_LONG:
2646       vpmovsxbq(dst, src, vlen_enc);
2647       break;
2648     case T_DOUBLE: {
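           // Bytes widen 8x to doubles: sign-extend to dwords at half the target
           // vector width, then convert those dwords to doubles at the full width.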
2649       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2650       vpmovsxbd(dst, src, mid_vlen_enc);
2651       vcvtdq2pd(dst, dst, vlen_enc);
2652       break;
2653     }
2654     default:
2655       fatal("Unsupported type %s", type2name(to_elem_bt));
2656       break;
2657   }
2658 }
2659 
2660 //-------------------------------------------------------------------------------------------
2661 
2662 // IndexOf for constant substrings with size >= 8 chars
2663 // which don't need to be loaded through stack.
2664 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2665                                          Register cnt1, Register cnt2,
2666                                          int int_cnt2,  Register result,
2667                                          XMMRegister vec, Register tmp,
2668                                          int ae) {
2669   ShortBranchVerifier sbv(this);
2670   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2671   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2672 
2673   // This method uses the pcmpestri instruction with bound registers
2674   //   inputs:
2675   //     xmm - substring
2676   //     rax - substring length (elements count)
2677   //     mem - scanned string
2678   //     rdx - string length (elements count)
2679   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2680   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2681   //   outputs:
2682   //     rcx - matched index in string
2683   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2684   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2685   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2686   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2687   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2688 
2689   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2690         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2691         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2692 
2693   // Note, inline_string_indexOf() generates checks:
2694   // if (substr.count > string.count) return -1;
2695   // if (substr.count == 0) return 0;
2696   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2697 
2698   // Load substring.
2699   if (ae == StrIntrinsicNode::UL) {
2700     pmovzxbw(vec, Address(str2, 0));
2701   } else {
2702     movdqu(vec, Address(str2, 0));
2703   }
2704   movl(cnt2, int_cnt2);
2705   movptr(result, str1); // string addr
2706 
2707   if (int_cnt2 > stride) {
2708     jmpb(SCAN_TO_SUBSTR);
2709 
2710     // Reload substr for rescan; this code
2711     // is executed only for large substrings (> 8 chars)
2712     bind(RELOAD_SUBSTR);
2713     if (ae == StrIntrinsicNode::UL) {
2714       pmovzxbw(vec, Address(str2, 0));
2715     } else {
2716       movdqu(vec, Address(str2, 0));
2717     }
2718     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2719 
2720     bind(RELOAD_STR);
2721     // We came here after the beginning of the substring was
2722     // matched but the rest of it was not so we need to search
2723     // again. Start from the next element after the previous match.
2724 
2725     // cnt2 is the number of substring elements remaining and
2726     // cnt1 is the number of string elements remaining when the cmp failed.
2727     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2728     subl(cnt1, cnt2);
2729     addl(cnt1, int_cnt2);
2730     movl(cnt2, int_cnt2); // Now restore cnt2
2731 
2732     decrementl(cnt1);     // Shift to next element
2733     cmpl(cnt1, cnt2);
2734     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2735 
2736     addptr(result, (1<<scale1));
2737 
2738   } // (int_cnt2 > 8)
2739 
2740   // Scan string for start of substr in 16-byte vectors
2741   bind(SCAN_TO_SUBSTR);
2742   pcmpestri(vec, Address(result, 0), mode);
2743   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2744   subl(cnt1, stride);
2745   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2746   cmpl(cnt1, cnt2);
2747   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2748   addptr(result, 16);
2749   jmpb(SCAN_TO_SUBSTR);
2750 
2751   // Found a potential substr
2752   bind(FOUND_CANDIDATE);
2753   // Matched whole vector if first element matched (tmp(rcx) == 0).
2754   if (int_cnt2 == stride) {
2755     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2756   } else { // int_cnt2 > 8
2757     jccb(Assembler::overflow, FOUND_SUBSTR);
2758   }
2759   // After pcmpestri tmp(rcx) contains matched element index
2760   // Compute start addr of substr
2761   lea(result, Address(result, tmp, scale1));
2762 
2763   // Make sure string is still long enough
2764   subl(cnt1, tmp);
2765   cmpl(cnt1, cnt2);
2766   if (int_cnt2 == stride) {
2767     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2768   } else { // int_cnt2 > 8
2769     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2770   }
2771   // Left less than substring.
2772 
2773   bind(RET_NOT_FOUND);
2774   movl(result, -1);
2775   jmp(EXIT);
2776 
2777   if (int_cnt2 > stride) {
2778     // This code is optimized for the case when whole substring
2779     // is matched if its head is matched.
2780     bind(MATCH_SUBSTR_HEAD);
2781     pcmpestri(vec, Address(result, 0), mode);
2782     // Reload only the string (not the substring) if it does not fully match
2783     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2784 
2785     Label CONT_SCAN_SUBSTR;
2786     // Compare the rest of substring (> 8 chars).
2787     bind(FOUND_SUBSTR);
2788     // First 8 chars are already matched.
2789     negptr(cnt2);
2790     addptr(cnt2, stride);
2791 
2792     bind(SCAN_SUBSTR);
2793     subl(cnt1, stride);
2794     cmpl(cnt2, -stride); // Do not read beyond substring
2795     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2796     // Back-up strings to avoid reading beyond substring:
2797     // cnt1 = cnt1 - cnt2 + 8
2798     addl(cnt1, cnt2); // cnt2 is negative
2799     addl(cnt1, stride);
2800     movl(cnt2, stride); negptr(cnt2);
2801     bind(CONT_SCAN_SUBSTR);
2802     if (int_cnt2 < (int)G) {
2803       int tail_off1 = int_cnt2<<scale1;
2804       int tail_off2 = int_cnt2<<scale2;
2805       if (ae == StrIntrinsicNode::UL) {
2806         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2807       } else {
2808         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2809       }
2810       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2811     } else {
2812       // calculate index in register to avoid integer overflow (int_cnt2*2)
2813       movl(tmp, int_cnt2);
2814       addptr(tmp, cnt2);
2815       if (ae == StrIntrinsicNode::UL) {
2816         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2817       } else {
2818         movdqu(vec, Address(str2, tmp, scale2, 0));
2819       }
2820       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2821     }
2822     // Need to reload string pointers if we did not match the whole vector
2823     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2824     addptr(cnt2, stride);
2825     jcc(Assembler::negative, SCAN_SUBSTR);
2826     // Fall through if found full substring
2827 
2828   } // (int_cnt2 > 8)
2829 
2830   bind(RET_FOUND);
2831   // Found result if we matched full small substring.
2832   // Compute substr offset
2833   subptr(result, str1);
2834   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2835     shrl(result, 1); // index
2836   }
2837   bind(EXIT);
2838 
2839 } // string_indexofC8
2840 
2841 // Small strings are loaded through the stack if they cross a page boundary.
2842 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2843                                        Register cnt1, Register cnt2,
2844                                        int int_cnt2,  Register result,
2845                                        XMMRegister vec, Register tmp,
2846                                        int ae) {
2847   ShortBranchVerifier sbv(this);
2848   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2849   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2850 
2851   //
2852   // int_cnt2 is length of small (< 8 chars) constant substring
2853   // or (-1) for non constant substring in which case its length
2854   // is in cnt2 register.
2855   //
2856   // Note, inline_string_indexOf() generates checks:
2857   // if (substr.count > string.count) return -1;
2858   // if (substr.count == 0) return 0;
2859   //
2860   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2861   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2862   // This method uses the pcmpestri instruction with bound registers
2863   //   inputs:
2864   //     xmm - substring
2865   //     rax - substring length (elements count)
2866   //     mem - scanned string
2867   //     rdx - string length (elements count)
2868   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2869   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2870   //   outputs:
2871   //     rcx - matched index in string
2872   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2873   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2874   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2875   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2876 
2877   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2878         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2879         FOUND_CANDIDATE;
2880 
2881   { //========================================================
2882     // We don't know where these strings are located
2883     // and we can't read beyond them. Load them through stack.
2884     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2885 
2886     movptr(tmp, rsp); // save old SP
2887 
2888     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2889       if (int_cnt2 == (1>>scale2)) { // One byte
2890         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2891         load_unsigned_byte(result, Address(str2, 0));
2892         movdl(vec, result); // move 32 bits
2893       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2894         // Not enough header space in 32-bit VM: 12+3 = 15.
2895         movl(result, Address(str2, -1));
2896         shrl(result, 8);
2897         movdl(vec, result); // move 32 bits
2898       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2899         load_unsigned_short(result, Address(str2, 0));
2900         movdl(vec, result); // move 32 bits
2901       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2902         movdl(vec, Address(str2, 0)); // move 32 bits
2903       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2904         movq(vec, Address(str2, 0));  // move 64 bits
2905       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2906         // Array header size is 12 bytes in 32-bit VM
2907         // + 6 bytes for 3 chars == 18 bytes,
2908         // enough space to load vec and shift.
2909         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2910         if (ae == StrIntrinsicNode::UL) {
2911           int tail_off = int_cnt2-8;
2912           pmovzxbw(vec, Address(str2, tail_off));
2913           psrldq(vec, -2*tail_off);
2914         }
2915         else {
2916           int tail_off = int_cnt2*(1<<scale2);
2917           movdqu(vec, Address(str2, tail_off-16));
2918           psrldq(vec, 16-tail_off);
2919         }
2920       }
2921     } else { // not constant substring
2922       cmpl(cnt2, stride);
2923       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2924 
2925       // We can read beyond the string if str+16 does not cross a page boundary,
2926       // since heaps are aligned and mapped by pages.
2927       assert(os::vm_page_size() < (int)G, "default page should be small");
2928       movl(result, str2); // We need only low 32 bits
2929       andl(result, ((int)os::vm_page_size()-1));
2930       cmpl(result, ((int)os::vm_page_size()-16));
2931       jccb(Assembler::belowEqual, CHECK_STR);
2932 
2933       // Move small strings to the stack so we can load 16 bytes into vec.
2934       subptr(rsp, 16);
2935       int stk_offset = wordSize-(1<<scale2);
2936       push(cnt2);
2937 
2938       bind(COPY_SUBSTR);
2939       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2940         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2941         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2942       } else if (ae == StrIntrinsicNode::UU) {
2943         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2944         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2945       }
2946       decrement(cnt2);
2947       jccb(Assembler::notZero, COPY_SUBSTR);
2948 
2949       pop(cnt2);
2950       movptr(str2, rsp);  // New substring address
2951     } // non constant
2952 
2953     bind(CHECK_STR);
2954     cmpl(cnt1, stride);
2955     jccb(Assembler::aboveEqual, BIG_STRINGS);
2956 
2957     // Check cross page boundary.
2958     movl(result, str1); // We need only low 32 bits
2959     andl(result, ((int)os::vm_page_size()-1));
2960     cmpl(result, ((int)os::vm_page_size()-16));
2961     jccb(Assembler::belowEqual, BIG_STRINGS);
2962 
2963     subptr(rsp, 16);
2964     int stk_offset = -(1<<scale1);
2965     if (int_cnt2 < 0) { // not constant
2966       push(cnt2);
2967       stk_offset += wordSize;
2968     }
2969     movl(cnt2, cnt1);
2970 
2971     bind(COPY_STR);
2972     if (ae == StrIntrinsicNode::LL) {
2973       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2974       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2975     } else {
2976       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2977       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2978     }
2979     decrement(cnt2);
2980     jccb(Assembler::notZero, COPY_STR);
2981 
2982     if (int_cnt2 < 0) { // not constant
2983       pop(cnt2);
2984     }
2985     movptr(str1, rsp);  // New string address
2986 
2987     bind(BIG_STRINGS);
2988     // Load substring.
2989     if (int_cnt2 < 0) { // -1
2990       if (ae == StrIntrinsicNode::UL) {
2991         pmovzxbw(vec, Address(str2, 0));
2992       } else {
2993         movdqu(vec, Address(str2, 0));
2994       }
2995       push(cnt2);       // substr count
2996       push(str2);       // substr addr
2997       push(str1);       // string addr
2998     } else {
2999       // Small (< 8 chars) constant substrings are loaded already.
3000       movl(cnt2, int_cnt2);
3001     }
3002     push(tmp);  // original SP
3003 
3004   } // Finished loading
3005 
3006   //========================================================
3007   // Start search
3008   //
3009 
3010   movptr(result, str1); // string addr
3011 
3012   if (int_cnt2  < 0) {  // Only for non constant substring
3013     jmpb(SCAN_TO_SUBSTR);
3014 
3015     // SP saved at sp+0
3016     // String saved at sp+1*wordSize
3017     // Substr saved at sp+2*wordSize
3018     // Substr count saved at sp+3*wordSize
3019 
3020     // Reload substr for rescan; this code
3021     // is executed only for large substrings (> 8 chars)
3022     bind(RELOAD_SUBSTR);
3023     movptr(str2, Address(rsp, 2*wordSize));
3024     movl(cnt2, Address(rsp, 3*wordSize));
3025     if (ae == StrIntrinsicNode::UL) {
3026       pmovzxbw(vec, Address(str2, 0));
3027     } else {
3028       movdqu(vec, Address(str2, 0));
3029     }
3030     // We came here after the beginning of the substring was
3031     // matched but the rest of it was not so we need to search
3032     // again. Start from the next element after the previous match.
3033     subptr(str1, result); // Restore counter
3034     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3035       shrl(str1, 1);
3036     }
3037     addl(cnt1, str1);
3038     decrementl(cnt1);   // Shift to next element
3039     cmpl(cnt1, cnt2);
3040     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3041 
3042     addptr(result, (1<<scale1));
3043   } // non constant
3044 
3045   // Scan string for start of substr in 16-byte vectors
3046   bind(SCAN_TO_SUBSTR);
3047   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3048   pcmpestri(vec, Address(result, 0), mode);
3049   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3050   subl(cnt1, stride);
3051   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3052   cmpl(cnt1, cnt2);
3053   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3054   addptr(result, 16);
3055 
3056   bind(ADJUST_STR);
3057   cmpl(cnt1, stride); // Do not read beyond string
3058   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3059   // Back-up string to avoid reading beyond string.
3060   lea(result, Address(result, cnt1, scale1, -16));
3061   movl(cnt1, stride);
3062   jmpb(SCAN_TO_SUBSTR);
3063 
3064   // Found a potential substr
3065   bind(FOUND_CANDIDATE);
3066   // After pcmpestri tmp(rcx) contains matched element index
3067 
3068   // Make sure string is still long enough
3069   subl(cnt1, tmp);
3070   cmpl(cnt1, cnt2);
3071   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3072   // Left less than substring.
3073 
3074   bind(RET_NOT_FOUND);
3075   movl(result, -1);
3076   jmp(CLEANUP);
3077 
3078   bind(FOUND_SUBSTR);
3079   // Compute start addr of substr
3080   lea(result, Address(result, tmp, scale1));
3081   if (int_cnt2 > 0) { // Constant substring
3082     // Repeat search for small substring (< 8 chars)
3083     // from new point without reloading substring.
3084     // Have to check that we don't read beyond string.
3085     cmpl(tmp, stride-int_cnt2);
3086     jccb(Assembler::greater, ADJUST_STR);
3087     // Fall through if matched whole substring.
3088   } else { // non constant
3089     assert(int_cnt2 == -1, "should be != 0");
3090 
3091     addl(tmp, cnt2);
3092     // Found result if we matched whole substring.
3093     cmpl(tmp, stride);
3094     jcc(Assembler::lessEqual, RET_FOUND);
3095 
3096     // Repeat search for small substring (<= 8 chars)
3097     // from new point 'str1' without reloading substring.
3098     cmpl(cnt2, stride);
3099     // Have to check that we don't read beyond string.
3100     jccb(Assembler::lessEqual, ADJUST_STR);
3101 
3102     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3103     // Compare the rest of substring (> 8 chars).
3104     movptr(str1, result);
3105 
3106     cmpl(tmp, cnt2);
3107     // First 8 chars are already matched.
3108     jccb(Assembler::equal, CHECK_NEXT);
3109 
3110     bind(SCAN_SUBSTR);
3111     pcmpestri(vec, Address(str1, 0), mode);
3112     // Need to reload string pointers if we did not match the whole vector
3113     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3114 
3115     bind(CHECK_NEXT);
3116     subl(cnt2, stride);
3117     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3118     addptr(str1, 16);
3119     if (ae == StrIntrinsicNode::UL) {
3120       addptr(str2, 8);
3121     } else {
3122       addptr(str2, 16);
3123     }
3124     subl(cnt1, stride);
3125     cmpl(cnt2, stride); // Do not read beyond substring
3126     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3127     // Back-up strings to avoid reading beyond substring.
3128 
3129     if (ae == StrIntrinsicNode::UL) {
3130       lea(str2, Address(str2, cnt2, scale2, -8));
3131       lea(str1, Address(str1, cnt2, scale1, -16));
3132     } else {
3133       lea(str2, Address(str2, cnt2, scale2, -16));
3134       lea(str1, Address(str1, cnt2, scale1, -16));
3135     }
3136     subl(cnt1, cnt2);
3137     movl(cnt2, stride);
3138     addl(cnt1, stride);
3139     bind(CONT_SCAN_SUBSTR);
3140     if (ae == StrIntrinsicNode::UL) {
3141       pmovzxbw(vec, Address(str2, 0));
3142     } else {
3143       movdqu(vec, Address(str2, 0));
3144     }
3145     jmp(SCAN_SUBSTR);
3146 
3147     bind(RET_FOUND_LONG);
3148     movptr(str1, Address(rsp, wordSize));
3149   } // non constant
3150 
3151   bind(RET_FOUND);
3152   // Compute substr offset
3153   subptr(result, str1);
3154   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3155     shrl(result, 1); // index
3156   }
3157   bind(CLEANUP);
3158   pop(rsp); // restore SP
3159 
3160 } // string_indexof
3161 
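     // Find the first occurrence of a single UTF-16 char in a char[] segment.
     // Roughly equivalent Java (an illustrative sketch only; the generated code
     // compares 8 or 16 chars per iteration with SSE4.2/AVX2):
     //
     //   static int indexOfChar(char[] str, int cnt, int ch) {
     //     for (int i = 0; i < cnt; i++) {
     //       if (str[i] == ch) return i;
     //     }
     //     return -1;
     //   }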
3162 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3163                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3164   ShortBranchVerifier sbv(this);
3165   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3166 
3167   int stride = 8;
3168 
3169   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3170         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3171         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3172         FOUND_SEQ_CHAR, DONE_LABEL;
3173 
3174   movptr(result, str1);
3175   if (UseAVX >= 2) {
3176     cmpl(cnt1, stride);
3177     jcc(Assembler::less, SCAN_TO_CHAR);
3178     cmpl(cnt1, 2*stride);
3179     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3180     movdl(vec1, ch);
3181     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3182     vpxor(vec2, vec2);
3183     movl(tmp, cnt1);
3184     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3185     andl(cnt1,0x0000000F);  //tail count (in chars)
3186 
3187     bind(SCAN_TO_16_CHAR_LOOP);
3188     vmovdqu(vec3, Address(result, 0));
3189     vpcmpeqw(vec3, vec3, vec1, 1);
3190     vptest(vec2, vec3);
3191     jcc(Assembler::carryClear, FOUND_CHAR);
3192     addptr(result, 32);
3193     subl(tmp, 2*stride);
3194     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3195     jmp(SCAN_TO_8_CHAR);
3196     bind(SCAN_TO_8_CHAR_INIT);
3197     movdl(vec1, ch);
3198     pshuflw(vec1, vec1, 0x00);
3199     pshufd(vec1, vec1, 0);
3200     pxor(vec2, vec2);
3201   }
3202   bind(SCAN_TO_8_CHAR);
3203   cmpl(cnt1, stride);
3204   jcc(Assembler::less, SCAN_TO_CHAR);
3205   if (UseAVX < 2) {
3206     movdl(vec1, ch);
3207     pshuflw(vec1, vec1, 0x00);
3208     pshufd(vec1, vec1, 0);
3209     pxor(vec2, vec2);
3210   }
3211   movl(tmp, cnt1);
3212   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3213   andl(cnt1,0x00000007);  //tail count (in chars)
3214 
3215   bind(SCAN_TO_8_CHAR_LOOP);
3216   movdqu(vec3, Address(result, 0));
3217   pcmpeqw(vec3, vec1);
3218   ptest(vec2, vec3);
3219   jcc(Assembler::carryClear, FOUND_CHAR);
3220   addptr(result, 16);
3221   subl(tmp, stride);
3222   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3223   bind(SCAN_TO_CHAR);
3224   testl(cnt1, cnt1);
3225   jcc(Assembler::zero, RET_NOT_FOUND);
3226   bind(SCAN_TO_CHAR_LOOP);
3227   load_unsigned_short(tmp, Address(result, 0));
3228   cmpl(ch, tmp);
3229   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3230   addptr(result, 2);
3231   subl(cnt1, 1);
3232   jccb(Assembler::zero, RET_NOT_FOUND);
3233   jmp(SCAN_TO_CHAR_LOOP);
3234 
3235   bind(RET_NOT_FOUND);
3236   movl(result, -1);
3237   jmpb(DONE_LABEL);
3238 
3239   bind(FOUND_CHAR);
3240   if (UseAVX >= 2) {
3241     vpmovmskb(tmp, vec3);
3242   } else {
3243     pmovmskb(tmp, vec3);
3244   }
3245   bsfl(ch, tmp);
3246   addptr(result, ch);
3247 
3248   bind(FOUND_SEQ_CHAR);
3249   subptr(result, str1);
3250   shrl(result, 1);
3251 
3252   bind(DONE_LABEL);
3253 } // string_indexof_char
3254 
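     // Latin-1 variant of string_indexof_char: find the first occurrence of a
     // single byte value in a byte[] segment, comparing 16 or 32 bytes per
     // iteration; the scalar tail compares zero-extended byte values against ch.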
3255 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3256                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3257   ShortBranchVerifier sbv(this);
3258   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3259 
3260   int stride = 16;
3261 
3262   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3263         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3264         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3265         FOUND_SEQ_CHAR, DONE_LABEL;
3266 
3267   movptr(result, str1);
3268   if (UseAVX >= 2) {
3269     cmpl(cnt1, stride);
3270     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3271     cmpl(cnt1, stride*2);
3272     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3273     movdl(vec1, ch);
3274     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3275     vpxor(vec2, vec2);
3276     movl(tmp, cnt1);
3277     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3278     andl(cnt1,0x0000001F);  //tail count (in chars)
3279 
3280     bind(SCAN_TO_32_CHAR_LOOP);
3281     vmovdqu(vec3, Address(result, 0));
3282     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3283     vptest(vec2, vec3);
3284     jcc(Assembler::carryClear, FOUND_CHAR);
3285     addptr(result, 32);
3286     subl(tmp, stride*2);
3287     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3288     jmp(SCAN_TO_16_CHAR);
3289 
3290     bind(SCAN_TO_16_CHAR_INIT);
3291     movdl(vec1, ch);
3292     pxor(vec2, vec2);
3293     pshufb(vec1, vec2);
3294   }
3295 
3296   bind(SCAN_TO_16_CHAR);
3297   cmpl(cnt1, stride);
3298   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3299   if (UseAVX < 2) {
3300     movdl(vec1, ch);
3301     pxor(vec2, vec2);
3302     pshufb(vec1, vec2);
3303   }
3304   movl(tmp, cnt1);
3305   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3306   andl(cnt1,0x0000000F);  //tail count (in bytes)
3307 
3308   bind(SCAN_TO_16_CHAR_LOOP);
3309   movdqu(vec3, Address(result, 0));
3310   pcmpeqb(vec3, vec1);
3311   ptest(vec2, vec3);
3312   jcc(Assembler::carryClear, FOUND_CHAR);
3313   addptr(result, 16);
3314   subl(tmp, stride);
3315   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3316 
3317   bind(SCAN_TO_CHAR_INIT);
3318   testl(cnt1, cnt1);
3319   jcc(Assembler::zero, RET_NOT_FOUND);
3320   bind(SCAN_TO_CHAR_LOOP);
3321   load_unsigned_byte(tmp, Address(result, 0));
3322   cmpl(ch, tmp);
3323   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3324   addptr(result, 1);
3325   subl(cnt1, 1);
3326   jccb(Assembler::zero, RET_NOT_FOUND);
3327   jmp(SCAN_TO_CHAR_LOOP);
3328 
3329   bind(RET_NOT_FOUND);
3330   movl(result, -1);
3331   jmpb(DONE_LABEL);
3332 
3333   bind(FOUND_CHAR);
3334   if (UseAVX >= 2) {
3335     vpmovmskb(tmp, vec3);
3336   } else {
3337     pmovmskb(tmp, vec3);
3338   }
3339   bsfl(ch, tmp);
3340   addptr(result, ch);
3341 
3342   bind(FOUND_SEQ_CHAR);
3343   subptr(result, str1);
3344 
3345   bind(DONE_LABEL);
3346 } // stringL_indexof_char
3347 
3348 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3349   switch (eltype) {
3350   case T_BOOLEAN: return sizeof(jboolean);
3351   case T_BYTE:  return sizeof(jbyte);
3352   case T_SHORT: return sizeof(jshort);
3353   case T_CHAR:  return sizeof(jchar);
3354   case T_INT:   return sizeof(jint);
3355   default:
3356     ShouldNotReachHere();
3357     return -1;
3358   }
3359 }
3360 
3361 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3362   switch (eltype) {
3363   // T_BOOLEAN used as surrogate for unsigned byte
3364   case T_BOOLEAN: movzbl(dst, src);   break;
3365   case T_BYTE:    movsbl(dst, src);   break;
3366   case T_SHORT:   movswl(dst, src);   break;
3367   case T_CHAR:    movzwl(dst, src);   break;
3368   case T_INT:     movl(dst, src);     break;
3369   default:
3370     ShouldNotReachHere();
3371   }
3372 }
3373 
3374 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3375   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3376 }
3377 
3378 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3379   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3380 }
3381 
3382 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3383   const int vlen = Assembler::AVX_256bit;
3384   switch (eltype) {
3385   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3386   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3387   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3388   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3389   case T_INT:
3390     // do nothing
3391     break;
3392   default:
3393     ShouldNotReachHere();
3394   }
3395 }
3396 
3397 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3398                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3399                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3400                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3401                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3402                                         BasicType eltype) {
3403   ShortBranchVerifier sbv(this);
3404   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3405   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3406   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3407 
3408   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3409         SHORT_UNROLLED_LOOP_EXIT,
3410         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3411         UNROLLED_VECTOR_LOOP_BEGIN,
3412         END;
3413   switch (eltype) {
3414   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3415   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3416   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3417   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3418   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3419   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3420   }
3421 
3422   // Register arrays ("renaming") for readability of the code
3423   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3424                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3425                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3426 
3427   const int elsize = arrays_hashcode_elsize(eltype);
3428 
3429   /*
3430     if (cnt1 >= 2) {
3431       if (cnt1 >= 32) {
3432         UNROLLED VECTOR LOOP
3433       }
3434       UNROLLED SCALAR LOOP
3435     }
3436     SINGLE SCALAR
3437    */
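       // Scalar reference for the value computed (an illustrative sketch only;
       // 'result' carries the incoming initial hash value and toInt() stands for
       // the per-eltype widening load used below):
       //
       //   int h = result;
       //   for (int i = 0; i < cnt1; i++) {
       //     h = 31 * h + toInt(ary1[i]);
       //   }
       //   result = h;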
3438 
3439   cmpl(cnt1, 32);
3440   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3441 
3442   // cnt1 >= 32 && generate_vectorized_loop
3443   xorl(index, index);
3444 
3445   // vresult = IntVector.zero(I256);
3446   for (int idx = 0; idx < 4; idx++) {
3447     vpxor(vresult[idx], vresult[idx]);
3448   }
3449   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3450   Register bound = tmp2;
3451   Register next = tmp3;
3452   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3453   movl(next, Address(tmp2, 0));
3454   movdl(vnext, next);
3455   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3456 
3457   // index = 0;
3458   // bound = cnt1 & ~(32 - 1);
3459   movl(bound, cnt1);
3460   andl(bound, ~(32 - 1));
3461   // for (; index < bound; index += 32) {
3462   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3463   // result *= next;
3464   imull(result, next);
3465   // Loop fission to front-load the cost of fetching from memory; OOO execution
3466   // can then hopefully do a better job of prefetching
3467   for (int idx = 0; idx < 4; idx++) {
3468     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3469   }
3470   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3471   for (int idx = 0; idx < 4; idx++) {
3472     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3473     arrays_hashcode_elvcast(vtmp[idx], eltype);
3474     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3475   }
3476   // index += 32;
3477   addl(index, 32);
3478   // index < bound;
3479   cmpl(index, bound);
3480   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3481   // }
3482 
3483   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3484   subl(cnt1, bound);
3485   // release bound
3486 
3487   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3488   for (int idx = 0; idx < 4; idx++) {
3489     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3490     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3491     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3492   }
3493   // result += vresult.reduceLanes(ADD);
3494   for (int idx = 0; idx < 4; idx++) {
3495     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3496   }
3497 
3498   // } else if (cnt1 < 32) {
3499 
3500   bind(SHORT_UNROLLED_BEGIN);
3501   // int i = 1;
3502   movl(index, 1);
3503   cmpl(index, cnt1);
3504   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3505 
3506   // for (; i < cnt1 ; i += 2) {
3507   bind(SHORT_UNROLLED_LOOP_BEGIN);
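       // Each iteration folds two elements into the hash:
       //   result = 31*31*result + 31*ary1[i-1] + ary1[i]
       // using 961 == 31*31 and (x << 5) - x == 31*x.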
3508   movl(tmp3, 961);
3509   imull(result, tmp3);
3510   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3511   movl(tmp3, tmp2);
3512   shll(tmp3, 5);
3513   subl(tmp3, tmp2);
3514   addl(result, tmp3);
3515   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3516   addl(result, tmp3);
3517   addl(index, 2);
3518   cmpl(index, cnt1);
3519   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3520 
3521   // }
3522   // if (i >= cnt1) {
3523   bind(SHORT_UNROLLED_LOOP_EXIT);
3524   jccb(Assembler::greater, END);
3525   movl(tmp2, result);
3526   shll(result, 5);
3527   subl(result, tmp2);
3528   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3529   addl(result, tmp3);
3530   // }
3531   bind(END);
3532 
3533   BLOCK_COMMENT("} // arrays_hashcode");
3534 
3535 } // arrays_hashcode
3536 
3537 // helper function for string_compare
3538 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3539                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3540                                            Address::ScaleFactor scale2, Register index, int ae) {
3541   if (ae == StrIntrinsicNode::LL) {
3542     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3543     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3544   } else if (ae == StrIntrinsicNode::UU) {
3545     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3546     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3547   } else {
3548     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3549     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3550   }
3551 }
3552 
3553 // Compare strings, used for char[] and byte[].
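     // Roughly equivalent Java for the value computed (an illustrative sketch
     // only; for the mixed LU/UL encodings one side is read as Latin-1 bytes and
     // the other as UTF-16 chars, and UL negates the result at the end):
     //
     //   static int compare(char[] s1, char[] s2) {
     //     int min = Math.min(s1.length, s2.length);
     //     for (int i = 0; i < min; i++) {
     //       if (s1[i] != s2[i]) return s1[i] - s2[i];
     //     }
     //     return s1.length - s2.length;
     //   }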
3554 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3555                                        Register cnt1, Register cnt2, Register result,
3556                                        XMMRegister vec1, int ae, KRegister mask) {
3557   ShortBranchVerifier sbv(this);
3558   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3559   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3560   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3561   int stride2x2 = 0x40;
3562   Address::ScaleFactor scale = Address::no_scale;
3563   Address::ScaleFactor scale1 = Address::no_scale;
3564   Address::ScaleFactor scale2 = Address::no_scale;
3565 
3566   if (ae != StrIntrinsicNode::LL) {
3567     stride2x2 = 0x20;
3568   }
3569 
3570   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3571     shrl(cnt2, 1);
3572   }
3573   // Compute the minimum of the string lengths, and push the
3574   // difference of the string lengths onto the stack (popped later as the result).
3575   // The minimum is computed with a conditional move.
3576   movl(result, cnt1);
3577   subl(cnt1, cnt2);
3578   push(cnt1);
3579   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3580 
3581   // Is the minimum length zero?
3582   testl(cnt2, cnt2);
3583   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3584   if (ae == StrIntrinsicNode::LL) {
3585     // Load first bytes
3586     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3587     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3588   } else if (ae == StrIntrinsicNode::UU) {
3589     // Load first characters
3590     load_unsigned_short(result, Address(str1, 0));
3591     load_unsigned_short(cnt1, Address(str2, 0));
3592   } else {
3593     load_unsigned_byte(result, Address(str1, 0));
3594     load_unsigned_short(cnt1, Address(str2, 0));
3595   }
3596   subl(result, cnt1);
3597   jcc(Assembler::notZero,  POP_LABEL);
3598 
3599   if (ae == StrIntrinsicNode::UU) {
3600     // Divide length by 2 to get number of chars
3601     shrl(cnt2, 1);
3602   }
3603   cmpl(cnt2, 1);
3604   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3605 
3606   // Check if the strings start at the same location and setup scale and stride
3607   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3608     cmpptr(str1, str2);
3609     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3610     if (ae == StrIntrinsicNode::LL) {
3611       scale = Address::times_1;
3612       stride = 16;
3613     } else {
3614       scale = Address::times_2;
3615       stride = 8;
3616     }
3617   } else {
3618     scale1 = Address::times_1;
3619     scale2 = Address::times_2;
3620     // scale not used
3621     stride = 8;
3622   }
3623 
3624   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3625     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3626     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3627     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3628     Label COMPARE_TAIL_LONG;
3629     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3630 
3631     int pcmpmask = 0x19;
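         // pcmpestri imm8 0x19: unsigned words, "equal each" (string compare),
         // negative polarity; clearing bit 0 below switches to unsigned bytes
         // for the LL case.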
3632     if (ae == StrIntrinsicNode::LL) {
3633       pcmpmask &= ~0x01;
3634     }
3635 
3636     // Set up to compare 16-char (32-byte) vectors,
3637     // starting from the first character again because it has an aligned address.
3638     if (ae == StrIntrinsicNode::LL) {
3639       stride2 = 32;
3640     } else {
3641       stride2 = 16;
3642     }
3643     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3644       adr_stride = stride << scale;
3645     } else {
3646       adr_stride1 = 8;  //stride << scale1;
3647       adr_stride2 = 16; //stride << scale2;
3648     }
3649 
3650     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3651     // rax and rdx are used by pcmpestri as elements counters
3652     movl(result, cnt2);
3653     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3654     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3655 
3656     // fast path : compare first 2 8-char vectors.
3657     bind(COMPARE_16_CHARS);
3658     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3659       movdqu(vec1, Address(str1, 0));
3660     } else {
3661       pmovzxbw(vec1, Address(str1, 0));
3662     }
3663     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3664     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3665 
3666     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3667       movdqu(vec1, Address(str1, adr_stride));
3668       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3669     } else {
3670       pmovzxbw(vec1, Address(str1, adr_stride1));
3671       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3672     }
3673     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3674     addl(cnt1, stride);
3675 
3676     // Compare the characters at index in cnt1
3677     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3678     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3679     subl(result, cnt2);
3680     jmp(POP_LABEL);
3681 
3682     // Set up the registers to start the vector comparison loop
3683     bind(COMPARE_WIDE_VECTORS);
3684     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3685       lea(str1, Address(str1, result, scale));
3686       lea(str2, Address(str2, result, scale));
3687     } else {
3688       lea(str1, Address(str1, result, scale1));
3689       lea(str2, Address(str2, result, scale2));
3690     }
3691     subl(result, stride2);
3692     subl(cnt2, stride2);
3693     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3694     negptr(result);
3695 
3696     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3697     bind(COMPARE_WIDE_VECTORS_LOOP);
3698 
3699 #ifdef _LP64
3700     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3701       cmpl(cnt2, stride2x2);
3702       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3703       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3704       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3705 
3706       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3707       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3708         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3709         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3710       } else {
3711         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3712         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3713       }
3714       kortestql(mask, mask);
3715       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3716       addptr(result, stride2x2);  // update since we already compared at this addr
3717       subl(cnt2, stride2x2);      // and sub the size too
3718       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3719 
3720       vpxor(vec1, vec1);
3721       jmpb(COMPARE_WIDE_TAIL);
3722     }//if (VM_Version::supports_avx512vlbw())
3723 #endif // _LP64
3724 
3725 
3726     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3727     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3728       vmovdqu(vec1, Address(str1, result, scale));
3729       vpxor(vec1, Address(str2, result, scale));
3730     } else {
3731       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3732       vpxor(vec1, Address(str2, result, scale2));
3733     }
3734     vptest(vec1, vec1);
3735     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3736     addptr(result, stride2);
3737     subl(cnt2, stride2);
3738     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3739     // clean upper bits of YMM registers
3740     vpxor(vec1, vec1);
3741 
3742     // compare wide vectors tail
3743     bind(COMPARE_WIDE_TAIL);
3744     testptr(result, result);
3745     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3746 
3747     movl(result, stride2);
3748     movl(cnt2, result);
3749     negptr(result);
3750     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3751 
3752     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3753     bind(VECTOR_NOT_EQUAL);
3754     // clean upper bits of YMM registers
3755     vpxor(vec1, vec1);
3756     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3757       lea(str1, Address(str1, result, scale));
3758       lea(str2, Address(str2, result, scale));
3759     } else {
3760       lea(str1, Address(str1, result, scale1));
3761       lea(str2, Address(str2, result, scale2));
3762     }
3763     jmp(COMPARE_16_CHARS);
3764 
3765     // Compare tail chars, length between 1 to 15 chars
3766     bind(COMPARE_TAIL_LONG);
3767     movl(cnt2, result);
3768     cmpl(cnt2, stride);
3769     jcc(Assembler::less, COMPARE_SMALL_STR);
3770 
3771     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3772       movdqu(vec1, Address(str1, 0));
3773     } else {
3774       pmovzxbw(vec1, Address(str1, 0));
3775     }
3776     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3777     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3778     subptr(cnt2, stride);
3779     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3780     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3781       lea(str1, Address(str1, result, scale));
3782       lea(str2, Address(str2, result, scale));
3783     } else {
3784       lea(str1, Address(str1, result, scale1));
3785       lea(str2, Address(str2, result, scale2));
3786     }
3787     negptr(cnt2);
3788     jmpb(WHILE_HEAD_LABEL);
3789 
3790     bind(COMPARE_SMALL_STR);
3791   } else if (UseSSE42Intrinsics) {
3792     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3793     int pcmpmask = 0x19;
3794     // Set up to compare 8-char (16-byte) vectors,
3795     // starting from the first character again because it has an aligned address.
3796     movl(result, cnt2);
3797     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3798     if (ae == StrIntrinsicNode::LL) {
3799       pcmpmask &= ~0x01;
3800     }
3801     jcc(Assembler::zero, COMPARE_TAIL);
3802     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3803       lea(str1, Address(str1, result, scale));
3804       lea(str2, Address(str2, result, scale));
3805     } else {
3806       lea(str1, Address(str1, result, scale1));
3807       lea(str2, Address(str2, result, scale2));
3808     }
3809     negptr(result);
3810 
3811     // pcmpestri
3812     //   inputs:
3813     //     vec1- substring
3814     //     rax - negative string length (elements count)
3815     //     mem - scanned string
3816     //     rdx - string length (elements count)
3817     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3818     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3819     //   outputs:
3820     //     rcx - first mismatched element index
3821     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3822 
3823     bind(COMPARE_WIDE_VECTORS);
3824     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3825       movdqu(vec1, Address(str1, result, scale));
3826       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3827     } else {
3828       pmovzxbw(vec1, Address(str1, result, scale1));
3829       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3830     }
3831     // After pcmpestri cnt1(rcx) contains mismatched element index
3832 
3833     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3834     addptr(result, stride);
3835     subptr(cnt2, stride);
3836     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3837 
3838     // compare wide vectors tail
3839     testptr(result, result);
3840     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3841 
3842     movl(cnt2, stride);
3843     movl(result, stride);
3844     negptr(result);
3845     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3846       movdqu(vec1, Address(str1, result, scale));
3847       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3848     } else {
3849       pmovzxbw(vec1, Address(str1, result, scale1));
3850       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3851     }
3852     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3853 
3854     // Mismatched characters in the vectors
3855     bind(VECTOR_NOT_EQUAL);
3856     addptr(cnt1, result);
3857     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3858     subl(result, cnt2);
3859     jmpb(POP_LABEL);
3860 
3861     bind(COMPARE_TAIL); // limit is zero
3862     movl(cnt2, result);
3863     // Fallthru to tail compare
3864   }
3865   // Shift str2 and str1 to the end of the arrays, negate min
3866   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3867     lea(str1, Address(str1, cnt2, scale));
3868     lea(str2, Address(str2, cnt2, scale));
3869   } else {
3870     lea(str1, Address(str1, cnt2, scale1));
3871     lea(str2, Address(str2, cnt2, scale2));
3872   }
3873   decrementl(cnt2);  // first character was compared already
3874   negptr(cnt2);
3875 
3876   // Compare the rest of the elements
3877   bind(WHILE_HEAD_LABEL);
3878   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3879   subl(result, cnt1);
3880   jccb(Assembler::notZero, POP_LABEL);
3881   increment(cnt2);
3882   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3883 
3884   // Strings are equal up to min length.  Return the length difference.
3885   bind(LENGTH_DIFF_LABEL);
3886   pop(result);
3887   if (ae == StrIntrinsicNode::UU) {
3888     // Divide diff by 2 to get number of chars
3889     sarl(result, 1);
3890   }
3891   jmpb(DONE_LABEL);
3892 
3893 #ifdef _LP64
3894   if (VM_Version::supports_avx512vlbw()) {
3895 
3896     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3897 
3898     kmovql(cnt1, mask);
3899     notq(cnt1);
3900     bsfq(cnt2, cnt1);
3901     if (ae != StrIntrinsicNode::LL) {
3902       // Divide diff by 2 to get number of chars
3903       sarl(cnt2, 1);
3904     }
3905     addq(result, cnt2);
3906     if (ae == StrIntrinsicNode::LL) {
3907       load_unsigned_byte(cnt1, Address(str2, result));
3908       load_unsigned_byte(result, Address(str1, result));
3909     } else if (ae == StrIntrinsicNode::UU) {
3910       load_unsigned_short(cnt1, Address(str2, result, scale));
3911       load_unsigned_short(result, Address(str1, result, scale));
3912     } else {
3913       load_unsigned_short(cnt1, Address(str2, result, scale2));
3914       load_unsigned_byte(result, Address(str1, result, scale1));
3915     }
3916     subl(result, cnt1);
3917     jmpb(POP_LABEL);
3918   }//if (VM_Version::supports_avx512vlbw())
3919 #endif // _LP64
3920 
3921   // Discard the stored length difference
3922   bind(POP_LABEL);
3923   pop(cnt1);
3924 
3925   // That's it
3926   bind(DONE_LABEL);
3927   if(ae == StrIntrinsicNode::UL) {
3928     negl(result);
3929   }
3930 
3931 }
3932 
3933 // Search for a non-ASCII character (negative byte value) in a byte array;
3934 // return the index of the first such character, or the length of the
3935 // array segment searched if none is found.
3936 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3937 //   @IntrinsicCandidate
3938 //   public static int countPositives(byte[] ba, int off, int len) {
3939 //     for (int i = off; i < off + len; i++) {
3940 //       if (ba[i] < 0) {
3941 //         return i - off;
3942 //       }
3943 //     }
3944 //     return len;
3945 //   }
3946 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3947   Register result, Register tmp1,
3948   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3949   // rsi: byte array
3950   // rcx: len
3951   // rax: result
3952   ShortBranchVerifier sbv(this);
3953   assert_different_registers(ary1, len, result, tmp1);
3954   assert_different_registers(vec1, vec2);
3955   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3956 
3957   movl(result, len); // copy
3958   // len == 0
3959   testl(len, len);
3960   jcc(Assembler::zero, DONE);
3961 
3962   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3963     VM_Version::supports_avx512vlbw() &&
3964     VM_Version::supports_bmi2()) {
3965 
3966     Label test_64_loop, test_tail, BREAK_LOOP;
3967     movl(tmp1, len);
3968     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3969 
3970     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3971     andl(len,  0xffffffc0); // vector count (in chars)
3972     jccb(Assembler::zero, test_tail);
3973 
3974     lea(ary1, Address(ary1, len, Address::times_1));
3975     negptr(len);
3976 
3977     bind(test_64_loop);
3978     // Check whether our 64 elements of size byte contain negatives
3979     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3980     kortestql(mask1, mask1);
3981     jcc(Assembler::notZero, BREAK_LOOP);
3982 
3983     addptr(len, 64);
3984     jccb(Assembler::notZero, test_64_loop);
3985 
3986     bind(test_tail);
3987     // bail out when there is nothing to be done
3988     testl(tmp1, -1);
3989     jcc(Assembler::zero, DONE);
3990 
3991 
3992     // check the tail for absence of negatives
3993     // ~(~0 << len) applied up to two times (for the 32-bit scenario)
3994 #ifdef _LP64
3995     {
3996       Register tmp3_aliased = len;
3997       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3998       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3999       notq(tmp3_aliased);
4000       kmovql(mask2, tmp3_aliased);
4001     }
4002 #else
4003     Label k_init;
4004     jmp(k_init);
4005 
4006     // We cannot read 64 bits into a general purpose register, so we move the
4007     // data required to compose 64 1's into the instruction stream.
4008     // We emit a 64-byte wide series of elements from 0..63 which is later
4009     // used as a compare target together with the tail count in the tmp1 register.
4010     // The result is a k register holding tmp1 consecutive 1's, counting from
4011     // the least significant bit.
4012     address tmp = pc();
4013     emit_int64(0x0706050403020100);
4014     emit_int64(0x0F0E0D0C0B0A0908);
4015     emit_int64(0x1716151413121110);
4016     emit_int64(0x1F1E1D1C1B1A1918);
4017     emit_int64(0x2726252423222120);
4018     emit_int64(0x2F2E2D2C2B2A2928);
4019     emit_int64(0x3736353433323130);
4020     emit_int64(0x3F3E3D3C3B3A3938);
4021 
4022     bind(k_init);
4023     lea(len, InternalAddress(tmp));
4024     // create mask to test for negative byte inside a vector
4025     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4026     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4027 
4028 #endif
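         // Masked compare of the tail: within the first tmp1 bytes selected by
         // mask2, set a bit in mask1 wherever 0 > byte, i.e. the byte is negative.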
4029     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4030     ktestq(mask1, mask2);
4031     jcc(Assembler::zero, DONE);
4032 
4033     // do a full check for negative registers in the tail
4034     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4035                      // ary1 already pointing to the right place
4036     jmpb(TAIL_START);
4037 
4038     bind(BREAK_LOOP);
4039     // At least one byte in the last 64 byte block was negative.
4040     // Set up to look at the last 64 bytes as if they were a tail
4041     lea(ary1, Address(ary1, len, Address::times_1));
4042     addptr(result, len);
4043     // Ignore the very last byte: if all others are positive,
4044     // it must be negative, so we can skip right to the 2+1 byte
4045     // end comparison at this point
4046     orl(result, 63);
4047     movl(len, 63);
4048     // Fallthru to tail compare
4049   } else {
4050 
4051     if (UseAVX >= 2 && UseSSE >= 2) {
4052       // With AVX2, use 32-byte vector compare
4053       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4054 
4055       // Compare 32-byte vectors
4056       testl(len, 0xffffffe0);   // vector count (in bytes)
4057       jccb(Assembler::zero, TAIL_START);
4058 
4059       andl(len, 0xffffffe0);
4060       lea(ary1, Address(ary1, len, Address::times_1));
4061       negptr(len);
4062 
4063       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4064       movdl(vec2, tmp1);
4065       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4066 
4067       bind(COMPARE_WIDE_VECTORS);
4068       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4069       vptest(vec1, vec2);
4070       jccb(Assembler::notZero, BREAK_LOOP);
4071       addptr(len, 32);
4072       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4073 
4074       testl(result, 0x0000001f);   // any bytes remaining?
4075       jcc(Assembler::zero, DONE);
4076 
4077       // Quick test using the already prepared vector mask
4078       movl(len, result);
4079       andl(len, 0x0000001f);
4080       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4081       vptest(vec1, vec2);
4082       jcc(Assembler::zero, DONE);
4083       // There are zeros, jump to the tail to determine exactly where
4084       jmpb(TAIL_START);
4085 
4086       bind(BREAK_LOOP);
4087       // At least one byte in the last 32-byte vector is negative.
4088       // Set up to look at the last 32 bytes as if they were a tail
4089       lea(ary1, Address(ary1, len, Address::times_1));
4090       addptr(result, len);
4091       // Ignore the very last byte: if all others are positive,
4092       // it must be negative, so we can skip right to the 2+1 byte
4093       // end comparison at this point
4094       orl(result, 31);
4095       movl(len, 31);
4096       // Fallthru to tail compare
4097     } else if (UseSSE42Intrinsics) {
4098       // With SSE4.2, use double quad vector compare
4099       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4100 
4101       // Compare 16-byte vectors
4102       testl(len, 0xfffffff0);   // vector count (in bytes)
4103       jcc(Assembler::zero, TAIL_START);
4104 
4105       andl(len, 0xfffffff0);
4106       lea(ary1, Address(ary1, len, Address::times_1));
4107       negptr(len);
4108 
4109       movl(tmp1, 0x80808080);
4110       movdl(vec2, tmp1);
4111       pshufd(vec2, vec2, 0);
4112 
4113       bind(COMPARE_WIDE_VECTORS);
4114       movdqu(vec1, Address(ary1, len, Address::times_1));
4115       ptest(vec1, vec2);
4116       jccb(Assembler::notZero, BREAK_LOOP);
4117       addptr(len, 16);
4118       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4119 
4120       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4121       jcc(Assembler::zero, DONE);
4122 
4123       // Quick test using the already prepared vector mask
4124       movl(len, result);
4125       andl(len, 0x0000000f);   // tail count (in bytes)
4126       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4127       ptest(vec1, vec2);
4128       jcc(Assembler::zero, DONE);
4129       jmpb(TAIL_START);
4130 
4131       bind(BREAK_LOOP);
4132       // At least one byte in the last 16-byte vector is negative.
4133       // Set up to look at the last 16 bytes as if they were a tail
4134       lea(ary1, Address(ary1, len, Address::times_1));
4135       addptr(result, len);
4136       // Ignore the very last byte: if all others are positive,
4137       // it must be negative, so we can skip right to the 2+1 byte
4138       // end comparison at this point
4139       orl(result, 15);
4140       movl(len, 15);
4141       // Fallthru to tail compare
4142     }
4143   }
4144 
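       // Scalar tail: check the remaining bytes 4, then 2, then 1 at a time for a set
       // sign bit (0x80), adjusting result (see TAIL_ADJUST / CHAR_ADJUST) when a
       // negative byte is found.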
4145   bind(TAIL_START);
4146   // Compare 4-byte vectors
4147   andl(len, 0xfffffffc); // vector count (in bytes)
4148   jccb(Assembler::zero, COMPARE_CHAR);
4149 
4150   lea(ary1, Address(ary1, len, Address::times_1));
4151   negptr(len);
4152 
4153   bind(COMPARE_VECTORS);
4154   movl(tmp1, Address(ary1, len, Address::times_1));
4155   andl(tmp1, 0x80808080);
4156   jccb(Assembler::notZero, TAIL_ADJUST);
4157   addptr(len, 4);
4158   jccb(Assembler::notZero, COMPARE_VECTORS);
4159 
4160   // Compare trailing char (final 2-3 bytes), if any
4161   bind(COMPARE_CHAR);
4162 
4163   testl(result, 0x2);   // tail  char
4164   jccb(Assembler::zero, COMPARE_BYTE);
4165   load_unsigned_short(tmp1, Address(ary1, 0));
4166   andl(tmp1, 0x00008080);
4167   jccb(Assembler::notZero, CHAR_ADJUST);
4168   lea(ary1, Address(ary1, 2));
4169 
4170   bind(COMPARE_BYTE);
4171   testl(result, 0x1);   // tail  byte
4172   jccb(Assembler::zero, DONE);
4173   load_unsigned_byte(tmp1, Address(ary1, 0));
4174   testl(tmp1, 0x00000080);
4175   jccb(Assembler::zero, DONE);
4176   subptr(result, 1);
4177   jmpb(DONE);
4178 
4179   bind(TAIL_ADJUST);
4180   // there are negative bits in the last 4 byte block.
4181   // Adjust result and check the next three bytes
4182   addptr(result, len);
4183   orl(result, 3);
4184   lea(ary1, Address(ary1, len, Address::times_1));
4185   jmpb(COMPARE_CHAR);
4186 
4187   bind(CHAR_ADJUST);
4188   // We are looking at a char + optional byte tail, and found that one
4189   // of the bytes in the char is negative. Adjust the result, check the
4190   // first byte and readjust if needed.
4191   andl(result, 0xfffffffc);
4192   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4193   jccb(Assembler::notZero, DONE);
4194   addptr(result, 1);
4195 
4196   // That's it
4197   bind(DONE);
4198   if (UseAVX >= 2 && UseSSE >= 2) {
4199     // clean upper bits of YMM registers
4200     vpxor(vec1, vec1);
4201     vpxor(vec2, vec2);
4202   }
4203 }
4204 
4205 // Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
4206 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4207                                       Register limit, Register result, Register chr,
4208                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4209                                       KRegister mask, bool expand_ary2) {
4210   // for expand_ary2, limit is the (smaller) size of the second array.
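       // In that mode ary1 holds 2-byte (char) elements while ary2 holds 1-byte elements
       // that are zero-extended before comparison, so ary1 is addressed with a times_2
       // scale and limit/result count ary2 bytes.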
4211   ShortBranchVerifier sbv(this);
4212   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4213 
4214   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4215          "Expansion only implemented for AVX2");
4216 
4217   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4218   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4219 
4220   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4221   int scaleIncr = expand_ary2 ? 8 : 16;
4222 
4223   if (is_array_equ) {
4224     // Check the input args
4225     cmpoop(ary1, ary2);
4226     jcc(Assembler::equal, TRUE_LABEL);
4227 
4228     // Need additional checks for arrays_equals.
4229     testptr(ary1, ary1);
4230     jcc(Assembler::zero, FALSE_LABEL);
4231     testptr(ary2, ary2);
4232     jcc(Assembler::zero, FALSE_LABEL);
4233 
4234     // Check the lengths
4235     movl(limit, Address(ary1, length_offset));
4236     cmpl(limit, Address(ary2, length_offset));
4237     jcc(Assembler::notEqual, FALSE_LABEL);
4238   }
4239 
4240   // count == 0
4241   testl(limit, limit);
4242   jcc(Assembler::zero, TRUE_LABEL);
4243 
4244   if (is_array_equ) {
4245     // Load array address
4246     lea(ary1, Address(ary1, base_offset));
4247     lea(ary2, Address(ary2, base_offset));
4248   }
4249 
4250   if (is_array_equ && is_char) {
4251     // arrays_equals when used for char[].
4252     shll(limit, 1);      // convert char count to byte count (still != 0)
4253   }
4254   movl(result, limit); // copy
4255 
4256   if (UseAVX >= 2) {
4257     // With AVX2, use 32-byte vector compare
4258     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4259 
4260     // Compare 32-byte vectors
4261     if (expand_ary2) {
4262       andl(result, 0x0000000f);  //   tail count (in bytes)
4263       andl(limit, 0xfffffff0);   // vector count (in bytes)
4264       jcc(Assembler::zero, COMPARE_TAIL);
4265     } else {
4266       andl(result, 0x0000001f);  //   tail count (in bytes)
4267       andl(limit, 0xffffffe0);   // vector count (in bytes)
4268       jcc(Assembler::zero, COMPARE_TAIL_16);
4269     }
4270 
4271     lea(ary1, Address(ary1, limit, scaleFactor));
4272     lea(ary2, Address(ary2, limit, Address::times_1));
4273     negptr(limit);
4274 
4275 #ifdef _LP64
4276     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4277       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4278 
4279       cmpl(limit, -64);
4280       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4281 
4282       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4283 
4284       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4285       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4286       kortestql(mask, mask);
4287       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4288       addptr(limit, 64);  // update since we already compared at this addr
4289       cmpl(limit, -64);
4290       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4291 
4292       // At this point we may still need to compare -limit+result bytes.
4293       // We could execute the next two instructions and just continue via the non-wide path:
4294       //  cmpl(limit, 0);
4295       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4296       // But since we stopped at the points ary{1,2}+limit which are
4297       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4298       // (|limit| <= 32 and result < 32),
4299       // we may just compare the last 64 bytes.
4300       //
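           // For example, for a 100-byte compare the loop exits with limit == -32 and
           // result == 4; result-64 == -60 addresses bytes 36..99, covering the 36 bytes
           // not yet compared at the cost of re-checking 28 already-equal ones.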
4301       addptr(result, -64);   // it is safe because the loop just compared this area
4302       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4303       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4304       kortestql(mask, mask);
4305       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4306 
4307       jmp(TRUE_LABEL);
4308 
4309       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4310 
4311     }//if (VM_Version::supports_avx512vlbw())
4312 #endif //_LP64
4313     bind(COMPARE_WIDE_VECTORS);
4314     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4315     if (expand_ary2) {
4316       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4317     } else {
4318       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4319     }
4320     vpxor(vec1, vec2);
4321 
4322     vptest(vec1, vec1);
4323     jcc(Assembler::notZero, FALSE_LABEL);
4324     addptr(limit, scaleIncr * 2);
4325     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4326 
4327     testl(result, result);
4328     jcc(Assembler::zero, TRUE_LABEL);
4329 
4330     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4331     if (expand_ary2) {
4332       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4333     } else {
4334       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4335     }
4336     vpxor(vec1, vec2);
4337 
4338     vptest(vec1, vec1);
4339     jcc(Assembler::notZero, FALSE_LABEL);
4340     jmp(TRUE_LABEL);
4341 
4342     bind(COMPARE_TAIL_16); // limit is zero
4343     movl(limit, result);
4344 
4345     // Compare 16-byte chunks
4346     andl(result, 0x0000000f);  //   tail count (in bytes)
4347     andl(limit, 0xfffffff0);   // vector count (in bytes)
4348     jcc(Assembler::zero, COMPARE_TAIL);
4349 
4350     lea(ary1, Address(ary1, limit, scaleFactor));
4351     lea(ary2, Address(ary2, limit, Address::times_1));
4352     negptr(limit);
4353 
4354     bind(COMPARE_WIDE_VECTORS_16);
4355     movdqu(vec1, Address(ary1, limit, scaleFactor));
4356     if (expand_ary2) {
4357       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4358     } else {
4359       movdqu(vec2, Address(ary2, limit, Address::times_1));
4360     }
4361     pxor(vec1, vec2);
4362 
4363     ptest(vec1, vec1);
4364     jcc(Assembler::notZero, FALSE_LABEL);
4365     addptr(limit, scaleIncr);
4366     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4367 
4368     bind(COMPARE_TAIL); // limit is zero
4369     movl(limit, result);
4370     // Fallthru to tail compare
4371   } else if (UseSSE42Intrinsics) {
4372     // With SSE4.2, use double quad vector compare
4373     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4374 
4375     // Compare 16-byte vectors
4376     andl(result, 0x0000000f);  //   tail count (in bytes)
4377     andl(limit, 0xfffffff0);   // vector count (in bytes)
4378     jcc(Assembler::zero, COMPARE_TAIL);
4379 
4380     lea(ary1, Address(ary1, limit, Address::times_1));
4381     lea(ary2, Address(ary2, limit, Address::times_1));
4382     negptr(limit);
4383 
4384     bind(COMPARE_WIDE_VECTORS);
4385     movdqu(vec1, Address(ary1, limit, Address::times_1));
4386     movdqu(vec2, Address(ary2, limit, Address::times_1));
4387     pxor(vec1, vec2);
4388 
4389     ptest(vec1, vec1);
4390     jcc(Assembler::notZero, FALSE_LABEL);
4391     addptr(limit, 16);
4392     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4393 
4394     testl(result, result);
4395     jcc(Assembler::zero, TRUE_LABEL);
4396 
4397     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4398     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4399     pxor(vec1, vec2);
4400 
4401     ptest(vec1, vec1);
4402     jccb(Assembler::notZero, FALSE_LABEL);
4403     jmpb(TRUE_LABEL);
4404 
4405     bind(COMPARE_TAIL); // limit is zero
4406     movl(limit, result);
4407     // Fallthru to tail compare
4408   }
4409 
4410   // Compare 4-byte vectors
4411   if (expand_ary2) {
4412     testl(result, result);
4413     jccb(Assembler::zero, TRUE_LABEL);
4414   } else {
4415     andl(limit, 0xfffffffc); // vector count (in bytes)
4416     jccb(Assembler::zero, COMPARE_CHAR);
4417   }
4418 
4419   lea(ary1, Address(ary1, limit, scaleFactor));
4420   lea(ary2, Address(ary2, limit, Address::times_1));
4421   negptr(limit);
4422 
4423   bind(COMPARE_VECTORS);
4424   if (expand_ary2) {
4425     // There are no "vector" operations for bytes to shorts
4426     movzbl(chr, Address(ary2, limit, Address::times_1));
4427     cmpw(Address(ary1, limit, Address::times_2), chr);
4428     jccb(Assembler::notEqual, FALSE_LABEL);
4429     addptr(limit, 1);
4430     jcc(Assembler::notZero, COMPARE_VECTORS);
4431     jmp(TRUE_LABEL);
4432   } else {
4433     movl(chr, Address(ary1, limit, Address::times_1));
4434     cmpl(chr, Address(ary2, limit, Address::times_1));
4435     jccb(Assembler::notEqual, FALSE_LABEL);
4436     addptr(limit, 4);
4437     jcc(Assembler::notZero, COMPARE_VECTORS);
4438   }
4439 
4440   // Compare trailing char (final 2 bytes), if any
4441   bind(COMPARE_CHAR);
4442   testl(result, 0x2);   // tail  char
4443   jccb(Assembler::zero, COMPARE_BYTE);
4444   load_unsigned_short(chr, Address(ary1, 0));
4445   load_unsigned_short(limit, Address(ary2, 0));
4446   cmpl(chr, limit);
4447   jccb(Assembler::notEqual, FALSE_LABEL);
4448 
4449   if (is_array_equ && is_char) {
4450     bind(COMPARE_BYTE);
4451   } else {
4452     lea(ary1, Address(ary1, 2));
4453     lea(ary2, Address(ary2, 2));
4454 
4455     bind(COMPARE_BYTE);
4456     testl(result, 0x1);   // tail  byte
4457     jccb(Assembler::zero, TRUE_LABEL);
4458     load_unsigned_byte(chr, Address(ary1, 0));
4459     load_unsigned_byte(limit, Address(ary2, 0));
4460     cmpl(chr, limit);
4461     jccb(Assembler::notEqual, FALSE_LABEL);
4462   }
4463   bind(TRUE_LABEL);
4464   movl(result, 1);   // return true
4465   jmpb(DONE);
4466 
4467   bind(FALSE_LABEL);
4468   xorl(result, result); // return false
4469 
4470   // That's it
4471   bind(DONE);
4472   if (UseAVX >= 2) {
4473     // clean upper bits of YMM registers
4474     vpxor(vec1, vec1);
4475     vpxor(vec2, vec2);
4476   }
4477 }
4478 
4479 #ifdef _LP64
4480 
4481 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4482 #define __ masm.
4483   Register dst = stub.data<0>();
4484   XMMRegister src = stub.data<1>();
4485   address target = stub.data<2>();
4486   __ bind(stub.entry());
4487   __ subptr(rsp, 8);
4488   __ movdbl(Address(rsp), src);
4489   __ call(RuntimeAddress(target));
4490   __ pop(dst);
4491   __ jmp(stub.continuation());
4492 #undef __
4493 }
4494 
4495 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4496   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4497   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4498 
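       // cvttss2si/cvttsd2si produce the 'integer indefinite' value (0x80000000 for
       // int, 0x8000000000000000 for long) when the source is NaN or out of range, so
       // comparing the result against that value detects when the fixup stub must run.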
4499   address slowpath_target;
4500   if (dst_bt == T_INT) {
4501     if (src_bt == T_FLOAT) {
4502       cvttss2sil(dst, src);
4503       cmpl(dst, 0x80000000);
4504       slowpath_target = StubRoutines::x86::f2i_fixup();
4505     } else {
4506       cvttsd2sil(dst, src);
4507       cmpl(dst, 0x80000000);
4508       slowpath_target = StubRoutines::x86::d2i_fixup();
4509     }
4510   } else {
4511     if (src_bt == T_FLOAT) {
4512       cvttss2siq(dst, src);
4513       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4514       slowpath_target = StubRoutines::x86::f2l_fixup();
4515     } else {
4516       cvttsd2siq(dst, src);
4517       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4518       slowpath_target = StubRoutines::x86::d2l_fixup();
4519     }
4520   }
4521 
4522   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4523   jcc(Assembler::equal, stub->entry());
4524   bind(stub->continuation());
4525 }
4526 
4527 #endif // _LP64
4528 
4529 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4530                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4531   switch(ideal_opc) {
4532     case Op_LShiftVS:
4533       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4534     case Op_LShiftVI:
4535       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4536     case Op_LShiftVL:
4537       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4538     case Op_RShiftVS:
4539       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4540     case Op_RShiftVI:
4541       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4542     case Op_RShiftVL:
4543       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4544     case Op_URShiftVS:
4545       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4546     case Op_URShiftVI:
4547       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4548     case Op_URShiftVL:
4549       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4550     case Op_RotateRightV:
4551       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4552     case Op_RotateLeftV:
4553       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4554     default:
4555       fatal("Unsupported masked operation"); break;
4556   }
4557 }
4558 
4559 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4560                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4561                                     bool is_varshift) {
4562   switch (ideal_opc) {
4563     case Op_AddVB:
4564       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4565     case Op_AddVS:
4566       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4567     case Op_AddVI:
4568       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4569     case Op_AddVL:
4570       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4571     case Op_AddVF:
4572       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4573     case Op_AddVD:
4574       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4575     case Op_SubVB:
4576       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4577     case Op_SubVS:
4578       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4579     case Op_SubVI:
4580       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4581     case Op_SubVL:
4582       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4583     case Op_SubVF:
4584       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4585     case Op_SubVD:
4586       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4587     case Op_MulVS:
4588       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4589     case Op_MulVI:
4590       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4591     case Op_MulVL:
4592       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4593     case Op_MulVF:
4594       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4595     case Op_MulVD:
4596       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4597     case Op_DivVF:
4598       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4599     case Op_DivVD:
4600       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4601     case Op_SqrtVF:
4602       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4603     case Op_SqrtVD:
4604       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4605     case Op_AbsVB:
4606       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4607     case Op_AbsVS:
4608       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4609     case Op_AbsVI:
4610       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4611     case Op_AbsVL:
4612       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4613     case Op_FmaVF:
4614       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4615     case Op_FmaVD:
4616       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4617     case Op_VectorRearrange:
4618       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4619     case Op_LShiftVS:
4620       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4621     case Op_LShiftVI:
4622       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4623     case Op_LShiftVL:
4624       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4625     case Op_RShiftVS:
4626       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4627     case Op_RShiftVI:
4628       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4629     case Op_RShiftVL:
4630       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4631     case Op_URShiftVS:
4632       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4633     case Op_URShiftVI:
4634       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4635     case Op_URShiftVL:
4636       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4637     case Op_RotateLeftV:
4638       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4639     case Op_RotateRightV:
4640       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4641     case Op_MaxV:
4642       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4643     case Op_MinV:
4644       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4645     case Op_XorV:
4646       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4647     case Op_OrV:
4648       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4649     case Op_AndV:
4650       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4651     default:
4652       fatal("Unsupported masked operation"); break;
4653   }
4654 }
4655 
4656 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4657                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4658   switch (ideal_opc) {
4659     case Op_AddVB:
4660       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4661     case Op_AddVS:
4662       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4663     case Op_AddVI:
4664       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4665     case Op_AddVL:
4666       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4667     case Op_AddVF:
4668       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4669     case Op_AddVD:
4670       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4671     case Op_SubVB:
4672       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4673     case Op_SubVS:
4674       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4675     case Op_SubVI:
4676       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4677     case Op_SubVL:
4678       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4679     case Op_SubVF:
4680       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4681     case Op_SubVD:
4682       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4683     case Op_MulVS:
4684       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4685     case Op_MulVI:
4686       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4687     case Op_MulVL:
4688       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4689     case Op_MulVF:
4690       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4691     case Op_MulVD:
4692       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4693     case Op_DivVF:
4694       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4695     case Op_DivVD:
4696       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4697     case Op_FmaVF:
4698       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4699     case Op_FmaVD:
4700       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4701     case Op_MaxV:
4702       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4703     case Op_MinV:
4704       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4705     case Op_XorV:
4706       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4707     case Op_OrV:
4708       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4709     case Op_AndV:
4710       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4711     default:
4712       fatal("Unsupported masked operation"); break;
4713   }
4714 }
4715 
4716 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4717                                   KRegister src1, KRegister src2) {
4718   BasicType etype = T_ILLEGAL;
4719   switch(mask_len) {
4720     case 2:
4721     case 4:
4722     case 8:  etype = T_BYTE; break;
4723     case 16: etype = T_SHORT; break;
4724     case 32: etype = T_INT; break;
4725     case 64: etype = T_LONG; break;
4726     default: fatal("Unsupported type"); break;
4727   }
4728   assert(etype != T_ILLEGAL, "");
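       // The element type only encodes how many mask bits the k-register logical
       // instruction below has to cover (up to 8, 16, 32 or 64 lanes).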
4729   switch(ideal_opc) {
4730     case Op_AndVMask:
4731       kand(etype, dst, src1, src2); break;
4732     case Op_OrVMask:
4733       kor(etype, dst, src1, src2); break;
4734     case Op_XorVMask:
4735       kxor(etype, dst, src1, src2); break;
4736     default:
4737       fatal("Unsupported masked operation"); break;
4738   }
4739 }
4740 
4741 /*
4742  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4743  * If src is NaN, the result is 0.
4744  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4745  * the result is equal to the value of Integer.MIN_VALUE.
4746  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4747  * the result is equal to the value of Integer.MAX_VALUE.
4748  */
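     // For example, mirroring Java's (int) cast: (int) Float.NaN == 0,
     // (int) Float.NEGATIVE_INFINITY == Integer.MIN_VALUE and
     // (int) 1.0e30f == Integer.MAX_VALUE.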
4749 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4750                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4751                                                                    Register rscratch, AddressLiteral float_sign_flip,
4752                                                                    int vec_enc) {
4753   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4754   Label done;
4755   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4756   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4757   vptest(xtmp2, xtmp2, vec_enc);
4758   jccb(Assembler::equal, done);
4759 
4760   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4761   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4762 
4763   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4764   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4765   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4766 
4767   // Recompute the mask for remaining special value.
4768   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4769   // Extract SRC values corresponding to TRUE mask lanes.
4770   vpand(xtmp4, xtmp2, src, vec_enc);
4771   // Flip mask bits so that the MSB of mask lanes corresponding to positive special
4772   // values is set.
4773   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4774 
4775   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4776   bind(done);
4777 }
4778 
4779 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4780                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4781                                                                     Register rscratch, AddressLiteral float_sign_flip,
4782                                                                     int vec_enc) {
4783   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4784   Label done;
4785   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4786   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4787   kortestwl(ktmp1, ktmp1);
4788   jccb(Assembler::equal, done);
4789 
4790   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4791   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4792   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4793 
4794   kxorwl(ktmp1, ktmp1, ktmp2);
4795   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4796   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4797   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4798   bind(done);
4799 }
4800 
4801 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4802                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4803                                                                      Register rscratch, AddressLiteral double_sign_flip,
4804                                                                      int vec_enc) {
4805   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4806 
4807   Label done;
4808   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4809   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4810   kortestwl(ktmp1, ktmp1);
4811   jccb(Assembler::equal, done);
4812 
4813   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4814   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4815   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4816 
4817   kxorwl(ktmp1, ktmp1, ktmp2);
4818   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4819   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4820   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4821   bind(done);
4822 }
4823 
4824 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4825                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4826                                                                      Register rscratch, AddressLiteral float_sign_flip,
4827                                                                      int vec_enc) {
4828   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4829   Label done;
4830   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4831   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4832   kortestwl(ktmp1, ktmp1);
4833   jccb(Assembler::equal, done);
4834 
4835   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4836   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4837   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4838 
4839   kxorwl(ktmp1, ktmp1, ktmp2);
4840   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4841   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4842   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4843   bind(done);
4844 }
4845 
4846 /*
4847  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4848  * If src is NaN, the result is 0.
4849  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4850  * the result is equal to the value of Long.MIN_VALUE.
4851  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4852  * the result is equal to the value of Long.MAX_VALUE.
4853  */
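     // For example, mirroring Java's (long) cast: (long) Double.NaN == 0L,
     // (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE and
     // (long) 1.0e30 == Long.MAX_VALUE.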
4854 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4855                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4856                                                                       Register rscratch, AddressLiteral double_sign_flip,
4857                                                                       int vec_enc) {
4858   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4859 
4860   Label done;
4861   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4862   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4863   kortestwl(ktmp1, ktmp1);
4864   jccb(Assembler::equal, done);
4865 
4866   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4867   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4868   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4869 
4870   kxorwl(ktmp1, ktmp1, ktmp2);
4871   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4872   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4873   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4874   bind(done);
4875 }
4876 
4877 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4878                                                              XMMRegister xtmp, int index, int vec_enc) {
4879    assert(vec_enc < Assembler::AVX_512bit, "");
4880    if (vec_enc == Assembler::AVX_256bit) {
4881      vextractf128_high(xtmp, src);
4882      vshufps(dst, src, xtmp, index, vec_enc);
4883    } else {
4884      vshufps(dst, src, zero, index, vec_enc);
4885    }
4886 }
4887 
4888 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4889                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4890                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4891   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4892 
4893   Label done;
4894   // Compare the destination lanes with float_sign_flip
4895   // value to get mask for all special values.
4896   movdqu(xtmp1, float_sign_flip, rscratch);
4897   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4898   ptest(xtmp2, xtmp2);
4899   jccb(Assembler::equal, done);
4900 
4901   // Flip float_sign_flip to get max integer value.
4902   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4903   pxor(xtmp1, xtmp4);
4904 
4905   // Set destination lanes corresponding to unordered source lanes to zero.
4906   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4907   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4908 
4909   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4910   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4911   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4912 
4913   // Recompute the mask for remaining special value.
4914   pxor(xtmp2, xtmp3);
4915   // Extract mask corresponding to non-negative source lanes.
4916   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4917 
4918   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4919   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4920   pand(xtmp3, xtmp2);
4921 
4922   // Replace destination lanes holding the special value (0x80000000) with max int
4923   // if the corresponding source lane holds a positive value.
4924   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4925   bind(done);
4926 }
4927 
4928 
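     // Narrow int lanes to short/byte: mask off the upper bits first so the unsigned
     // saturating packs (vpackusdw/vpackuswb) cannot clamp the values, then fix up the
     // cross-lane ordering for 256-bit vectors.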
4929 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4930                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4931   switch(to_elem_bt) {
4932     case T_SHORT:
4933       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4934       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4935       vpackusdw(dst, dst, zero, vec_enc);
4936       if (vec_enc == Assembler::AVX_256bit) {
4937         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4938       }
4939       break;
4940     case  T_BYTE:
4941       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4942       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4943       vpackusdw(dst, dst, zero, vec_enc);
4944       if (vec_enc == Assembler::AVX_256bit) {
4945         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4946       }
4947       vpackuswb(dst, dst, zero, vec_enc);
4948       break;
4949     default: assert(false, "%s", type2name(to_elem_bt));
4950   }
4951 }
4952 
4953 /*
4954  * Algorithm for vector D2L and F2I conversions:
4955  * a) Perform the vector D2L/F2I cast.
4956  * b) Take the fast path if no result vector lane contains the value 0x80000000,
4957  *    the 'integer indefinite' value the conversion produces when the source is one
4958  *    of the special floating point values (NaN, -Inf, Inf, Max, -Min).
4959  * c) Set the destination to zero where the source is NaN.
4960  * d) Replace 0x80000000 with MaxInt where the source lane holds a positive value.
4961  */
4962 
4963 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4964                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4965                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4966   int to_elem_sz = type2aelembytes(to_elem_bt);
4967   assert(to_elem_sz <= 4, "");
4968   vcvttps2dq(dst, src, vec_enc);
4969   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4970   if (to_elem_sz < 4) {
4971     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4972     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4973   }
4974 }
4975 
4976 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4977                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4978                                             Register rscratch, int vec_enc) {
4979   int to_elem_sz = type2aelembytes(to_elem_bt);
4980   assert(to_elem_sz <= 4, "");
4981   vcvttps2dq(dst, src, vec_enc);
4982   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4983   switch(to_elem_bt) {
4984     case T_INT:
4985       break;
4986     case T_SHORT:
4987       evpmovdw(dst, dst, vec_enc);
4988       break;
4989     case T_BYTE:
4990       evpmovdb(dst, dst, vec_enc);
4991       break;
4992     default: assert(false, "%s", type2name(to_elem_bt));
4993   }
4994 }
4995 
4996 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4997                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4998                                             Register rscratch, int vec_enc) {
4999   evcvttps2qq(dst, src, vec_enc);
5000   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5001 }
5002 
5003 // Handling for downcasting from double to integer or sub-word types on AVX2.
5004 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5005                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5006                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5007   int to_elem_sz = type2aelembytes(to_elem_bt);
5008   assert(to_elem_sz < 8, "");
5009   vcvttpd2dq(dst, src, vec_enc);
5010   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5011                                               float_sign_flip, vec_enc);
5012   if (to_elem_sz < 4) {
5013     // xtmp4 holds all zero lanes.
5014     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5015   }
5016 }
5017 
5018 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5019                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5020                                             KRegister ktmp2, AddressLiteral sign_flip,
5021                                             Register rscratch, int vec_enc) {
5022   if (VM_Version::supports_avx512dq()) {
5023     evcvttpd2qq(dst, src, vec_enc);
5024     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5025     switch(to_elem_bt) {
5026       case T_LONG:
5027         break;
5028       case T_INT:
5029         evpmovsqd(dst, dst, vec_enc);
5030         break;
5031       case T_SHORT:
5032         evpmovsqd(dst, dst, vec_enc);
5033         evpmovdw(dst, dst, vec_enc);
5034         break;
5035       case T_BYTE:
5036         evpmovsqd(dst, dst, vec_enc);
5037         evpmovdb(dst, dst, vec_enc);
5038         break;
5039       default: assert(false, "%s", type2name(to_elem_bt));
5040     }
5041   } else {
5042     assert(type2aelembytes(to_elem_bt) <= 4, "");
5043     vcvttpd2dq(dst, src, vec_enc);
5044     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5045     switch(to_elem_bt) {
5046       case T_INT:
5047         break;
5048       case T_SHORT:
5049         evpmovdw(dst, dst, vec_enc);
5050         break;
5051       case T_BYTE:
5052         evpmovdb(dst, dst, vec_enc);
5053         break;
5054       default: assert(false, "%s", type2name(to_elem_bt));
5055     }
5056   }
5057 }
5058 
5059 #ifdef _LP64
5060 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5061                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5062                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5063   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5064   // then restore the original MXCSR.RC mode afterwards.
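       // For example 2.5 + 0.5 = 3.0 floors to 3 and -2.5 + 0.5 = -2.0 floors to -2,
       // giving the round-half-up behavior of Math.round.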
5065   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5066 
5067   mov64(tmp, julong_cast(0.5L));
5068   evpbroadcastq(xtmp1, tmp, vec_enc);
5069   vaddpd(xtmp1, src , xtmp1, vec_enc);
5070   evcvtpd2qq(dst, xtmp1, vec_enc);
5071   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5072                                                 double_sign_flip, vec_enc);
5073 
5074   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5075 }
5076 
5077 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5078                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5079                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5080   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5081   // then restore the original MXCSR.RC mode afterwards.
5082   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5083 
5084   movl(tmp, jint_cast(0.5));
5085   movq(xtmp1, tmp);
5086   vbroadcastss(xtmp1, xtmp1, vec_enc);
5087   vaddps(xtmp1, src , xtmp1, vec_enc);
5088   vcvtps2dq(dst, xtmp1, vec_enc);
5089   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5090                                               float_sign_flip, vec_enc);
5091 
5092   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5093 }
5094 
5095 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5096                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5097                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5098   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5099   // then restore the original MXCSR.RC mode afterwards.
5100   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5101 
5102   movl(tmp, jint_cast(0.5));
5103   movq(xtmp1, tmp);
5104   vbroadcastss(xtmp1, xtmp1, vec_enc);
5105   vaddps(xtmp1, src , xtmp1, vec_enc);
5106   vcvtps2dq(dst, xtmp1, vec_enc);
5107   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5108 
5109   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5110 }
5111 #endif // _LP64
5112 
5113 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5114                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5115   switch (from_elem_bt) {
5116     case T_BYTE:
5117       switch (to_elem_bt) {
5118         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5119         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5120         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5121         default: ShouldNotReachHere();
5122       }
5123       break;
5124     case T_SHORT:
5125       switch (to_elem_bt) {
5126         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5127         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5128         default: ShouldNotReachHere();
5129       }
5130       break;
5131     case T_INT:
5132       assert(to_elem_bt == T_LONG, "");
5133       vpmovzxdq(dst, src, vlen_enc);
5134       break;
5135     default:
5136       ShouldNotReachHere();
5137   }
5138 }
5139 
5140 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5141                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5142   switch (from_elem_bt) {
5143     case T_BYTE:
5144       switch (to_elem_bt) {
5145         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5146         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5147         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5148         default: ShouldNotReachHere();
5149       }
5150       break;
5151     case T_SHORT:
5152       switch (to_elem_bt) {
5153         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5154         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5155         default: ShouldNotReachHere();
5156       }
5157       break;
5158     case T_INT:
5159       assert(to_elem_bt == T_LONG, "");
5160       vpmovsxdq(dst, src, vlen_enc);
5161       break;
5162     default:
5163       ShouldNotReachHere();
5164   }
5165 }
5166 
5167 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5168                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5169   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5170   assert(vlen_enc != AVX_512bit, "");
5171 
5172   int dst_bt_size = type2aelembytes(dst_bt);
5173   int src_bt_size = type2aelembytes(src_bt);
5174   if (dst_bt_size > src_bt_size) {
5175     switch (dst_bt_size / src_bt_size) {
5176       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5177       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5178       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5179       default: ShouldNotReachHere();
5180     }
5181   } else {
5182     assert(dst_bt_size < src_bt_size, "");
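         // Narrowing: the saturating packs work within 128-bit lanes, so for 256-bit
         // vectors a vpermq with selector 0x08 gathers the low quadword of each lane
         // into the bottom half.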
5183     switch (src_bt_size / dst_bt_size) {
5184       case 2: {
5185         if (vlen_enc == AVX_128bit) {
5186           vpacksswb(dst, src, src, vlen_enc);
5187         } else {
5188           vpacksswb(dst, src, src, vlen_enc);
5189           vpermq(dst, dst, 0x08, vlen_enc);
5190         }
5191         break;
5192       }
5193       case 4: {
5194         if (vlen_enc == AVX_128bit) {
5195           vpackssdw(dst, src, src, vlen_enc);
5196           vpacksswb(dst, dst, dst, vlen_enc);
5197         } else {
5198           vpackssdw(dst, src, src, vlen_enc);
5199           vpermq(dst, dst, 0x08, vlen_enc);
5200           vpacksswb(dst, dst, dst, AVX_128bit);
5201         }
5202         break;
5203       }
5204       case 8: {
5205         if (vlen_enc == AVX_128bit) {
5206           vpshufd(dst, src, 0x08, vlen_enc);
5207           vpackssdw(dst, dst, dst, vlen_enc);
5208           vpacksswb(dst, dst, dst, vlen_enc);
5209         } else {
5210           vpshufd(dst, src, 0x08, vlen_enc);
5211           vpermq(dst, dst, 0x08, vlen_enc);
5212           vpackssdw(dst, dst, dst, AVX_128bit);
5213           vpacksswb(dst, dst, dst, AVX_128bit);
5214         }
5215         break;
5216       }
5217       default: ShouldNotReachHere();
5218     }
5219   }
5220 }
5221 
5222 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5223                                    bool merge, BasicType bt, int vlen_enc) {
5224   if (bt == T_INT) {
5225     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5226   } else {
5227     assert(bt == T_LONG, "");
5228     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5229   }
5230 }
5231 
5232 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5233                                    bool merge, BasicType bt, int vlen_enc) {
5234   if (bt == T_INT) {
5235     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5236   } else {
5237     assert(bt == T_LONG, "");
5238     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5239   }
5240 }
5241 
5242 #ifdef _LP64
5243 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5244                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5245                                                int vec_enc) {
5246   int index = 0;
5247   int vindex = 0;
5248   mov64(rtmp1, 0x0101010101010101L);
5249   pdepq(rtmp1, src, rtmp1);
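       // rtmp1 now holds one 0x00/0x01 byte per mask bit, e.g. src == 0b1011 gives
       // bytes 01 01 00 01 00 ... (PDEP scatters the low bits of src to bit
       // positions 0, 8, ..., 56).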
5250   if (mask_len > 8) {
5251     movq(rtmp2, src);
5252     vpxor(xtmp, xtmp, xtmp, vec_enc);
5253     movq(xtmp, rtmp1);
5254   }
5255   movq(dst, rtmp1);
5256 
5257   mask_len -= 8;
5258   while (mask_len > 0) {
5259     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5260     index++;
5261     if ((index % 2) == 0) {
5262       pxor(xtmp, xtmp);
5263     }
5264     mov64(rtmp1, 0x0101010101010101L);
5265     shrq(rtmp2, 8);
5266     pdepq(rtmp1, rtmp2, rtmp1);
5267     pinsrq(xtmp, rtmp1, index % 2);
5268     vindex = index / 2;
5269     if (vindex) {
5270       // Write the entire 16-byte vector only when both 64-bit
5271       // lanes have been updated, to avoid redundant instructions.
5272       if (index % 2) {
5273         vinsertf128(dst, dst, xtmp, vindex);
5274       }
5275     } else {
5276       vmovdqu(dst, xtmp);
5277     }
5278     mask_len -= 8;
5279   }
5280 }
5281 
5282 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5283   switch(opc) {
5284     case Op_VectorMaskTrueCount:
5285       popcntq(dst, tmp);
5286       break;
5287     case Op_VectorMaskLastTrue:
5288       if (VM_Version::supports_lzcnt()) {
5289         lzcntq(tmp, tmp);
5290         movl(dst, 63);
5291         subl(dst, tmp);
5292       } else {
5293         movl(dst, -1);
5294         bsrq(tmp, tmp);
5295         cmov32(Assembler::notZero, dst, tmp);
5296       }
5297       break;
5298     case Op_VectorMaskFirstTrue:
5299       if (VM_Version::supports_bmi1()) {
5300         if (masklen < 32) {
5301           orl(tmp, 1 << masklen);
5302           tzcntl(dst, tmp);
5303         } else if (masklen == 32) {
5304           tzcntl(dst, tmp);
5305         } else {
5306           assert(masklen == 64, "");
5307           tzcntq(dst, tmp);
5308         }
5309       } else {
5310         if (masklen < 32) {
5311           orl(tmp, 1 << masklen);
5312           bsfl(dst, tmp);
5313         } else {
5314           assert(masklen == 32 || masklen == 64, "");
5315           movl(dst, masklen);
5316           if (masklen == 32)  {
5317             bsfl(tmp, tmp);
5318           } else {
5319             bsfq(tmp, tmp);
5320           }
5321           cmov32(Assembler::notZero, dst, tmp);
5322         }
5323       }
5324       break;
5325     case Op_VectorMaskToLong:
5326       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5327       break;
5328     default: assert(false, "Unhandled mask operation");
5329   }
5330 }
5331 
5332 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5333                                               int masklen, int masksize, int vec_enc) {
5334   assert(VM_Version::supports_popcnt(), "");
5335 
5336   if (VM_Version::supports_avx512bw()) {
5337     kmovql(tmp, mask);
5338   } else {
5339     assert(masklen <= 16, "");
5340     kmovwl(tmp, mask);
5341   }
5342 
5343   // A mask generated by partial vector comparison/replicate/mask manipulation
5344   // operations needs to be clipped.
5345   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5346     andq(tmp, (1 << masklen) - 1);
5347   }
5348 
5349   vector_mask_operation_helper(opc, dst, tmp, masklen);
5350 }
5351 
5352 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5353                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5354   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5355          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5356   assert(VM_Version::supports_popcnt(), "");
5357 
5358   bool need_clip = false;
5359   switch(bt) {
5360     case T_BOOLEAN:
5361       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5362       vpxor(xtmp, xtmp, xtmp, vec_enc);
5363       vpsubb(xtmp, xtmp, mask, vec_enc);
5364       vpmovmskb(tmp, xtmp, vec_enc);
5365       need_clip = masklen < 16;
5366       break;
5367     case T_BYTE:
5368       vpmovmskb(tmp, mask, vec_enc);
5369       need_clip = masklen < 16;
5370       break;
5371     case T_SHORT:
5372       vpacksswb(xtmp, mask, mask, vec_enc);
5373       if (masklen >= 16) {
5374         vpermpd(xtmp, xtmp, 8, vec_enc);
5375       }
5376       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5377       need_clip = masklen < 16;
5378       break;
5379     case T_INT:
5380     case T_FLOAT:
5381       vmovmskps(tmp, mask, vec_enc);
5382       need_clip = masklen < 4;
5383       break;
5384     case T_LONG:
5385     case T_DOUBLE:
5386       vmovmskpd(tmp, mask, vec_enc);
5387       need_clip = masklen < 2;
5388       break;
5389     default: assert(false, "Unhandled type, %s", type2name(bt));
5390   }
5391 
5392   // A mask generated by partial vector comparison/replicate/mask manipulation
5393   // operations needs to be clipped.
5394   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5395     // need_clip implies masklen < 32
5396     andq(tmp, (1 << masklen) - 1);
5397   }
5398 
5399   vector_mask_operation_helper(opc, dst, tmp, masklen);
5400 }
5401 
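     // Compress a k-mask: produce a mask whose lowest popcount(src) bits are set,
     // e.g. src == 0b1010 (mask_len 4) yields 0b0011. PEXT of an all-ones value
     // through the clipped mask packs one 1-bit per set mask bit into the low bits.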
5402 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5403                                              Register rtmp2, int mask_len) {
5404   kmov(rtmp1, src);
5405   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5406   mov64(rtmp2, -1L);
5407   pextq(rtmp2, rtmp2, rtmp1);
5408   kmov(dst, rtmp2);
5409 }
5410 
5411 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5412                                                     XMMRegister mask, Register rtmp, Register rscratch,
5413                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5414                                                     int vec_enc) {
5415   assert(type2aelembytes(bt) >= 4, "");
5416   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5417   address compress_perm_table = nullptr;
5418   address expand_perm_table = nullptr;
5419   if (type2aelembytes(bt) == 8) {
5420     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5421     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5422     vmovmskpd(rtmp, mask, vec_enc);
5423   } else {
5424     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5425     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5426     vmovmskps(rtmp, mask, vec_enc);
5427   }
5428   shlq(rtmp, 5); // for 32 byte permute row.
5429   if (opcode == Op_CompressV) {
5430     lea(rscratch, ExternalAddress(compress_perm_table));
5431   } else {
5432     lea(rscratch, ExternalAddress(expand_perm_table));
5433   }
5434   addptr(rtmp, rscratch);
5435   vmovdqu(permv, Address(rtmp));
5436   vpermps(dst, permv, src, Assembler::AVX_256bit);
5437   vpxor(xtmp, xtmp, xtmp, vec_enc);
5438   // Blend the result with a zero vector using the permute mask: each column entry
5439   // in a permute table row contains either a valid permute index or a -1 (default)
5440   // value, so the row can also serve as a blending mask after
5441   // compressing/expanding the source vector lanes.
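       // For example, with 64-bit lanes and mask 0b0101 the selected compress row moves
       // lanes 0 and 2 into positions 0 and 1, and its remaining -1 entries have the
       // sign bit set, so the blend zeroes those positions (assuming the table layout
       // generated by the stubs named above).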
5442   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5443 }
5444 
5445 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5446                                                bool merge, BasicType bt, int vec_enc) {
5447   if (opcode == Op_CompressV) {
5448     switch(bt) {
5449     case T_BYTE:
5450       evpcompressb(dst, mask, src, merge, vec_enc);
5451       break;
5452     case T_CHAR:
5453     case T_SHORT:
5454       evpcompressw(dst, mask, src, merge, vec_enc);
5455       break;
5456     case T_INT:
5457       evpcompressd(dst, mask, src, merge, vec_enc);
5458       break;
5459     case T_FLOAT:
5460       evcompressps(dst, mask, src, merge, vec_enc);
5461       break;
5462     case T_LONG:
5463       evpcompressq(dst, mask, src, merge, vec_enc);
5464       break;
5465     case T_DOUBLE:
5466       evcompresspd(dst, mask, src, merge, vec_enc);
5467       break;
5468     default:
5469       fatal("Unsupported type %s", type2name(bt));
5470       break;
5471     }
5472   } else {
5473     assert(opcode == Op_ExpandV, "");
5474     switch(bt) {
5475     case T_BYTE:
5476       evpexpandb(dst, mask, src, merge, vec_enc);
5477       break;
5478     case T_CHAR:
5479     case T_SHORT:
5480       evpexpandw(dst, mask, src, merge, vec_enc);
5481       break;
5482     case T_INT:
5483       evpexpandd(dst, mask, src, merge, vec_enc);
5484       break;
5485     case T_FLOAT:
5486       evexpandps(dst, mask, src, merge, vec_enc);
5487       break;
5488     case T_LONG:
5489       evpexpandq(dst, mask, src, merge, vec_enc);
5490       break;
5491     case T_DOUBLE:
5492       evexpandpd(dst, mask, src, merge, vec_enc);
5493       break;
5494     default:
5495       fatal("Unsupported type %s", type2name(bt));
5496       break;
5497     }
5498   }
5499 }
5500 #endif
5501 
5502 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5503                                            KRegister ktmp1, int vec_enc) {
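  // Signum semantics: dst = -1 if src < 0, +1 if src > 0, and src itself if src is
  // NaN, -0.0 or +0.0. The EQ_UQ predicate used below is true for equal-or-unordered
  // compares, so it selects both the +/-0.0 lanes and the NaN lanes in one mask.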
5504   if (opcode == Op_SignumVD) {
5505     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5507     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5508     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src unchanged.
5510     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5511     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5512   } else {
5513     assert(opcode == Op_SignumVF, "");
5514     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5516     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5517     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src unchanged.
5519     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5520     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5521   }
5522 }
5523 
5524 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5525                                           XMMRegister xtmp1, int vec_enc) {
5526   if (opcode == Op_SignumVD) {
5527     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5529     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src unchanged.
5531     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5532     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5533   } else {
5534     assert(opcode == Op_SignumVF, "");
5535     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5537     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src unchanged.
5539     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5540     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5541   }
5542 }
5543 
5544 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
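  // MaskAll semantics: src is expected to hold either 0 (all-false) or -1 (all-true);
  // for the shifted paths below, shifting the loaded value right by
  // (k-register width - mask_len) leaves exactly mask_len set bits in the all-true
  // case and zero bits otherwise.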
5545   if (VM_Version::supports_avx512bw()) {
5546     if (mask_len > 32) {
5547       kmovql(dst, src);
5548     } else {
5549       kmovdl(dst, src);
5550       if (mask_len != 32) {
5551         kshiftrdl(dst, dst, 32 - mask_len);
5552       }
5553     }
5554   } else {
5555     assert(mask_len <= 16, "");
5556     kmovwl(dst, src);
5557     if (mask_len != 16) {
5558       kshiftrwl(dst, dst, 16 - mask_len);
5559     }
5560   }
5561 }
5562 
5563 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5564   int lane_size = type2aelembytes(bt);
5565   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5566   if ((is_LP64 || lane_size < 8) &&
5567       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5568        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5569     movptr(rtmp, imm32);
5570     switch(lane_size) {
5571       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5572       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5573       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5574       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5577     }
5578   } else {
5579     movptr(rtmp, imm32);
5580     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5581     switch(lane_size) {
5582       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5583       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5584       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5585       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5588     }
5589   }
5590 }
5591 
5592 //
// Following is the lookup table based popcount computation algorithm:
5594 //       Index   Bit set count
5595 //     [ 0000 ->   0,
5596 //       0001 ->   1,
5597 //       0010 ->   1,
5598 //       0011 ->   2,
5599 //       0100 ->   1,
5600 //       0101 ->   2,
5601 //       0110 ->   2,
5602 //       0111 ->   3,
5603 //       1000 ->   1,
5604 //       1001 ->   2,
//       1010 ->   2,
5606 //       1011 ->   3,
5607 //       1100 ->   2,
5608 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5610 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5611 //     shuffle indices for lookup table access.
5612 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5614 //     shuffle indices for lookup table access.
5615 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5616 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5617 //     count of all the bytes of a quadword.
5618 //  f. Perform step e. for upper 128bit vector lane.
5619 //  g. Pack the bitset count of quadwords back to double word.
5620 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
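//
//  As a scalar illustration of steps a. to d. (a sketch only, not used by the code
//  below): for the byte 0xB5 (0b10110101) the lower nibble 0x5 looks up 2 and the
//  upper nibble 0xB looks up 3 in the table above, giving a byte popcount of 5.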
5621 
5622 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5623                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5624   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5625   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5626   vpsrlw(dst, src, 4, vec_enc);
5627   vpand(dst, dst, xtmp1, vec_enc);
5628   vpand(xtmp1, src, xtmp1, vec_enc);
5629   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5630   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5631   vpshufb(dst, xtmp2, dst, vec_enc);
5632   vpaddb(dst, dst, xtmp1, vec_enc);
5633 }
5634 
5635 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5636                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5637   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5638   // Following code is as per steps e,f,g and h of above algorithm.
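  // vpsadbw against an all-zero vector yields, per quadword, the sum of its eight
  // byte values, i.e. it horizontally adds the per-byte popcounts.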
5639   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5640   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5641   vpsadbw(dst, dst, xtmp2, vec_enc);
5642   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5643   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5644   vpackuswb(dst, xtmp1, dst, vec_enc);
5645 }
5646 
5647 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5648                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5649   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5650   // Add the popcount of upper and lower bytes of word.
5651   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5652   vpsrlw(dst, xtmp1, 8, vec_enc);
5653   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5654   vpaddw(dst, dst, xtmp1, vec_enc);
5655 }
5656 
5657 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5658                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5659   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5660   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5661   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5662 }
5663 
5664 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5665                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5666   switch(bt) {
5667     case T_LONG:
5668       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5669       break;
5670     case T_INT:
5671       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5672       break;
5673     case T_CHAR:
5674     case T_SHORT:
5675       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5676       break;
5677     case T_BYTE:
5678     case T_BOOLEAN:
5679       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5680       break;
5681     default:
5682       fatal("Unsupported type %s", type2name(bt));
5683       break;
5684   }
5685 }
5686 
5687 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5688                                                       KRegister mask, bool merge, int vec_enc) {
5689   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5690   switch(bt) {
5691     case T_LONG:
5692       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5693       evpopcntq(dst, mask, src, merge, vec_enc);
5694       break;
5695     case T_INT:
5696       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5697       evpopcntd(dst, mask, src, merge, vec_enc);
5698       break;
5699     case T_CHAR:
5700     case T_SHORT:
5701       assert(VM_Version::supports_avx512_bitalg(), "");
5702       evpopcntw(dst, mask, src, merge, vec_enc);
5703       break;
5704     case T_BYTE:
5705     case T_BOOLEAN:
5706       assert(VM_Version::supports_avx512_bitalg(), "");
5707       evpopcntb(dst, mask, src, merge, vec_enc);
5708       break;
5709     default:
5710       fatal("Unsupported type %s", type2name(bt));
5711       break;
5712   }
5713 }
5714 
5715 #ifndef _LP64
5716 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5717   assert(VM_Version::supports_avx512bw(), "");
5718   kmovdl(tmp, src);
5719   kunpckdql(dst, tmp, tmp);
5720 }
5721 #endif
5722 
// The bit reversal algorithm first reverses the bits of each byte and then
// performs a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table to obtain the reversed bit sequence
// corresponding to a 4 bit value; the reversed bit sequence of a byte is then
// obtained by swapping the reversed bit sequences of its upper and lower
// nibbles.
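// For example, for the byte 0xA3 (0b10100011): the reversed lower nibble 0b0011 is
// 0b1100 and the reversed upper nibble 0b1010 is 0b0101; swapping the two nibbles
// gives 0b11000101 == 0xC5, the bit-reversed byte.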
5729 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5730                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5731   if (VM_Version::supports_avx512vlbw()) {
5732 
5733     // Get the reverse bit sequence of lower nibble of each byte.
5734     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5735     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5736     evpandq(dst, xtmp2, src, vec_enc);
5737     vpshufb(dst, xtmp1, dst, vec_enc);
5738     vpsllq(dst, dst, 4, vec_enc);
5739 
5740     // Get the reverse bit sequence of upper nibble of each byte.
5741     vpandn(xtmp2, xtmp2, src, vec_enc);
5742     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5743     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5744 
    // OR the left shifted reversed bit sequence of the lower nibble with the right
    // shifted reversed bit sequence of the upper nibble to obtain the reversed bit
    // sequence of each byte.
5747     evporq(xtmp2, dst, xtmp2, vec_enc);
5748     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5749 
  } else if (vec_enc == Assembler::AVX_512bit) {
5751     // Shift based bit reversal.
5752     assert(bt == T_LONG || bt == T_INT, "");
5753 
5754     // Swap lower and upper nibble of each byte.
5755     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5756 
5757     // Swap two least and most significant bits of each nibble.
5758     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5759 
5760     // Swap adjacent pair of bits.
5761     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5762     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5763 
5764     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5765     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5766   } else {
5767     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5768     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5769 
5770     // Get the reverse bit sequence of lower nibble of each byte.
5771     vpand(dst, xtmp2, src, vec_enc);
5772     vpshufb(dst, xtmp1, dst, vec_enc);
5773     vpsllq(dst, dst, 4, vec_enc);
5774 
5775     // Get the reverse bit sequence of upper nibble of each byte.
5776     vpandn(xtmp2, xtmp2, src, vec_enc);
5777     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5778     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5779 
    // OR the left shifted reversed bit sequence of the lower nibble with the right
    // shifted reversed bit sequence of the upper nibble to obtain the reversed bit
    // sequence of each byte.
5782     vpor(xtmp2, dst, xtmp2, vec_enc);
5783     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5784   }
5785 }
5786 
5787 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5788                                                 XMMRegister xtmp, Register rscratch) {
5789   assert(VM_Version::supports_gfni(), "");
5790   assert(rscratch != noreg || always_reachable(mask), "missing");
5791 
  // Galois field instruction based bit reversal, as per the following algorithm:
5793   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5794   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5795   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5796   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5797 }
5798 
5799 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5800                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
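  // Computes dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits) within
  // every 64 bit lane, i.e. it swaps adjacent nbits-wide bit fields selected by bitmask.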
5801   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5802   evpandq(dst, xtmp1, src, vec_enc);
5803   vpsllq(dst, dst, nbits, vec_enc);
5804   vpandn(xtmp1, xtmp1, src, vec_enc);
5805   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5806   evporq(dst, dst, xtmp1, vec_enc);
5807 }
5808 
5809 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5810                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5811   // Shift based bit reversal.
5812   assert(VM_Version::supports_evex(), "");
5813   switch(bt) {
5814     case T_LONG:
5815       // Swap upper and lower double word of each quad word.
5816       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5817       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5818       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5819       break;
5820     case T_INT:
5821       // Swap upper and lower word of each double word.
5822       evprord(xtmp1, k0, src, 16, true, vec_enc);
5823       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5824       break;
5825     case T_CHAR:
5826     case T_SHORT:
5827       // Swap upper and lower byte of each word.
5828       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5829       break;
5830     case T_BYTE:
5831       evmovdquq(dst, k0, src, true, vec_enc);
5832       break;
5833     default:
5834       fatal("Unsupported type %s", type2name(bt));
5835       break;
5836   }
5837 }
5838 
5839 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5840   if (bt == T_BYTE) {
5841     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5842       evmovdquq(dst, k0, src, true, vec_enc);
5843     } else {
5844       vmovdqu(dst, src);
5845     }
5846     return;
5847   }
5848   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5849   // pre-computed shuffle indices.
5850   switch(bt) {
5851     case T_LONG:
5852       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5853       break;
5854     case T_INT:
5855       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5856       break;
5857     case T_CHAR:
5858     case T_SHORT:
5859       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5860       break;
5861     default:
5862       fatal("Unsupported type %s", type2name(bt));
5863       break;
5864   }
5865   vpshufb(dst, src, dst, vec_enc);
5866 }
5867 
5868 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5869                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5870                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5871   assert(is_integral_type(bt), "");
5872   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5873   assert(VM_Version::supports_avx512cd(), "");
5874   switch(bt) {
5875     case T_LONG:
5876       evplzcntq(dst, ktmp, src, merge, vec_enc);
5877       break;
5878     case T_INT:
5879       evplzcntd(dst, ktmp, src, merge, vec_enc);
5880       break;
5881     case T_SHORT:
5882       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5883       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5884       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5885       vpunpckhwd(dst, xtmp1, src, vec_enc);
5886       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5887       vpackusdw(dst, xtmp2, dst, vec_enc);
5888       break;
5889     case T_BYTE:
5890       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5891       // accessing the lookup table.
5892       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5893       // accessing the lookup table.
5894       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5895       assert(VM_Version::supports_avx512bw(), "");
5896       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5897       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5898       vpand(xtmp2, dst, src, vec_enc);
5899       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5900       vpsrlw(xtmp3, src, 4, vec_enc);
5901       vpand(xtmp3, dst, xtmp3, vec_enc);
5902       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5903       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5904       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5905       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5906       break;
5907     default:
5908       fatal("Unsupported type %s", type2name(bt));
5909       break;
5910   }
5911 }
5912 
5913 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5914                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
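  // The lookup table maps a 4 bit value to its leading zero count (lut[0] == 4).
  // Worked example for the source byte 0x07 (0b00000111): the MSB nibble 0x0 looks up
  // 4 and, because it is zero, the LSB nibble's count lut[0x7] == 1 is added on top,
  // giving the 5 leading zeros of 0x07; for 0x1C the MSB nibble 0x1 alone gives 3.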
5915   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5916   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5917   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5918   // accessing the lookup table.
5919   vpand(dst, xtmp2, src, vec_enc);
5920   vpshufb(dst, xtmp1, dst, vec_enc);
5921   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5922   // accessing the lookup table.
5923   vpsrlw(xtmp3, src, 4, vec_enc);
5924   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5925   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5926   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5927   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5928   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5929   vpaddb(dst, dst, xtmp2, vec_enc);
5930   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5931 }
5932 
5933 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5934                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5935   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5936   // Add zero counts of lower byte and upper byte of a word if
5937   // upper byte holds a zero value.
5938   vpsrlw(xtmp3, src, 8, vec_enc);
5939   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5940   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5941   vpsllw(xtmp2, dst, 8, vec_enc);
5942   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5943   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5944   vpsrlw(dst, dst, 8, vec_enc);
5945 }
5946 
5947 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5948                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents its mantissa in normalized
  // 1.m form, the biased exponent of the converted value can be used to compute the
  // leading zero count as per the following formula:
  //   LZCNT = 32 - ((biased_exp - 127) + 1) = 31 - (biased_exp - 127)
  // Zero, max_int and negative source values need special handling, see below.
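  // Worked example: for src == 0x00001000 (bit 12 set) the conversion produces a
  // biased exponent of 139, so LZCNT = 31 - (139 - 127) = 19, matching the 19
  // leading zeros of 0x00001000.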
5954 
5955   // Broadcast 0xFF
5956   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5957   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5958 
5959   // Extract biased exponent.
5960   vcvtdq2ps(dst, src, vec_enc);
5961   vpsrld(dst, dst, 23, vec_enc);
5962   vpand(dst, dst, xtmp1, vec_enc);
5963 
5964   // Broadcast 127.
5965   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5966   // Exponent = biased_exp - 127
5967   vpsubd(dst, dst, xtmp1, vec_enc);
5968 
5969   // Exponent = Exponent  + 1
5970   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5971   vpaddd(dst, dst, xtmp3, vec_enc);
5972 
5973   // Replace -ve exponent with zero, exponent is -ve when src
5974   // lane contains a zero value.
5975   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5976   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5977 
5978   // Rematerialize broadcast 32.
5979   vpslld(xtmp1, xtmp3, 5, vec_enc);
5980   // Exponent is 32 if corresponding source lane contains max_int value.
5981   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5982   // LZCNT = 32 - exponent
5983   vpsubd(dst, xtmp1, dst, vec_enc);
5984 
5985   // Replace LZCNT with a value 1 if corresponding source lane
5986   // contains max_int value.
5987   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5988 
  // Replace LZCNT with 0 if the source lane value is negative, since its MSB is already set.
5990   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5991   vblendvps(dst, dst, xtmp2, src, vec_enc);
5992 }
5993 
5994 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5995                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5996   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5997   // Add zero counts of lower word and upper word of a double word if
5998   // upper word holds a zero value.
5999   vpsrld(xtmp3, src, 16, vec_enc);
6000   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6001   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6002   vpslld(xtmp2, dst, 16, vec_enc);
6003   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6004   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6005   vpsrld(dst, dst, 16, vec_enc);
6006   // Add zero counts of lower doubleword and upper doubleword of a
6007   // quadword if upper doubleword holds a zero value.
6008   vpsrlq(xtmp3, src, 32, vec_enc);
6009   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6010   vpsllq(xtmp2, dst, 32, vec_enc);
6011   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6012   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6013   vpsrlq(dst, dst, 32, vec_enc);
6014 }
6015 
6016 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6017                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6018                                                        Register rtmp, int vec_enc) {
6019   assert(is_integral_type(bt), "unexpected type");
6020   assert(vec_enc < Assembler::AVX_512bit, "");
6021   switch(bt) {
6022     case T_LONG:
6023       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6024       break;
6025     case T_INT:
6026       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6027       break;
6028     case T_SHORT:
6029       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6030       break;
6031     case T_BYTE:
6032       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6033       break;
6034     default:
6035       fatal("Unsupported type %s", type2name(bt));
6036       break;
6037   }
6038 }
6039 
6040 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6041   switch(bt) {
6042     case T_BYTE:
6043       vpsubb(dst, src1, src2, vec_enc);
6044       break;
6045     case T_SHORT:
6046       vpsubw(dst, src1, src2, vec_enc);
6047       break;
6048     case T_INT:
6049       vpsubd(dst, src1, src2, vec_enc);
6050       break;
6051     case T_LONG:
6052       vpsubq(dst, src1, src2, vec_enc);
6053       break;
6054     default:
6055       fatal("Unsupported type %s", type2name(bt));
6056       break;
6057   }
6058 }
6059 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
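// For example, with 32 bit lanes and x == 8 (0b1000): (x - 1) & ~x == 0b0111, whose
// CLZ is 29, so CTZ == 32 - 29 == 3; for x == 0 the expression is all ones, CLZ is 0
// and CTZ == 32.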
6064 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6065                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6066                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6067   assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src = (src - 1) & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6074   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6075   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6076   vpsub(bt, dst, xtmp4, dst, vec_enc);
6077 }
6078 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per
// the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
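// For example, with 32 bit lanes and x == 12 (0b1100): x | -x == 0xFFFFFFFC, whose
// popcount is 30, so CTZ == 32 - 30 == 2.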
6081 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6082                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6083   assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src = -src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src = src | -src
  vpor(xtmp3, xtmp3, src, vec_enc);
6090   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6091   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6092   vpsub(bt, dst, xtmp1, dst, vec_enc);
6093 }
6094 
6095 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6096   Label done;
6097   Label neg_divisor_fastpath;
6098   cmpl(divisor, 0);
6099   jccb(Assembler::less, neg_divisor_fastpath);
6100   xorl(rdx, rdx);
6101   divl(divisor);
6102   jmpb(done);
6103   bind(neg_divisor_fastpath);
6104   // Fastpath for divisor < 0:
6105   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6106   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
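  // A divisor with its sign bit set is >= 2^31 as an unsigned value, so the unsigned
  // quotient can only be 0 or 1, and it is 1 exactly when dividend >= divisor
  // (unsigned). The expression above computes this without a branch: the sign bit of
  // (dividend & ~(dividend - divisor)) is set precisely in that case. The 64 bit
  // variants below rely on the same reasoning with Long.SIZE.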
6107   movl(rdx, rax);
6108   subl(rdx, divisor);
6109   if (VM_Version::supports_bmi1()) {
6110     andnl(rax, rdx, rax);
6111   } else {
6112     notl(rdx);
6113     andl(rax, rdx);
6114   }
6115   shrl(rax, 31);
6116   bind(done);
6117 }
6118 
6119 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6120   Label done;
6121   Label neg_divisor_fastpath;
6122   cmpl(divisor, 0);
6123   jccb(Assembler::less, neg_divisor_fastpath);
6124   xorl(rdx, rdx);
6125   divl(divisor);
6126   jmpb(done);
6127   bind(neg_divisor_fastpath);
6128   // Fastpath when divisor < 0:
6129   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6130   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
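  // Since the unsigned quotient is 0 or 1 here, the remainder is the dividend minus
  // either 0 or the divisor; the arithmetic shift below turns the sign bit of the
  // masked expression into an all-zeros/all-ones mask that is ANDed with the divisor
  // before the final subtraction.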
6131   movl(rdx, rax);
6132   subl(rax, divisor);
6133   if (VM_Version::supports_bmi1()) {
6134     andnl(rax, rax, rdx);
6135   } else {
6136     notl(rax);
6137     andl(rax, rdx);
6138   }
6139   sarl(rax, 31);
6140   andl(rax, divisor);
6141   subl(rdx, rax);
6142   bind(done);
6143 }
6144 
6145 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6146   Label done;
6147   Label neg_divisor_fastpath;
6148 
6149   cmpl(divisor, 0);
6150   jccb(Assembler::less, neg_divisor_fastpath);
6151   xorl(rdx, rdx);
6152   divl(divisor);
6153   jmpb(done);
6154   bind(neg_divisor_fastpath);
6155   // Fastpath for divisor < 0:
6156   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6157   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6158   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6159   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6160   movl(rdx, rax);
6161   subl(rax, divisor);
6162   if (VM_Version::supports_bmi1()) {
6163     andnl(rax, rax, rdx);
6164   } else {
6165     notl(rax);
6166     andl(rax, rdx);
6167   }
6168   movl(tmp, rax);
6169   shrl(rax, 31); // quotient
6170   sarl(tmp, 31);
6171   andl(tmp, divisor);
6172   subl(rdx, tmp); // remainder
6173   bind(done);
6174 }
6175 
6176 #ifdef _LP64
6177 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6178                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
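    // Under gf2p8affineqb's bit-matrix semantics the constant 0x8040201008040201
    // reverses the bit order within each byte; the trailing bswapl below then
    // reverses the byte order to complete the 32 bit reversal.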
6182     mov64(rtmp, 0x8040201008040201L);
6183     movq(xtmp1, src);
6184     movq(xtmp2, rtmp);
6185     gf2p8affineqb(xtmp1, xtmp2, 0);
6186     movq(dst, xtmp1);
6187   } else {
6188     // Swap even and odd numbered bits.
6189     movl(rtmp, src);
6190     andl(rtmp, 0x55555555);
6191     shll(rtmp, 1);
6192     movl(dst, src);
6193     andl(dst, 0xAAAAAAAA);
6194     shrl(dst, 1);
6195     orl(dst, rtmp);
6196 
6197     // Swap LSB and MSB 2 bits of each nibble.
6198     movl(rtmp, dst);
6199     andl(rtmp, 0x33333333);
6200     shll(rtmp, 2);
6201     andl(dst, 0xCCCCCCCC);
6202     shrl(dst, 2);
6203     orl(dst, rtmp);
6204 
6205     // Swap LSB and MSB 4 bits of each byte.
6206     movl(rtmp, dst);
6207     andl(rtmp, 0x0F0F0F0F);
6208     shll(rtmp, 4);
6209     andl(dst, 0xF0F0F0F0);
6210     shrl(dst, 4);
6211     orl(dst, rtmp);
6212   }
6213   bswapl(dst);
6214 }
6215 
6216 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6217                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6221     mov64(rtmp1, 0x8040201008040201L);
6222     movq(xtmp1, src);
6223     movq(xtmp2, rtmp1);
6224     gf2p8affineqb(xtmp1, xtmp2, 0);
6225     movq(dst, xtmp1);
6226   } else {
6227     // Swap even and odd numbered bits.
6228     movq(rtmp1, src);
6229     mov64(rtmp2, 0x5555555555555555L);
6230     andq(rtmp1, rtmp2);
6231     shlq(rtmp1, 1);
6232     movq(dst, src);
6233     notq(rtmp2);
6234     andq(dst, rtmp2);
6235     shrq(dst, 1);
6236     orq(dst, rtmp1);
6237 
6238     // Swap LSB and MSB 2 bits of each nibble.
6239     movq(rtmp1, dst);
6240     mov64(rtmp2, 0x3333333333333333L);
6241     andq(rtmp1, rtmp2);
6242     shlq(rtmp1, 2);
6243     notq(rtmp2);
6244     andq(dst, rtmp2);
6245     shrq(dst, 2);
6246     orq(dst, rtmp1);
6247 
6248     // Swap LSB and MSB 4 bits of each byte.
6249     movq(rtmp1, dst);
6250     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6251     andq(rtmp1, rtmp2);
6252     shlq(rtmp1, 4);
6253     notq(rtmp2);
6254     andq(dst, rtmp2);
6255     shrq(dst, 4);
6256     orq(dst, rtmp1);
6257   }
6258   bswapq(dst);
6259 }
6260 
6261 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6262   Label done;
6263   Label neg_divisor_fastpath;
6264   cmpq(divisor, 0);
6265   jccb(Assembler::less, neg_divisor_fastpath);
6266   xorl(rdx, rdx);
6267   divq(divisor);
6268   jmpb(done);
6269   bind(neg_divisor_fastpath);
6270   // Fastpath for divisor < 0:
6271   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6272   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6273   movq(rdx, rax);
6274   subq(rdx, divisor);
6275   if (VM_Version::supports_bmi1()) {
6276     andnq(rax, rdx, rax);
6277   } else {
6278     notq(rdx);
6279     andq(rax, rdx);
6280   }
6281   shrq(rax, 63);
6282   bind(done);
6283 }
6284 
6285 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6286   Label done;
6287   Label neg_divisor_fastpath;
6288   cmpq(divisor, 0);
6289   jccb(Assembler::less, neg_divisor_fastpath);
6290   xorq(rdx, rdx);
6291   divq(divisor);
6292   jmp(done);
6293   bind(neg_divisor_fastpath);
6294   // Fastpath when divisor < 0:
6295   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6296   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6297   movq(rdx, rax);
6298   subq(rax, divisor);
6299   if (VM_Version::supports_bmi1()) {
6300     andnq(rax, rax, rdx);
6301   } else {
6302     notq(rax);
6303     andq(rax, rdx);
6304   }
6305   sarq(rax, 63);
6306   andq(rax, divisor);
6307   subq(rdx, rax);
6308   bind(done);
6309 }
6310 
6311 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6312   Label done;
6313   Label neg_divisor_fastpath;
6314   cmpq(divisor, 0);
6315   jccb(Assembler::less, neg_divisor_fastpath);
6316   xorq(rdx, rdx);
6317   divq(divisor);
6318   jmp(done);
6319   bind(neg_divisor_fastpath);
6320   // Fastpath for divisor < 0:
6321   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6322   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6323   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6324   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6325   movq(rdx, rax);
6326   subq(rax, divisor);
6327   if (VM_Version::supports_bmi1()) {
6328     andnq(rax, rax, rdx);
6329   } else {
6330     notq(rax);
6331     andq(rax, rdx);
6332   }
6333   movq(tmp, rax);
6334   shrq(rax, 63); // quotient
6335   sarq(tmp, 63);
6336   andq(tmp, divisor);
6337   subq(rdx, tmp); // remainder
6338   bind(done);
6339 }
6340 #endif
6341 
6342 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6343                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6344                                         int vlen_enc) {
6345   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which
  // differ only by a multiple of 16 select the same relative position within
  // a 128 bit lane, e.g. the shuffle indices 16, 32 and 48 all select
  // element 0 of their respective 128 bit lanes.
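  // For example, shuffle index 37 (== 32 + 5) falls in the [32, 48) range: the pass
  // that broadcasts the third 128 bit lane selects it in ktmp, and the inlane
  // vpshufb uses 37 & 0x0F == 5 to pick byte 5 of that broadcast lane.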
6352   movl(rtmp, 16);
6353   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6354 
6355   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6356   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6357   // original shuffle indices and move the shuffled lanes corresponding to true
6358   // mask to destination vector.
6359   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6360   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6361   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6362 
6363   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6364   // and broadcasting second 128 bit lane.
6365   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6366   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6367   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6368   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6369   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6370 
6371   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6372   // and broadcasting third 128 bit lane.
6373   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6374   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6375   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6376   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6377   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6378 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6381   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6382   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6383   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6384   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6385   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6386 }
6387 
6388 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6389                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6390   if (vlen_enc == AVX_128bit) {
6391     vpermilps(dst, src, shuffle, vlen_enc);
6392   } else if (bt == T_INT) {
6393     vpermd(dst, shuffle, src, vlen_enc);
6394   } else {
6395     assert(bt == T_FLOAT, "");
6396     vpermps(dst, shuffle, src, vlen_enc);
6397   }
6398 }
6399 
6400 #ifdef _LP64
6401 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
  // Note: Don't clobber obj anywhere in this method!
6403 
6404   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
6405   // obj-start, so that we can load from the object's mark-word instead. Usually the address
6406   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
6407   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
6408   // then passes that register as obj and 0 in disp. The following code extracts the base
6409   // and offset to load the mark-word.
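  // For example, with obj == obj-start and disp == klass_offset_in_bytes the computed
  // offset is exactly mark_offset_in_bytes; with a pre-computed address and disp == 0
  // it becomes mark_offset_in_bytes - klass_offset_in_bytes, which, relative to the
  // passed-in address, again points at the mark word.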
6410   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
6411   movq(dst, Address(obj, index, scale, offset));
6412   shrq(dst, markWord::klass_shift);
6413 }
6414 #endif