1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  Be careful, though: some
  // VM calls (such as call site linkage) can use several kilobytes of
  // stack, but the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
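  // Rough sketch of the frame laid out above (a sketch only; exact contents
  // vary with PreserveFramePointer, VerifyStackAtCalls, etc.):
  //   [ return address ]   <- caller's rsp before the call
  //   [ saved rbp      ]
  //   [ rest of frame  ]      spills, locals, outgoing args
  //   [ ...            ]   <- rsp after the prolog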
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
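  // Rough pseudocode for the LM_LEGACY fast path below (a sketch, not authoritative):
  //   mark = obj->mark();
  //   if (mark is a monitor)                          goto IsInflated;
  //   box->displaced_header = mark | unlocked_value;  // anticipate CAS success
  //   if (CAS(&obj->mark, mark | unlocked_value, box) succeeds)  goto COUNT;   // stack-lock
  //   // CAS failed: check for a recursive stack-lock by this thread
  //   box->displaced_header = (mark - rsp) masked to the page-proximity bits;  // 0 (ZF == 1) iff recursive
  //   goto DONE;   // ZF == 1 -> success, ZF == 0 -> slow path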
 276   Label IsInflated, DONE_LABEL, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 281     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 282     jcc(Assembler::notZero, DONE_LABEL);
 283   }
 284 
 285   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 286   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 287   jcc(Assembler::notZero, IsInflated);
 288 
 289   if (LockingMode == LM_MONITOR) {
 290     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 291     testptr(objReg, objReg);
 292   } else {
 293     assert(LockingMode == LM_LEGACY, "must be");
 294     // Attempt stack-locking ...
 295     orptr (tmpReg, markWord::unlocked_value);
 296     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 297     lock();
 298     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 299     jcc(Assembler::equal, COUNT);           // Success
 300 
 301     // Recursive locking.
 302     // The object is stack-locked: markword contains stack pointer to BasicLock.
 303     // Locked by current thread if difference with current SP is less than one page.
 304     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 306     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 307     movptr(Address(boxReg, 0), tmpReg);
 308   }
 309   // After recursive stack locking attempt case
 310   jmp(DONE_LABEL);
 311 
 312   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value
 314 
 315 #ifndef _LP64
 316   // The object is inflated.
 317 
 318   // boxReg refers to the on-stack BasicLock in the current frame.
 319   // We'd like to write:
 320   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 322   // additional latency as we have another ST in the store buffer that must drain.
 323 
 324   // avoid ST-before-CAS
 325   // register juggle because we need tmpReg for cmpxchgptr below
 326   movptr(scrReg, boxReg);
 327   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 328 
 329   // Optimistic form: consider XORL tmpReg,tmpReg
 330   movptr(tmpReg, NULL_WORD);
 331 
 332   // Appears unlocked - try to swing _owner from null to non-null.
 333   // Ideally, I'd manifest "Self" with get_thread and then attempt
 334   // to CAS the register containing thread id into m->Owner.
 335   // But we don't have enough registers, so instead we can either try to CAS
 336   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 337   // we later store thread id into m->Owner.  Transiently storing a stack address
 338   // (rsp or the address of the box) into  m->owner is harmless.
 339   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 340   lock();
 341   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 342   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 343   // If we weren't able to swing _owner from null to the BasicLock
 344   // then take the slow path.
 345   jccb  (Assembler::notZero, DONE_LABEL);
 346   // update _owner from BasicLock to thread
 347   get_thread (scrReg);                    // beware: clobbers ICCs
 348   movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset()));
 349   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 350   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 351   jmp(DONE_LABEL);
 352 
 353   // If the CAS fails we can either retry or pass control to the slow path.
 354   // We use the latter tactic.
 355   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 356   // If the CAS was successful ...
 357   //   Self has acquired the lock
 358   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 359   // Intentional fall-through into DONE_LABEL ...
 360 #else // _LP64
 361   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 362   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 363   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 364 
 365   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 366   movq(scrReg, tmpReg);
 367   xorq(tmpReg, tmpReg);
 368   movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
 369   lock();
 370   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 371 
 372   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 373   jccb(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)
 374 
 375   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 376   jccb(Assembler::notEqual, DONE_LABEL);    // If not recursive, ZF = 0 at this point (fail)
 377   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 378   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 379   jmp(DONE_LABEL);
 380 #endif // _LP64
 381 
 382   bind(COUNT);
 383   // Count monitors in fast path
 384   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 385   xorl(tmpReg, tmpReg); // Set ZF == 1
 386 
 387   bind(DONE_LABEL);
 388 
 389   // At DONE_LABEL the icc ZFlag is set as follows ...
 390   // fast_unlock uses the same protocol.
 391   // ZFlag == 1 -> Success
 392   // ZFlag == 0 -> Failure - force control through the slow path
 393 }
 394 
 395 // obj: object to unlock
 396 // box: box address (displaced header location), killed.  Must be EAX.
 397 // tmp: killed, cannot be obj nor box.
 398 //
 399 // Some commentary on balanced locking:
 400 //
 401 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 402 // Methods that don't have provably balanced locking are forced to run in the
 403 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 404 // The interpreter provides two properties:
 405 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 407 //      interpreter maintains an on-stack list of locks currently held by
 408 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 411 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 413 // B() doesn't have provably balanced locking so it runs in the interpreter.
 414 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 415 // is still locked by A().
 416 //
 417 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 418 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 419 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 420 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 424 // A perfectly viable alternative is to elide the owner check except when
 425 // Xcheck:jni is enabled.
 426 
 427 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) {
 428   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 429   assert(boxReg == rax, "");
 430   assert_different_registers(objReg, boxReg, tmpReg);
 431 
 432   Label DONE_LABEL, Stacked, COUNT;
 433 
 434   if (LockingMode == LM_LEGACY) {
 435     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 436     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 437   }
 438   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 439   if (LockingMode != LM_MONITOR) {
 440     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 441     jcc(Assembler::zero, Stacked);
 442   }
 443 
 444   // It's inflated.
 445   // If the owner is ANONYMOUS, we need to fix it -  in an outline stub.
 446   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 447 #ifdef _LP64
 448   if (!Compile::current()->output()->in_scratch_emit_size()) {
 449     C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 450     Compile::current()->output()->add_stub(stub);
 451     jcc(Assembler::equal, stub->entry());
 452     bind(stub->continuation());
 453   } else
 454 #endif
 455   {
    // We can't easily implement this optimization on 32-bit because we don't have a thread register.
    // Call the slow path instead.
 458     jcc(Assembler::notEqual, DONE_LABEL);
 459   }
 460 
 461   // Despite our balanced locking property we still check that m->_owner == Self
 462   // as java routines or native JNI code called by this thread might
 463   // have released the lock.
 464   // Refer to the comments in synchronizer.cpp for how we might encode extra
 465   // state in _succ so we can avoid fetching EntryList|cxq.
 466   //
 467   // If there's no contention try a 1-0 exit.  That is, exit without
 468   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 469   // we detect and recover from the race that the 1-0 exit admits.
 470   //
 471   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 472   // before it STs null into _owner, releasing the lock.  Updates
 473   // to data protected by the critical section must be visible before
 474   // we drop the lock (and thus before any other thread could acquire
 475   // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
 478   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 479 #ifndef _LP64
 480   // Note that we could employ various encoding schemes to reduce
 481   // the number of loads below (currently 4) to just 2 or 3.
 482   // Refer to the comments in synchronizer.cpp.
 483   // In practice the chain of fetches doesn't seem to impact performance, however.
 484   xorptr(boxReg, boxReg);
 485   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 486   jccb  (Assembler::notZero, DONE_LABEL);
 487   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 488   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 489   jccb  (Assembler::notZero, DONE_LABEL);
 490   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 491   jmpb  (DONE_LABEL);
 492 #else // _LP64
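  // Rough sketch of the 64-bit inflated-unlock path below (not authoritative):
  //   if (m->recursions != 0)            { m->recursions--; ZF = 1; goto DONE; }
  //   if ((m->cxq | m->EntryList) == 0)  { m->owner = null; ZF = 1; goto DONE; }   // uncontended 1-0 exit
  //   if (m->succ == null)               goto LGoSlowPath;       // must wake a successor
  //   m->owner = null; fence;                                    // ST owner; MEMBAR; LD succ
  //   if (m->succ != null)               { ZF = 1; goto DONE; }  // someone will take over
  //   if (CAS(&m->owner, null, lock_id) fails)  { ZF = 1; goto DONE; }  // another thread owns it now
  //   goto LGoSlowPath;                                          // we reacquired; exit via slow path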
 493   // It's inflated
 494   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 495 
 496   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 497   jccb(Assembler::equal, LNotRecursive);
 498 
 499   // Recursive inflated unlock
 500   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 501   xorl(tmpReg, tmpReg); // Set ZF == 1
 502   jmp(DONE_LABEL);
 503 
 504   bind(LNotRecursive);
 505 
 506   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 507   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 508   jccb  (Assembler::notZero, CheckSucc);
 509   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 510   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 511   jmpb  (DONE_LABEL);
 512 
 513   // Try to avoid passing control into the slow_path ...
 514   bind  (CheckSucc);
 515 
 516   // The following optional optimization can be elided if necessary
 517   // Effectively: if (succ == null) goto slow path
 518   // The code reduces the window for a race, however,
 519   // and thus benefits performance.
 520   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 521   jccb  (Assembler::zero, LGoSlowPath);
 522 
 523   xorptr(boxReg, boxReg);
 524   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 525   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 526 
 527   // Memory barrier/fence
 528   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 529   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 530   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 531   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 532   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 533   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 534   lock(); addl(Address(rsp, 0), 0);
 535 
 536   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 537   jccb  (Assembler::notZero, LSuccess);
 538 
 539   // Rare inopportune interleaving - race.
 540   // The successor vanished in the small window above.
 541   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 542   // We need to ensure progress and succession.
 543   // Try to reacquire the lock.
 544   // If that fails then the new owner is responsible for succession and this
 545   // thread needs to take no further action and can exit via the fast path (success).
 546   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 550 
 551   // box is really RAX -- the following CMPXCHG depends on that binding
 552   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 553   movptr(scrReg, Address(r15_thread, JavaThread::lock_id_offset()));
 554   lock();
 555   cmpxchgptr(scrReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There was no successor, so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock, so we're done (and the exit was a success).
 559   jccb  (Assembler::notEqual, LSuccess);
 560   // Intentional fall-through into slow path
 561 
 562   bind  (LGoSlowPath);
 563   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 564   jmpb  (DONE_LABEL);
 565 
 566   bind  (LSuccess);
 567   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 568   jmpb  (DONE_LABEL);
 569 
 570 #endif
 571   if (LockingMode == LM_LEGACY) {
 572     bind  (Stacked);
 573     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 574     lock();
 575     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 576     jccb(Assembler::notZero, DONE_LABEL);
    // Decrement the held monitor count in the fast path
 578 #ifndef _LP64
 579     get_thread(tmpReg);
 580     decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 581 #else // _LP64
 582     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 583 #endif
 584     xorl(tmpReg, tmpReg); // Set ZF == 1
 585   }
 586 
 587   // ZFlag == 1 -> Success
 588   // ZFlag == 0 -> Failure - force control through the slow path
 589   bind(DONE_LABEL);
 590 }
 591 
 592 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 593                                               Register t, Register thread) {
 594   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 595   assert(rax_reg == rax, "Used for CAS");
 596   assert_different_registers(obj, box, rax_reg, t, thread);
 597 
 598   // Handle inflated monitor.
 599   Label inflated;
 600   // Finish fast lock successfully.
 601   Label locked;
 602   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 603   Label slow_path;
 604 
 605   if (DiagnoseSyncOnValueBasedClasses != 0) {
 606     load_klass(rax_reg, obj, t);
 607     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 608     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 609     jcc(Assembler::notZero, slow_path);
 610   }
 611 
 612   const Register mark = t;
 613 
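  // Rough sketch of the lock-stack fast path below (not authoritative):
  //   if (obj->mark() is a monitor)              goto inflated;
  //   if (the lock-stack is full)                goto slow_path;
  //   if (top of lock-stack already == obj)      { push obj; ZF = 1; }    // recursive
  //   else if (CAS mark 0b01 -> 0b00 succeeds)   { push obj; ZF = 1; }
  //   else                                       goto slow_path;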
 614   { // Lightweight Lock
 615 
 616     Label push;
 617 
 618     const Register top = box;
 619 
 620     // Load the mark.
 621     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 622 
 623     // Prefetch top.
 624     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 625 
 626     // Check for monitor (0b10).
 627     testptr(mark, markWord::monitor_value);
 628     jcc(Assembler::notZero, inflated);
 629 
 630     // Check if lock-stack is full.
 631     cmpl(top, LockStack::end_offset() - 1);
 632     jcc(Assembler::greater, slow_path);
 633 
 634     // Check if recursive.
 635     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 636     jccb(Assembler::equal, push);
 637 
 638     // Try to lock. Transition lock bits 0b01 => 0b00
 639     movptr(rax_reg, mark);
 640     orptr(rax_reg, markWord::unlocked_value);
 641     andptr(mark, ~(int32_t)markWord::unlocked_value);
 642     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 643     jcc(Assembler::notEqual, slow_path);
 644 
 645     bind(push);
 646     // After successful lock, push object on lock-stack.
 647     movptr(Address(thread, top), obj);
 648     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 649     xorl(rax_reg, rax_reg);
 650     jmpb(locked);
 651   }
 652 
 653   { // Handle inflated monitor.
 654     bind(inflated);
 655 
 656     const Register tagged_monitor = mark;
 657 
 658     // CAS owner (null => current thread).
 659     xorptr(rax_reg, rax_reg);
 660     movptr(box, Address(thread, JavaThread::lock_id_offset()));
 661     lock(); cmpxchgptr(box, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 662     jccb(Assembler::equal, locked);
 663 
 664     // Check if recursive.
 665     cmpptr(box, rax_reg);
 666     jccb(Assembler::notEqual, slow_path);
 667 
 668     // Recursive.
 669     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 670     xorl(rax_reg, rax_reg);
 671   }
 672 
 673   bind(locked);
 674 #ifdef ASSERT
 675   // Check that locked label is reached with ZF set.
 676   Label zf_correct;
 677   Label zf_bad_zero;
 678   jcc(Assembler::zero, zf_correct);
 679   jmp(zf_bad_zero);
 680 #endif
 681 
 682   bind(slow_path);
 683 #ifdef ASSERT
 684   // Check that slow_path label is reached with ZF not set.
 685   jcc(Assembler::notZero, zf_correct);
 686   stop("Fast Lock ZF != 0");
 687   bind(zf_bad_zero);
 688   stop("Fast Lock ZF != 1");
 689   bind(zf_correct);
 690 #endif
 691   // C2 uses the value of ZF to determine the continuation.
 692 }
 693 
 694 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) {
 695   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 696   assert(reg_rax == rax, "Used for CAS");
 697   assert_different_registers(obj, reg_rax, t1, t2);
 698 
 699   // Handle inflated monitor.
 700   Label inflated, inflated_check_lock_stack;
 701   // Finish fast unlock successfully.  MUST jump with ZF == 1
 702   Label unlocked;
 703 
 704   const Register mark = t1;
 705   const Register top = reg_rax;
 706 
 707   Label dummy;
 708   C2FastUnlockLightweightStub* stub = nullptr;
 709 
 710   if (!Compile::current()->output()->in_scratch_emit_size()) {
 711     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread);
 712     Compile::current()->output()->add_stub(stub);
 713   }
 714 
 715   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 716   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
 717 
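  // Rough sketch of the lock-stack unlock fast path below (not authoritative):
  //   if (top of lock-stack != obj)              goto inflated path;   // must be a monitor
  //   pop the lock-stack;
  //   if (next entry down is also obj)           { ZF = 1; }           // recursive, done
  //   else if (CAS mark 0b00 -> 0b01 succeeds)   { ZF = 1; }
  //   else                                       goto push_and_slow_path;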
 718   { // Lightweight Unlock
 719 
 720     // Load top.
 721     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 722 
 723     // Prefetch mark.
 724     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 725 
 726     // Check if obj is top of lock-stack.
 727     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 728     // Top of lock stack was not obj. Must be monitor.
 729     jcc(Assembler::notEqual, inflated_check_lock_stack);
 730 
 731     // Pop lock-stack.
 732     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 733     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 734 
 735     // Check if recursive.
 736     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 737     jcc(Assembler::equal, unlocked);
 738 
    // We elide the monitor check and let the CAS fail instead.
 740 
 741     // Try to unlock. Transition lock bits 0b00 => 0b01
 742     movptr(reg_rax, mark);
 743     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 744     orptr(mark, markWord::unlocked_value);
 745     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 746     jcc(Assembler::notEqual, push_and_slow_path);
 747     jmp(unlocked);
 748   }
 749 
 750 
 751   { // Handle inflated monitor.
 752     bind(inflated_check_lock_stack);
 753 #ifdef ASSERT
 754     Label check_done;
 755     subl(top, oopSize);
 756     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 757     jcc(Assembler::below, check_done);
 758     cmpptr(obj, Address(thread, top));
 759     jccb(Assembler::notEqual, inflated_check_lock_stack);
 760     stop("Fast Unlock lock on stack");
 761     bind(check_done);
 762     testptr(mark, markWord::monitor_value);
 763     jccb(Assembler::notZero, inflated);
 764     stop("Fast Unlock not monitor");
 765 #endif
 766 
 767     bind(inflated);
 768 
 769     // mark contains the tagged ObjectMonitor*.
 770     const Register monitor = mark;
 771 
 772 #ifndef _LP64
 773     // Check if recursive.
 774     xorptr(reg_rax, reg_rax);
 775     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 776     jcc(Assembler::notZero, check_successor);
 777 
 778     // Check if the entry lists are empty.
 779     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 780     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 781     jcc(Assembler::notZero, check_successor);
 782 
 783     // Release lock.
 784     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 785 #else // _LP64
 786     Label recursive;
 787 
 788     // Check if recursive.
 789     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 790     jccb(Assembler::notEqual, recursive);
 791 
 792     // Check if the entry lists are empty.
 793     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 794     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 795     jcc(Assembler::notZero, check_successor);
 796 
 797     // Release lock.
 798     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 799     jmpb(unlocked);
 800 
 801     // Recursive unlock.
 802     bind(recursive);
 803     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 804     xorl(t1, t1);
 805 #endif
 806   }
 807 
 808   bind(unlocked);
 809   if (stub != nullptr) {
 810     bind(stub->unlocked_continuation());
 811   }
 812 
 813 #ifdef ASSERT
 814   // Check that unlocked label is reached with ZF set.
 815   Label zf_correct;
 816   jcc(Assembler::zero, zf_correct);
 817   stop("Fast Unlock ZF != 1");
 818 #endif
 819 
 820   if (stub != nullptr) {
 821     bind(stub->slow_path_continuation());
 822   }
 823 #ifdef ASSERT
 824   // Check that stub->continuation() label is reached with ZF not set.
 825   jccb(Assembler::notZero, zf_correct);
 826   stop("Fast Unlock ZF != 0");
 827   bind(zf_correct);
 828 #endif
 829   // C2 uses the value of ZF to determine the continuation.
 830 }
 831 
 832 //-------------------------------------------------------------------------------------------
 833 // Generic instructions support for use in .ad files C2 code generation
 834 
 835 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 836   if (dst != src) {
 837     movdqu(dst, src);
 838   }
 839   if (opcode == Op_AbsVD) {
 840     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 841   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 843     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 844   }
 845 }
 846 
 847 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 848   if (opcode == Op_AbsVD) {
 849     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 850   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 852     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 853   }
 854 }
 855 
 856 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 857   if (dst != src) {
 858     movdqu(dst, src);
 859   }
 860   if (opcode == Op_AbsVF) {
 861     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 862   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 864     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 865   }
 866 }
 867 
 868 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 869   if (opcode == Op_AbsVF) {
 870     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 871   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 873     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 874   }
 875 }
 876 
 877 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 878   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 879   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
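  // Note: SSE has no pminsq/pmaxsq, so the T_LONG cases below build min/max from
  // pcmpgtq + blendvpd.  The non-VEX blendvpd uses xmm0 as its implicit mask
  // register, which is why tmp is constrained to xmm0 for T_LONG.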
 880 
 881   if (opcode == Op_MinV) {
 882     if (elem_bt == T_BYTE) {
 883       pminsb(dst, src);
 884     } else if (elem_bt == T_SHORT) {
 885       pminsw(dst, src);
 886     } else if (elem_bt == T_INT) {
 887       pminsd(dst, src);
 888     } else {
 889       assert(elem_bt == T_LONG, "required");
 890       assert(tmp == xmm0, "required");
 891       assert_different_registers(dst, src, tmp);
 892       movdqu(xmm0, dst);
 893       pcmpgtq(xmm0, src);
 894       blendvpd(dst, src);  // xmm0 as mask
 895     }
 896   } else { // opcode == Op_MaxV
 897     if (elem_bt == T_BYTE) {
 898       pmaxsb(dst, src);
 899     } else if (elem_bt == T_SHORT) {
 900       pmaxsw(dst, src);
 901     } else if (elem_bt == T_INT) {
 902       pmaxsd(dst, src);
 903     } else {
 904       assert(elem_bt == T_LONG, "required");
 905       assert(tmp == xmm0, "required");
 906       assert_different_registers(dst, src, tmp);
 907       movdqu(xmm0, src);
 908       pcmpgtq(xmm0, dst);
 909       blendvpd(dst, src);  // xmm0 as mask
 910     }
 911   }
 912 }
 913 
 914 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 915                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 916                                  int vlen_enc) {
 917   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
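  // Note: for T_LONG, vpminsq/vpmaxsq require AVX-512VL (or a 512-bit vector); the
  // fallback below synthesizes min/max from vpcmpgtq + vblendvpd, using dst as the
  // blend mask, hence the assert_different_registers in that path.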
 918 
 919   if (opcode == Op_MinV) {
 920     if (elem_bt == T_BYTE) {
 921       vpminsb(dst, src1, src2, vlen_enc);
 922     } else if (elem_bt == T_SHORT) {
 923       vpminsw(dst, src1, src2, vlen_enc);
 924     } else if (elem_bt == T_INT) {
 925       vpminsd(dst, src1, src2, vlen_enc);
 926     } else {
 927       assert(elem_bt == T_LONG, "required");
 928       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 929         vpminsq(dst, src1, src2, vlen_enc);
 930       } else {
 931         assert_different_registers(dst, src1, src2);
 932         vpcmpgtq(dst, src1, src2, vlen_enc);
 933         vblendvpd(dst, src1, src2, dst, vlen_enc);
 934       }
 935     }
 936   } else { // opcode == Op_MaxV
 937     if (elem_bt == T_BYTE) {
 938       vpmaxsb(dst, src1, src2, vlen_enc);
 939     } else if (elem_bt == T_SHORT) {
 940       vpmaxsw(dst, src1, src2, vlen_enc);
 941     } else if (elem_bt == T_INT) {
 942       vpmaxsd(dst, src1, src2, vlen_enc);
 943     } else {
 944       assert(elem_bt == T_LONG, "required");
 945       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 946         vpmaxsq(dst, src1, src2, vlen_enc);
 947       } else {
 948         assert_different_registers(dst, src1, src2);
 949         vpcmpgtq(dst, src1, src2, vlen_enc);
 950         vblendvpd(dst, src2, src1, dst, vlen_enc);
 951       }
 952     }
 953   }
 954 }
 955 
 956 // Float/Double min max
 957 
 958 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 959                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 960                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 961                                    int vlen_enc) {
 962   assert(UseAVX > 0, "required");
 963   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 964          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 965   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 966   assert_different_registers(a, tmp, atmp, btmp);
 967   assert_different_registers(b, tmp, atmp, btmp);
 968 
 969   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 970   bool is_double_word = is_double_word_type(elem_bt);
 971 
 972   /* Note on 'non-obvious' assembly sequence:
 973    *
 974    * While there are vminps/vmaxps instructions, there are two important differences between hardware
 975    * and Java on how they handle floats:
 976    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
 978    *
 979    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
 980    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
 981    *                (only useful when signs differ, noop otherwise)
 982    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudocode describes the algorithm for max[FD] (the Min algorithm follows similar lines):
 985    *   btmp = (b < +0.0) ? a : b
 986    *   atmp = (b < +0.0) ? b : a
 987    *   Tmp  = Max_Float(atmp , btmp)
 988    *   Res  = (atmp == NaN) ? atmp : Tmp
 989    */
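  /* For reference, the corresponding min[FD] sketch (assumed symmetric to the
   * max pseudocode above):
   *   btmp = (a < +0.0) ? a : b
   *   atmp = (a < +0.0) ? b : a
   *   Tmp  = Min_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */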
 990 
 991   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
 992   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
 993   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
 994   XMMRegister mask;
 995 
 996   if (!is_double_word && is_min) {
 997     mask = a;
 998     vblend = &MacroAssembler::vblendvps;
 999     vmaxmin = &MacroAssembler::vminps;
1000     vcmp = &MacroAssembler::vcmpps;
1001   } else if (!is_double_word && !is_min) {
1002     mask = b;
1003     vblend = &MacroAssembler::vblendvps;
1004     vmaxmin = &MacroAssembler::vmaxps;
1005     vcmp = &MacroAssembler::vcmpps;
1006   } else if (is_double_word && is_min) {
1007     mask = a;
1008     vblend = &MacroAssembler::vblendvpd;
1009     vmaxmin = &MacroAssembler::vminpd;
1010     vcmp = &MacroAssembler::vcmppd;
1011   } else {
1012     assert(is_double_word && !is_min, "sanity");
1013     mask = b;
1014     vblend = &MacroAssembler::vblendvpd;
1015     vmaxmin = &MacroAssembler::vmaxpd;
1016     vcmp = &MacroAssembler::vcmppd;
1017   }
1018 
1019   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1020   XMMRegister maxmin, scratch;
1021   if (dst == btmp) {
1022     maxmin = btmp;
1023     scratch = tmp;
1024   } else {
1025     maxmin = tmp;
1026     scratch = btmp;
1027   }
1028 
1029   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1030   if (precompute_mask && !is_double_word) {
1031     vpsrad(tmp, mask, 32, vlen_enc);
1032     mask = tmp;
1033   } else if (precompute_mask && is_double_word) {
1034     vpxor(tmp, tmp, tmp, vlen_enc);
1035     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1036     mask = tmp;
1037   }
1038 
1039   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1040   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1041   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1042   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1044 }
1045 
1046 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1047                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1048                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1049                                     int vlen_enc) {
1050   assert(UseAVX > 2, "required");
1051   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1052          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1053   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1054   assert_different_registers(dst, a, atmp, btmp);
1055   assert_different_registers(dst, b, atmp, btmp);
1056 
1057   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1058   bool is_double_word = is_double_word_type(elem_bt);
1059   bool merge = true;
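  // This mirrors the vminmax_fp algorithm above, but with AVX-512 opmask registers:
  // evpmovd2m/evpmovq2m extract the element sign bits into ktmp, the masked blends
  // order the operands, and the final masked move substitutes atmp into dst
  // wherever the compare was unordered (NaN).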
1060 
1061   if (!is_double_word && is_min) {
1062     evpmovd2m(ktmp, a, vlen_enc);
1063     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1064     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1065     vminps(dst, atmp, btmp, vlen_enc);
1066     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1067     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1068   } else if (!is_double_word && !is_min) {
1069     evpmovd2m(ktmp, b, vlen_enc);
1070     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1071     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1072     vmaxps(dst, atmp, btmp, vlen_enc);
1073     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1074     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1075   } else if (is_double_word && is_min) {
1076     evpmovq2m(ktmp, a, vlen_enc);
1077     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1078     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1079     vminpd(dst, atmp, btmp, vlen_enc);
1080     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1081     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1082   } else {
1083     assert(is_double_word && !is_min, "sanity");
1084     evpmovq2m(ktmp, b, vlen_enc);
1085     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1086     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1087     vmaxpd(dst, atmp, btmp, vlen_enc);
1088     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1089     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1090   }
1091 }
1092 
1093 // Float/Double signum
1094 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1095   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1096 
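  // Conceptually (sketch): signum(x) returns x unchanged for +/-0.0 and NaN,
  // +1.0 for x > 0.0, and -1.0 for x < 0.0.  dst holds both the input and the result.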
1097   Label DONE_LABEL;
1098 
1099   if (opcode == Op_SignumF) {
1100     assert(UseSSE > 0, "required");
1101     ucomiss(dst, zero);
1102     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1103     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1104     movflt(dst, one);
1105     jcc(Assembler::above, DONE_LABEL);
1106     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1107   } else if (opcode == Op_SignumD) {
1108     assert(UseSSE > 1, "required");
1109     ucomisd(dst, zero);
1110     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1111     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1112     movdbl(dst, one);
1113     jcc(Assembler::above, DONE_LABEL);
1114     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1115   }
1116 
1117   bind(DONE_LABEL);
1118 }
1119 
1120 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1121   if (sign) {
1122     pmovsxbw(dst, src);
1123   } else {
1124     pmovzxbw(dst, src);
1125   }
1126 }
1127 
1128 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1129   if (sign) {
1130     vpmovsxbw(dst, src, vector_len);
1131   } else {
1132     vpmovzxbw(dst, src, vector_len);
1133   }
1134 }
1135 
1136 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1137   if (sign) {
1138     vpmovsxbd(dst, src, vector_len);
1139   } else {
1140     vpmovzxbd(dst, src, vector_len);
1141   }
1142 }
1143 
1144 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1145   if (sign) {
1146     vpmovsxwd(dst, src, vector_len);
1147   } else {
1148     vpmovzxwd(dst, src, vector_len);
1149   }
1150 }
1151 
1152 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1153                                      int shift, int vector_len) {
1154   if (opcode == Op_RotateLeftV) {
1155     if (etype == T_INT) {
1156       evprold(dst, src, shift, vector_len);
1157     } else {
1158       assert(etype == T_LONG, "expected type T_LONG");
1159       evprolq(dst, src, shift, vector_len);
1160     }
1161   } else {
1162     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1163     if (etype == T_INT) {
1164       evprord(dst, src, shift, vector_len);
1165     } else {
1166       assert(etype == T_LONG, "expected type T_LONG");
1167       evprorq(dst, src, shift, vector_len);
1168     }
1169   }
1170 }
1171 
1172 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1173                                      XMMRegister shift, int vector_len) {
1174   if (opcode == Op_RotateLeftV) {
1175     if (etype == T_INT) {
1176       evprolvd(dst, src, shift, vector_len);
1177     } else {
1178       assert(etype == T_LONG, "expected type T_LONG");
1179       evprolvq(dst, src, shift, vector_len);
1180     }
1181   } else {
1182     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1183     if (etype == T_INT) {
1184       evprorvd(dst, src, shift, vector_len);
1185     } else {
1186       assert(etype == T_LONG, "expected type T_LONG");
1187       evprorvq(dst, src, shift, vector_len);
1188     }
1189   }
1190 }
1191 
1192 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1193   if (opcode == Op_RShiftVI) {
1194     psrad(dst, shift);
1195   } else if (opcode == Op_LShiftVI) {
1196     pslld(dst, shift);
1197   } else {
1198     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1199     psrld(dst, shift);
1200   }
1201 }
1202 
1203 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1204   switch (opcode) {
1205     case Op_RShiftVI:  psrad(dst, shift); break;
1206     case Op_LShiftVI:  pslld(dst, shift); break;
1207     case Op_URShiftVI: psrld(dst, shift); break;
1208 
1209     default: assert(false, "%s", NodeClassNames[opcode]);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1214   if (opcode == Op_RShiftVI) {
1215     vpsrad(dst, nds, shift, vector_len);
1216   } else if (opcode == Op_LShiftVI) {
1217     vpslld(dst, nds, shift, vector_len);
1218   } else {
1219     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1220     vpsrld(dst, nds, shift, vector_len);
1221   }
1222 }
1223 
1224 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1225   switch (opcode) {
1226     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1227     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1228     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1229 
1230     default: assert(false, "%s", NodeClassNames[opcode]);
1231   }
1232 }
1233 
1234 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1235   switch (opcode) {
1236     case Op_RShiftVB:  // fall-through
1237     case Op_RShiftVS:  psraw(dst, shift); break;
1238 
1239     case Op_LShiftVB:  // fall-through
1240     case Op_LShiftVS:  psllw(dst, shift);   break;
1241 
1242     case Op_URShiftVS: // fall-through
1243     case Op_URShiftVB: psrlw(dst, shift);  break;
1244 
1245     default: assert(false, "%s", NodeClassNames[opcode]);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250   switch (opcode) {
1251     case Op_RShiftVB:  // fall-through
1252     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1253 
1254     case Op_LShiftVB:  // fall-through
1255     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1256 
1257     case Op_URShiftVS: // fall-through
1258     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1259 
1260     default: assert(false, "%s", NodeClassNames[opcode]);
1261   }
1262 }
1263 
1264 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1265   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1267     case Op_LShiftVL:  psllq(dst, shift); break;
1268     case Op_URShiftVL: psrlq(dst, shift); break;
1269 
1270     default: assert(false, "%s", NodeClassNames[opcode]);
1271   }
1272 }
1273 
1274 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1275   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1277   } else if (opcode == Op_LShiftVL) {
1278     psllq(dst, shift);
1279   } else {
1280     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1281     psrlq(dst, shift);
1282   }
1283 }
1284 
1285 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1286   switch (opcode) {
1287     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1288     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1289     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1290 
1291     default: assert(false, "%s", NodeClassNames[opcode]);
1292   }
1293 }
1294 
1295 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1296   if (opcode == Op_RShiftVL) {
1297     evpsraq(dst, nds, shift, vector_len);
1298   } else if (opcode == Op_LShiftVL) {
1299     vpsllq(dst, nds, shift, vector_len);
1300   } else {
1301     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1302     vpsrlq(dst, nds, shift, vector_len);
1303   }
1304 }
1305 
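     // Per-lane variable shifts: each 32-bit lane of src is shifted by the count in
     // the corresponding lane of shift (AVX2 vpsravd/vpsllvd/vpsrlvd). Byte and short
     // opcodes fall through here once their lanes have been widened to ints, as in
     // varshiftbw() below.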
1306 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1307   switch (opcode) {
1308     case Op_RShiftVB:  // fall-through
1309     case Op_RShiftVS:  // fall-through
1310     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1311 
1312     case Op_LShiftVB:  // fall-through
1313     case Op_LShiftVS:  // fall-through
1314     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1315 
1316     case Op_URShiftVB: // fall-through
1317     case Op_URShiftVS: // fall-through
1318     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1319 
1320     default: assert(false, "%s", NodeClassNames[opcode]);
1321   }
1322 }
1323 
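     // Per-lane variable shifts on 16-bit lanes; these use the AVX-512BW
     // evpsravw/evpsllvw/evpsrlvw forms.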
1324 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1325   switch (opcode) {
1326     case Op_RShiftVB:  // fall-through
1327     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1328 
1329     case Op_LShiftVB:  // fall-through
1330     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1331 
1332     case Op_URShiftVB: // fall-through
1333     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1334 
1335     default: assert(false, "%s", NodeClassNames[opcode]);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1340   assert(UseAVX >= 2, "required");
1341   switch (opcode) {
1342     case Op_RShiftVL: {
1343       if (UseAVX > 2) {
1344         assert(tmp == xnoreg, "not used");
1345         if (!VM_Version::supports_avx512vl()) {
1346           vlen_enc = Assembler::AVX_512bit;
1347         }
1348         evpsravq(dst, src, shift, vlen_enc);
1349       } else {
1350         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1351         vpsrlvq(dst, src, shift, vlen_enc);
1352         vpsrlvq(tmp, tmp, shift, vlen_enc);
1353         vpxor(dst, dst, tmp, vlen_enc);
1354         vpsubq(dst, dst, tmp, vlen_enc);
1355       }
1356       break;
1357     }
1358     case Op_LShiftVL: {
1359       assert(tmp == xnoreg, "not used");
1360       vpsllvq(dst, src, shift, vlen_enc);
1361       break;
1362     }
1363     case Op_URShiftVL: {
1364       assert(tmp == xnoreg, "not used");
1365       vpsrlvq(dst, src, shift, vlen_enc);
1366       break;
1367     }
1368     default: assert(false, "%s", NodeClassNames[opcode]);
1369   }
1370 }
1371 
1372 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1373 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1374   assert(opcode == Op_LShiftVB ||
1375          opcode == Op_RShiftVB ||
1376          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1377   bool sign = (opcode != Op_URShiftVB);
1378   assert(vector_len == 0, "required");
1379   vextendbd(sign, dst, src, 1);
1380   vpmovzxbd(vtmp, shift, 1);
1381   varshiftd(opcode, dst, dst, vtmp, 1);
1382   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1383   vextracti128_high(vtmp, dst);
1384   vpackusdw(dst, dst, vtmp, 0);
1385 }
1386 
1387 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1388 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1389   assert(opcode == Op_LShiftVB ||
1390          opcode == Op_RShiftVB ||
1391          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1392   bool sign = (opcode != Op_URShiftVB);
1393   int ext_vector_len = vector_len + 1;
1394   vextendbw(sign, dst, src, ext_vector_len);
1395   vpmovzxbw(vtmp, shift, ext_vector_len);
1396   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1397   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1398   if (vector_len == 0) {
1399     vextracti128_high(vtmp, dst);
1400     vpackuswb(dst, dst, vtmp, vector_len);
1401   } else {
1402     vextracti64x4_high(vtmp, dst);
1403     vpackuswb(dst, dst, vtmp, vector_len);
1404     vpermq(dst, dst, 0xD8, vector_len);
1405   }
1406 }
1407 
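     // Insert a GPR value into lane idx of dst using pinsrb/w/d/q.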
1408 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1409   switch(typ) {
1410     case T_BYTE:
1411       pinsrb(dst, val, idx);
1412       break;
1413     case T_SHORT:
1414       pinsrw(dst, val, idx);
1415       break;
1416     case T_INT:
1417       pinsrd(dst, val, idx);
1418       break;
1419     case T_LONG:
1420       pinsrq(dst, val, idx);
1421       break;
1422     default:
1423       assert(false,"Should not reach here.");
1424       break;
1425   }
1426 }
1427 
1428 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1429   switch(typ) {
1430     case T_BYTE:
1431       vpinsrb(dst, src, val, idx);
1432       break;
1433     case T_SHORT:
1434       vpinsrw(dst, src, val, idx);
1435       break;
1436     case T_INT:
1437       vpinsrd(dst, src, val, idx);
1438       break;
1439     case T_LONG:
1440       vpinsrq(dst, src, val, idx);
1441       break;
1442     default:
1443       assert(false,"Should not reach here.");
1444       break;
1445   }
1446 }
1447 
1448 #ifdef _LP64
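     // Scalar-emulated masked gather: load the byte (or short) elements whose mask bit
     // is set from base[offset + idx_base[i]] into the low quadword of dst, leaving
     // masked-off lanes zero. A quadword holds 8 bytes or 4 shorts per call.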
1449 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1450                                                 XMMRegister dst, Register base,
1451                                                 Register idx_base,
1452                                                 Register offset, Register mask,
1453                                                 Register mask_idx, Register rtmp,
1454                                                 int vlen_enc) {
1455   vpxor(dst, dst, dst, vlen_enc);
1456   if (elem_bt == T_SHORT) {
1457     for (int i = 0; i < 4; i++) {
1458       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1459       Label skip_load;
1460       btq(mask, mask_idx);
1461       jccb(Assembler::carryClear, skip_load);
1462       movl(rtmp, Address(idx_base, i * 4));
1463       if (offset != noreg) {
1464         addl(rtmp, offset);
1465       }
1466       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1467       bind(skip_load);
1468       incq(mask_idx);
1469     }
1470   } else {
1471     assert(elem_bt == T_BYTE, "");
1472     for (int i = 0; i < 8; i++) {
1473       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1474       Label skip_load;
1475       btq(mask, mask_idx);
1476       jccb(Assembler::carryClear, skip_load);
1477       movl(rtmp, Address(idx_base, i * 4));
1478       if (offset != noreg) {
1479         addl(rtmp, offset);
1480       }
1481       pinsrb(dst, Address(base, rtmp), i);
1482       bind(skip_load);
1483       incq(mask_idx);
1484     }
1485   }
1486 }
1487 #endif // _LP64
1488 
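     // Unmasked variant: gather 8 byte (or 4 short) elements from
     // base[offset + idx_base[i]] into the low quadword of dst.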
1489 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1490                                          Register base, Register idx_base,
1491                                          Register offset, Register rtmp,
1492                                          int vlen_enc) {
1493   vpxor(dst, dst, dst, vlen_enc);
1494   if (elem_bt == T_SHORT) {
1495     for (int i = 0; i < 4; i++) {
1496       // dst[i] = src[offset + idx_base[i]]
1497       movl(rtmp, Address(idx_base, i * 4));
1498       if (offset != noreg) {
1499         addl(rtmp, offset);
1500       }
1501       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1502     }
1503   } else {
1504     assert(elem_bt == T_BYTE, "");
1505     for (int i = 0; i < 8; i++) {
1506       // dst[i] = src[offset + idx_base[i]]
1507       movl(rtmp, Address(idx_base, i * 4));
1508       if (offset != noreg) {
1509         addl(rtmp, offset);
1510       }
1511       pinsrb(dst, Address(base, rtmp), i);
1512     }
1513   }
1514 }
1515 
1516 /*
1517  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1518  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1519  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1520  * permutation to place the slice into the appropriate vector lanes of the
1521  * destination vector. The following pseudo code describes the
1522  * algorithm in detail:
1523  *
1524  * DST_VEC = ZERO_VEC
1525  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1526  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1527  * FOREACH_ITER:
1528  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1529  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1530  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1531  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1532  *
1533  * With each iteration, the doubleword permute indices (0, 1) corresponding
1534  * to the gathered quadword are shifted right by two lane positions.
1535  *
1536  */
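     // For example, the permute indices start out as {0, 1, 2, ...}; after the first
     // iteration they become {-2, -1, 0, 1, ...}, so the second gathered quadword is
     // routed into doubleword lanes 2 and 3 (vpermd only uses the low bits of each
     // index, so the negative entries select the zeroed upper lanes of the slice),
     // and so on for each subsequent slice.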
1537 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1538                                         Register base, Register idx_base,
1539                                         Register offset, Register mask,
1540                                         XMMRegister xtmp1, XMMRegister xtmp2,
1541                                         XMMRegister temp_dst, Register rtmp,
1542                                         Register mask_idx, Register length,
1543                                         int vector_len, int vlen_enc) {
1544   Label GATHER8_LOOP;
1545   assert(is_subword_type(elem_ty), "");
1546   movl(length, vector_len);
1547   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1548   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1549   vallones(xtmp2, vlen_enc);
1550   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1551   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1552   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1553 
1554   bind(GATHER8_LOOP);
1555     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1556     if (mask == noreg) {
1557       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1558     } else {
1559       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1560     }
1561     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1562     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1563     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1564     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1565     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1566     vpor(dst, dst, temp_dst, vlen_enc);
1567     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1568     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1569     jcc(Assembler::notEqual, GATHER8_LOOP);
1570 }
1571 
1572 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1573   switch(typ) {
1574     case T_INT:
1575       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1576       break;
1577     case T_FLOAT:
1578       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1579       break;
1580     case T_LONG:
1581       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1582       break;
1583     case T_DOUBLE:
1584       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1585       break;
1586     default:
1587       assert(false,"Should not reach here.");
1588       break;
1589   }
1590 }
1591 
1592 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1593   switch(typ) {
1594     case T_INT:
1595       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1596       break;
1597     case T_FLOAT:
1598       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1599       break;
1600     case T_LONG:
1601       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1602       break;
1603     case T_DOUBLE:
1604       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1605       break;
1606     default:
1607       assert(false,"Should not reach here.");
1608       break;
1609   }
1610 }
1611 
1612 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1613   switch(typ) {
1614     case T_INT:
1615       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1616       break;
1617     case T_FLOAT:
1618       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1619       break;
1620     case T_LONG:
1621       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1622       break;
1623     case T_DOUBLE:
1624       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1625       break;
1626     default:
1627       assert(false,"Should not reach here.");
1628       break;
1629   }
1630 }
1631 
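     // Turn a vector of booleans (0 or 1 per byte) into a vector mask: negating the
     // bytes yields 0x00 or 0xFF, which is then sign-extended to the element width.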
1632 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1633   if (vlen_in_bytes <= 16) {
1634     pxor (dst, dst);
1635     psubb(dst, src);
1636     switch (elem_bt) {
1637       case T_BYTE:   /* nothing to do */ break;
1638       case T_SHORT:  pmovsxbw(dst, dst); break;
1639       case T_INT:    pmovsxbd(dst, dst); break;
1640       case T_FLOAT:  pmovsxbd(dst, dst); break;
1641       case T_LONG:   pmovsxbq(dst, dst); break;
1642       case T_DOUBLE: pmovsxbq(dst, dst); break;
1643 
1644       default: assert(false, "%s", type2name(elem_bt));
1645     }
1646   } else {
1647     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1648     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1649 
1650     vpxor (dst, dst, dst, vlen_enc);
1651     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1652 
1653     switch (elem_bt) {
1654       case T_BYTE:   /* nothing to do */            break;
1655       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1656       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1657       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1658       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1659       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1660 
1661       default: assert(false, "%s", type2name(elem_bt));
1662     }
1663   }
1664 }
1665 
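     // Same idea, but producing an AVX-512 opmask. When the byte-to-mask instruction
     // cannot be used for this configuration (novlbwdq), the booleans are widened to
     // dwords and compared against the mask-bit constant instead of using evpmovb2m.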
1666 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1667   if (novlbwdq) {
1668     vpmovsxbd(xtmp, src, vlen_enc);
1669     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1670             Assembler::eq, true, vlen_enc, noreg);
1671   } else {
1672     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1673     vpsubb(xtmp, xtmp, src, vlen_enc);
1674     evpmovb2m(dst, xtmp, vlen_enc);
1675   }
1676 }
1677 
1678 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1679   switch (vlen_in_bytes) {
1680     case 4:  movdl(dst, src);   break;
1681     case 8:  movq(dst, src);    break;
1682     case 16: movdqu(dst, src);  break;
1683     case 32: vmovdqu(dst, src); break;
1684     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1685     default: ShouldNotReachHere();
1686   }
1687 }
1688 
1689 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1690   assert(rscratch != noreg || always_reachable(src), "missing");
1691 
1692   if (reachable(src)) {
1693     load_vector(dst, as_Address(src), vlen_in_bytes);
1694   } else {
1695     lea(rscratch, src);
1696     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1701   int vlen_enc = vector_length_encoding(vlen);
1702   if (VM_Version::supports_avx()) {
1703     if (bt == T_LONG) {
1704       if (VM_Version::supports_avx2()) {
1705         vpbroadcastq(dst, src, vlen_enc);
1706       } else {
1707         vmovddup(dst, src, vlen_enc);
1708       }
1709     } else if (bt == T_DOUBLE) {
1710       if (vlen_enc != Assembler::AVX_128bit) {
1711         vbroadcastsd(dst, src, vlen_enc, noreg);
1712       } else {
1713         vmovddup(dst, src, vlen_enc);
1714       }
1715     } else {
1716       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1717         vpbroadcastd(dst, src, vlen_enc);
1718       } else {
1719         vbroadcastss(dst, src, vlen_enc);
1720       }
1721     }
1722   } else if (VM_Version::supports_sse3()) {
1723     movddup(dst, src);
1724   } else {
1725     movq(dst, src);
1726     if (vlen == 16) {
1727       punpcklqdq(dst, dst);
1728     }
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1733   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1734   int offset = exact_log2(type2aelembytes(bt)) << 6;
1735   if (is_floating_point_type(bt)) {
1736     offset += 128;
1737   }
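       // e.g. T_INT -> 2 * 64 = 128, T_FLOAT -> 2 * 64 + 128 = 256, T_DOUBLE -> 3 * 64 + 128 = 320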
1738   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1739   load_vector(dst, addr, vlen_in_bytes);
1740 }
1741 
1742 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1743 
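     // Apply a single elementwise step of the reduction to 128-bit operands
     // (scalar addss/mulss/addsd/mulsd for the floating-point add/mul cases).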
1744 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1745   int vector_len = Assembler::AVX_128bit;
1746 
1747   switch (opcode) {
1748     case Op_AndReductionV:  pand(dst, src); break;
1749     case Op_OrReductionV:   por (dst, src); break;
1750     case Op_XorReductionV:  pxor(dst, src); break;
1751     case Op_MinReductionV:
1752       switch (typ) {
1753         case T_BYTE:        pminsb(dst, src); break;
1754         case T_SHORT:       pminsw(dst, src); break;
1755         case T_INT:         pminsd(dst, src); break;
1756         case T_LONG:        assert(UseAVX > 2, "required");
1757                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1758         default:            assert(false, "wrong type");
1759       }
1760       break;
1761     case Op_MaxReductionV:
1762       switch (typ) {
1763         case T_BYTE:        pmaxsb(dst, src); break;
1764         case T_SHORT:       pmaxsw(dst, src); break;
1765         case T_INT:         pmaxsd(dst, src); break;
1766         case T_LONG:        assert(UseAVX > 2, "required");
1767                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1768         default:            assert(false, "wrong type");
1769       }
1770       break;
1771     case Op_AddReductionVF: addss(dst, src); break;
1772     case Op_AddReductionVD: addsd(dst, src); break;
1773     case Op_AddReductionVI:
1774       switch (typ) {
1775         case T_BYTE:        paddb(dst, src); break;
1776         case T_SHORT:       paddw(dst, src); break;
1777         case T_INT:         paddd(dst, src); break;
1778         default:            assert(false, "wrong type");
1779       }
1780       break;
1781     case Op_AddReductionVL: paddq(dst, src); break;
1782     case Op_MulReductionVF: mulss(dst, src); break;
1783     case Op_MulReductionVD: mulsd(dst, src); break;
1784     case Op_MulReductionVI:
1785       switch (typ) {
1786         case T_SHORT:       pmullw(dst, src); break;
1787         case T_INT:         pmulld(dst, src); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1792                             evpmullq(dst, dst, src, vector_len); break;
1793     default:                assert(false, "wrong opcode");
1794   }
1795 }
1796 
1797 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1798   int vector_len = Assembler::AVX_256bit;
1799 
1800   switch (opcode) {
1801     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1802     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1803     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1804     case Op_MinReductionV:
1805       switch (typ) {
1806         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1807         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1808         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1809         case T_LONG:        assert(UseAVX > 2, "required");
1810                             vpminsq(dst, src1, src2, vector_len); break;
1811         default:            assert(false, "wrong type");
1812       }
1813       break;
1814     case Op_MaxReductionV:
1815       switch (typ) {
1816         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1817         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1818         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1819         case T_LONG:        assert(UseAVX > 2, "required");
1820                             vpmaxsq(dst, src1, src2, vector_len); break;
1821         default:            assert(false, "wrong type");
1822       }
1823       break;
1824     case Op_AddReductionVI:
1825       switch (typ) {
1826         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1827         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1828         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1829         default:            assert(false, "wrong type");
1830       }
1831       break;
1832     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1833     case Op_MulReductionVI:
1834       switch (typ) {
1835         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1836         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1837         default:            assert(false, "wrong type");
1838       }
1839       break;
1840     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1841     default:                assert(false, "wrong opcode");
1842   }
1843 }
1844 
1845 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1846                                   XMMRegister dst, XMMRegister src,
1847                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1848   switch (opcode) {
1849     case Op_AddReductionVF:
1850     case Op_MulReductionVF:
1851       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1852       break;
1853 
1854     case Op_AddReductionVD:
1855     case Op_MulReductionVD:
1856       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1857       break;
1858 
1859     default: assert(false, "wrong opcode");
1860   }
1861 }
1862 
1863 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1864                              Register dst, Register src1, XMMRegister src2,
1865                              XMMRegister vtmp1, XMMRegister vtmp2) {
1866   switch (vlen) {
1867     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1868     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871 
1872     default: assert(false, "wrong vector length");
1873   }
1874 }
1875 
1876 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1877                              Register dst, Register src1, XMMRegister src2,
1878                              XMMRegister vtmp1, XMMRegister vtmp2) {
1879   switch (vlen) {
1880     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1881     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884 
1885     default: assert(false, "wrong vector length");
1886   }
1887 }
1888 
1889 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1890                              Register dst, Register src1, XMMRegister src2,
1891                              XMMRegister vtmp1, XMMRegister vtmp2) {
1892   switch (vlen) {
1893     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1894     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1895     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897 
1898     default: assert(false, "wrong vector length");
1899   }
1900 }
1901 
1902 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1903                              Register dst, Register src1, XMMRegister src2,
1904                              XMMRegister vtmp1, XMMRegister vtmp2) {
1905   switch (vlen) {
1906     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1907     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1908     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910 
1911     default: assert(false, "wrong vector length");
1912   }
1913 }
1914 
1915 #ifdef _LP64
1916 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1917                              Register dst, Register src1, XMMRegister src2,
1918                              XMMRegister vtmp1, XMMRegister vtmp2) {
1919   switch (vlen) {
1920     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1922     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923 
1924     default: assert(false, "wrong vector length");
1925   }
1926 }
1927 #endif // _LP64
1928 
1929 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1930   switch (vlen) {
1931     case 2:
1932       assert(vtmp2 == xnoreg, "");
1933       reduce2F(opcode, dst, src, vtmp1);
1934       break;
1935     case 4:
1936       assert(vtmp2 == xnoreg, "");
1937       reduce4F(opcode, dst, src, vtmp1);
1938       break;
1939     case 8:
1940       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1941       break;
1942     case 16:
1943       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1944       break;
1945     default: assert(false, "wrong vector length");
1946   }
1947 }
1948 
1949 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1950   switch (vlen) {
1951     case 2:
1952       assert(vtmp2 == xnoreg, "");
1953       reduce2D(opcode, dst, src, vtmp1);
1954       break;
1955     case 4:
1956       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1957       break;
1958     case 8:
1959       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1960       break;
1961     default: assert(false, "wrong vector length");
1962   }
1963 }
1964 
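     // The reduceN* helpers below repeatedly fold the upper half of the vector onto
     // the lower half (or use horizontal adds for Op_AddReductionVI) and finally
     // combine the reduced lane with the scalar input src1.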
1965 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1966   if (opcode == Op_AddReductionVI) {
1967     if (vtmp1 != src2) {
1968       movdqu(vtmp1, src2);
1969     }
1970     phaddd(vtmp1, vtmp1);
1971   } else {
1972     pshufd(vtmp1, src2, 0x1);
1973     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1974   }
1975   movdl(vtmp2, src1);
1976   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1977   movdl(dst, vtmp1);
1978 }
1979 
1980 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1981   if (opcode == Op_AddReductionVI) {
1982     if (vtmp1 != src2) {
1983       movdqu(vtmp1, src2);
1984     }
1985     phaddd(vtmp1, src2);
1986     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1987   } else {
1988     pshufd(vtmp2, src2, 0xE);
1989     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1990     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1991   }
1992 }
1993 
1994 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   if (opcode == Op_AddReductionVI) {
1996     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1997     vextracti128_high(vtmp2, vtmp1);
1998     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1999     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2000   } else {
2001     vextracti128_high(vtmp1, src2);
2002     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2003     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2004   }
2005 }
2006 
2007 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2008   vextracti64x4_high(vtmp2, src2);
2009   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2010   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2011 }
2012 
2013 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2014   pshufd(vtmp2, src2, 0x1);
2015   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2016   movdqu(vtmp1, vtmp2);
2017   psrldq(vtmp1, 2);
2018   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2019   movdqu(vtmp2, vtmp1);
2020   psrldq(vtmp2, 1);
2021   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2022   movdl(vtmp2, src1);
2023   pmovsxbd(vtmp1, vtmp1);
2024   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2025   pextrb(dst, vtmp1, 0x0);
2026   movsbl(dst, dst);
2027 }
2028 
2029 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2030   pshufd(vtmp1, src2, 0xE);
2031   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2032   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2033 }
2034 
2035 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2036   vextracti128_high(vtmp2, src2);
2037   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2038   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2039 }
2040 
2041 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2042   vextracti64x4_high(vtmp1, src2);
2043   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2044   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2045 }
2046 
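     // Byte multiply reductions are carried out in the 16-bit domain, since x86 has
     // no packed byte multiply; the bytes are sign-extended to shorts first.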
2047 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2048   pmovsxbw(vtmp2, src2);
2049   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2050 }
2051 
2052 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2053   if (UseAVX > 1) {
2054     int vector_len = Assembler::AVX_256bit;
2055     vpmovsxbw(vtmp1, src2, vector_len);
2056     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2057   } else {
2058     pmovsxbw(vtmp2, src2);
2059     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2060     pshufd(vtmp2, src2, 0xE); // pick up the high 8 bytes of src2
2061     pmovsxbw(vtmp2, vtmp2);
2062     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2063   }
2064 }
2065 
2066 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2067   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2068     int vector_len = Assembler::AVX_512bit;
2069     vpmovsxbw(vtmp1, src2, vector_len);
2070     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2071   } else {
2072     assert(UseAVX >= 2,"Should not reach here.");
2073     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2074     vextracti128_high(vtmp2, src2);
2075     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2076   }
2077 }
2078 
2079 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2081   vextracti64x4_high(vtmp2, src2);
2082   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2083 }
2084 
2085 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2086   if (opcode == Op_AddReductionVI) {
2087     if (vtmp1 != src2) {
2088       movdqu(vtmp1, src2);
2089     }
2090     phaddw(vtmp1, vtmp1);
2091     phaddw(vtmp1, vtmp1);
2092   } else {
2093     pshufd(vtmp2, src2, 0x1);
2094     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2095     movdqu(vtmp1, vtmp2);
2096     psrldq(vtmp1, 2);
2097     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2098   }
2099   movdl(vtmp2, src1);
2100   pmovsxwd(vtmp1, vtmp1);
2101   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2102   pextrw(dst, vtmp1, 0x0);
2103   movswl(dst, dst);
2104 }
2105 
2106 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   if (opcode == Op_AddReductionVI) {
2108     if (vtmp1 != src2) {
2109       movdqu(vtmp1, src2);
2110     }
2111     phaddw(vtmp1, src2);
2112   } else {
2113     pshufd(vtmp1, src2, 0xE);
2114     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2115   }
2116   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2117 }
2118 
2119 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2120   if (opcode == Op_AddReductionVI) {
2121     int vector_len = Assembler::AVX_256bit;
2122     vphaddw(vtmp2, src2, src2, vector_len);
2123     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2124   } else {
2125     vextracti128_high(vtmp2, src2);
2126     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2127   }
2128   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2129 }
2130 
2131 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   int vector_len = Assembler::AVX_256bit;
2133   vextracti64x4_high(vtmp1, src2);
2134   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2135   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2136 }
2137 
2138 #ifdef _LP64
2139 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   pshufd(vtmp2, src2, 0xE);
2141   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2142   movdq(vtmp1, src1);
2143   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2144   movdq(dst, vtmp1);
2145 }
2146 
2147 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2148   vextracti128_high(vtmp1, src2);
2149   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2150   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151 }
2152 
2153 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2154   vextracti64x4_high(vtmp2, src2);
2155   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2156   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2157 }
2158 
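     // Build a k-register mask with the low 'len' bits set (BZHI on an all-ones value).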
2159 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2160   mov64(temp, -1L);
2161   bzhiq(temp, temp, len);
2162   kmovql(dst, temp);
2163 }
2164 #endif // _LP64
2165 
2166 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2167   reduce_operation_128(T_FLOAT, opcode, dst, src);
2168   pshufd(vtmp, src, 0x1);
2169   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2170 }
2171 
2172 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2173   reduce2F(opcode, dst, src, vtmp);
2174   pshufd(vtmp, src, 0x2);
2175   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2176   pshufd(vtmp, src, 0x3);
2177   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2178 }
2179 
2180 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2181   reduce4F(opcode, dst, src, vtmp2);
2182   vextractf128_high(vtmp2, src);
2183   reduce4F(opcode, dst, vtmp2, vtmp1);
2184 }
2185 
2186 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2187   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2188   vextracti64x4_high(vtmp1, src);
2189   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2190 }
2191 
2192 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2193   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2194   pshufd(vtmp, src, 0xE);
2195   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2196 }
2197 
2198 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2199   reduce2D(opcode, dst, src, vtmp2);
2200   vextractf128_high(vtmp2, src);
2201   reduce2D(opcode, dst, vtmp2, vtmp1);
2202 }
2203 
2204 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2205   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2206   vextracti64x4_high(vtmp1, src);
2207   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2208 }
2209 
2210 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2211   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2212 }
2213 
2214 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2215   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2216 }
2217 
2218 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2219                                  int vec_enc) {
2220   switch(elem_bt) {
2221     case T_INT:
2222     case T_FLOAT:
2223       vmaskmovps(dst, src, mask, vec_enc);
2224       break;
2225     case T_LONG:
2226     case T_DOUBLE:
2227       vmaskmovpd(dst, src, mask, vec_enc);
2228       break;
2229     default:
2230       fatal("Unsupported type %s", type2name(elem_bt));
2231       break;
2232   }
2233 }
2234 
2235 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2236                                  int vec_enc) {
2237   switch(elem_bt) {
2238     case T_INT:
2239     case T_FLOAT:
2240       vmaskmovps(dst, src, mask, vec_enc);
2241       break;
2242     case T_LONG:
2243     case T_DOUBLE:
2244       vmaskmovpd(dst, src, mask, vec_enc);
2245       break;
2246     default:
2247       fatal("Unsupported type %s", type2name(elem_bt));
2248       break;
2249   }
2250 }
2251 
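     // Min/max reduction for floats: log2(vlen) rounds, each extracting the upper half
     // of the working vector (or permuting within a 128-bit lane) and combining it with
     // the lower half via vminmax_fp, which honors Java's NaN and signed-zero rules.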
2252 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2253                                           XMMRegister dst, XMMRegister src,
2254                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2255                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2256   const int permconst[] = {1, 14};
2257   XMMRegister wsrc = src;
2258   XMMRegister wdst = xmm_0;
2259   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2260 
2261   int vlen_enc = Assembler::AVX_128bit;
2262   if (vlen == 16) {
2263     vlen_enc = Assembler::AVX_256bit;
2264   }
2265 
2266   for (int i = log2(vlen) - 1; i >=0; i--) {
2267     if (i == 0 && !is_dst_valid) {
2268       wdst = dst;
2269     }
2270     if (i == 3) {
2271       vextracti64x4_high(wtmp, wsrc);
2272     } else if (i == 2) {
2273       vextracti128_high(wtmp, wsrc);
2274     } else { // i = [0,1]
2275       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2276     }
2277     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2278     wsrc = wdst;
2279     vlen_enc = Assembler::AVX_128bit;
2280   }
2281   if (is_dst_valid) {
2282     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2283   }
2284 }
2285 
2286 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2287                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2288                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2289   XMMRegister wsrc = src;
2290   XMMRegister wdst = xmm_0;
2291   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2292   int vlen_enc = Assembler::AVX_128bit;
2293   if (vlen == 8) {
2294     vlen_enc = Assembler::AVX_256bit;
2295   }
2296   for (int i = log2(vlen) - 1; i >=0; i--) {
2297     if (i == 0 && !is_dst_valid) {
2298       wdst = dst;
2299     }
2300     if (i == 1) {
2301       vextracti128_high(wtmp, wsrc);
2302     } else if (i == 2) {
2303       vextracti64x4_high(wtmp, wsrc);
2304     } else {
2305       assert(i == 0, "%d", i);
2306       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2307     }
2308     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2309     wsrc = wdst;
2310     vlen_enc = Assembler::AVX_128bit;
2311   }
2312   if (is_dst_valid) {
2313     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2314   }
2315 }
2316 
2317 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2318   switch (bt) {
2319     case T_BYTE:  pextrb(dst, src, idx); break;
2320     case T_SHORT: pextrw(dst, src, idx); break;
2321     case T_INT:   pextrd(dst, src, idx); break;
2322     case T_LONG:  pextrq(dst, src, idx); break;
2323 
2324     default:
2325       assert(false,"Should not reach here.");
2326       break;
2327   }
2328 }
2329 
2330 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2331   int esize =  type2aelembytes(typ);
2332   int elem_per_lane = 16/esize;
2333   int lane = elemindex / elem_per_lane;
2334   int eindex = elemindex % elem_per_lane;
2335 
2336   if (lane >= 2) {
2337     assert(UseAVX > 2, "required");
2338     vextractf32x4(dst, src, lane & 3);
2339     return dst;
2340   } else if (lane > 0) {
2341     assert(UseAVX > 0, "required");
2342     vextractf128(dst, src, lane);
2343     return dst;
2344   } else {
2345     return src;
2346   }
2347 }
2348 
2349 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2350   if (typ == T_BYTE) {
2351     movsbl(dst, dst);
2352   } else if (typ == T_SHORT) {
2353     movswl(dst, dst);
2354   }
2355 }
2356 
2357 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2358   int esize =  type2aelembytes(typ);
2359   int elem_per_lane = 16/esize;
2360   int eindex = elemindex % elem_per_lane;
2361   assert(is_integral_type(typ),"required");
2362 
2363   if (eindex == 0) {
2364     if (typ == T_LONG) {
2365       movq(dst, src);
2366     } else {
2367       movdl(dst, src);
2368       movsxl(typ, dst);
2369     }
2370   } else {
2371     extract(typ, dst, src, eindex);
2372     movsxl(typ, dst);
2373   }
2374 }
2375 
2376 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2377   int esize =  type2aelembytes(typ);
2378   int elem_per_lane = 16/esize;
2379   int eindex = elemindex % elem_per_lane;
2380   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2381 
2382   if (eindex == 0) {
2383     movq(dst, src);
2384   } else {
2385     if (typ == T_FLOAT) {
2386       if (UseAVX == 0) {
2387         movdqu(dst, src);
2388         shufps(dst, dst, eindex);
2389       } else {
2390         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2391       }
2392     } else {
2393       if (UseAVX == 0) {
2394         movdqu(dst, src);
2395         psrldq(dst, eindex*esize);
2396       } else {
2397         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2398       }
2399       movq(dst, dst);
2400     }
2401   }
2402   // Zero upper bits
2403   if (typ == T_FLOAT) {
2404     if (UseAVX == 0) {
2405       assert(vtmp != xnoreg, "required.");
2406       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2407       pand(dst, vtmp);
2408     } else {
2409       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2410     }
2411   }
2412 }
2413 
2414 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2415   switch(typ) {
2416     case T_BYTE:
2417     case T_BOOLEAN:
2418       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2419       break;
2420     case T_SHORT:
2421     case T_CHAR:
2422       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2423       break;
2424     case T_INT:
2425     case T_FLOAT:
2426       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2427       break;
2428     case T_LONG:
2429     case T_DOUBLE:
2430       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2431       break;
2432     default:
2433       assert(false,"Should not reach here.");
2434       break;
2435   }
2436 }
2437 
2438 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2439   assert(rscratch != noreg || always_reachable(src2), "missing");
2440 
2441   switch(typ) {
2442     case T_BOOLEAN:
2443     case T_BYTE:
2444       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2445       break;
2446     case T_CHAR:
2447     case T_SHORT:
2448       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2449       break;
2450     case T_INT:
2451     case T_FLOAT:
2452       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2453       break;
2454     case T_LONG:
2455     case T_DOUBLE:
2456       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2457       break;
2458     default:
2459       assert(false,"Should not reach here.");
2460       break;
2461   }
2462 }
2463 
2464 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2465   switch(typ) {
2466     case T_BYTE:
2467       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2468       break;
2469     case T_SHORT:
2470       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2471       break;
2472     case T_INT:
2473     case T_FLOAT:
2474       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2475       break;
2476     case T_LONG:
2477     case T_DOUBLE:
2478       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2479       break;
2480     default:
2481       assert(false,"Should not reach here.");
2482       break;
2483   }
2484 }
2485 
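     // Set CPU flags by testing src1 against src2 with ptest/vtestps. For vectors
     // shorter than 16 bytes the low part of src1 is duplicated first so the test
     // sees a full register.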
2486 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2487   assert(vlen_in_bytes <= 32, "");
2488   int esize = type2aelembytes(bt);
2489   if (vlen_in_bytes == 32) {
2490     assert(vtmp == xnoreg, "required.");
2491     if (esize >= 4) {
2492       vtestps(src1, src2, AVX_256bit);
2493     } else {
2494       vptest(src1, src2, AVX_256bit);
2495     }
2496     return;
2497   }
2498   if (vlen_in_bytes < 16) {
2499     // Duplicate the lower part to fill the whole register;
2500     // there is no need to do so for src2.
2501     assert(vtmp != xnoreg, "required");
2502     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2503     pshufd(vtmp, src1, shuffle_imm);
2504   } else {
2505     assert(vtmp == xnoreg, "required");
2506     vtmp = src1;
2507   }
2508   if (esize >= 4 && VM_Version::supports_avx()) {
2509     vtestps(vtmp, src2, AVX_128bit);
2510   } else {
2511     ptest(vtmp, src2);
2512   }
2513 }
2514 
2515 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2516   assert(UseAVX >= 2, "required");
2517 #ifdef ASSERT
2518   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2519   bool is_bw_supported = VM_Version::supports_avx512bw();
2520   if (is_bw && !is_bw_supported) {
2521     assert(vlen_enc != Assembler::AVX_512bit, "required");
2522     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2523            "XMM register should be 0-15");
2524   }
2525 #endif // ASSERT
2526   switch (elem_bt) {
2527     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2528     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2529     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2530     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2531     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2532     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2533     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2534   }
2535 }
2536 
2537 #ifdef _LP64
2538 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2539   assert(UseAVX >= 2, "required");
2540   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2541   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2542   if ((UseAVX > 2) &&
2543       (!is_bw || VM_Version::supports_avx512bw()) &&
2544       (!is_vl || VM_Version::supports_avx512vl())) {
2545     switch (elem_bt) {
2546       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2547       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2548       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2549       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2550       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2551     }
2552   } else {
2553     assert(vlen_enc != Assembler::AVX_512bit, "required");
2554     assert((dst->encoding() < 16),"XMM register should be 0-15");
2555     switch (elem_bt) {
2556       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2557       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2558       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2559       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2560       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2561       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2562       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2563     }
2564   }
2565 }
2566 #endif
2567 
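     // Sign-extend byte lanes to the requested element type, converting through int
     // for floating-point destinations.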
2568 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2569   switch (to_elem_bt) {
2570     case T_SHORT:
2571       vpmovsxbw(dst, src, vlen_enc);
2572       break;
2573     case T_INT:
2574       vpmovsxbd(dst, src, vlen_enc);
2575       break;
2576     case T_FLOAT:
2577       vpmovsxbd(dst, src, vlen_enc);
2578       vcvtdq2ps(dst, dst, vlen_enc);
2579       break;
2580     case T_LONG:
2581       vpmovsxbq(dst, src, vlen_enc);
2582       break;
2583     case T_DOUBLE: {
2584       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2585       vpmovsxbd(dst, src, mid_vlen_enc);
2586       vcvtdq2pd(dst, dst, vlen_enc);
2587       break;
2588     }
2589     default:
2590       fatal("Unsupported type %s", type2name(to_elem_bt));
2591       break;
2592   }
2593 }
2594 
2595 //-------------------------------------------------------------------------------------------
2596 
2597 // IndexOf for constant substrings with size >= 8 chars
2598 // which don't need to be loaded through the stack.
2599 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2600                                          Register cnt1, Register cnt2,
2601                                          int int_cnt2,  Register result,
2602                                          XMMRegister vec, Register tmp,
2603                                          int ae) {
2604   ShortBranchVerifier sbv(this);
2605   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2606   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2607 
2608   // This method uses the pcmpestri instruction with bound registers
2609   //   inputs:
2610   //     xmm - substring
2611   //     rax - substring length (elements count)
2612   //     mem - scanned string
2613   //     rdx - string length (elements count)
2614   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2615   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2616   //   outputs:
2617   //     rcx - matched index in string
2618   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2619   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2620   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2621   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2622   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2623 
2624   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2625         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2626         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2627 
2628   // Note, inline_string_indexOf() generates checks:
2629   // if (substr.count > string.count) return -1;
2630   // if (substr.count == 0) return 0;
2631   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2632 
2633   // Load substring.
2634   if (ae == StrIntrinsicNode::UL) {
2635     pmovzxbw(vec, Address(str2, 0));
2636   } else {
2637     movdqu(vec, Address(str2, 0));
2638   }
2639   movl(cnt2, int_cnt2);
2640   movptr(result, str1); // string addr
2641 
2642   if (int_cnt2 > stride) {
2643     jmpb(SCAN_TO_SUBSTR);
2644 
2645     // Reload substr for rescan; this code
2646     // is executed only for large substrings (> 8 chars).
2647     bind(RELOAD_SUBSTR);
2648     if (ae == StrIntrinsicNode::UL) {
2649       pmovzxbw(vec, Address(str2, 0));
2650     } else {
2651       movdqu(vec, Address(str2, 0));
2652     }
2653     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2654 
2655     bind(RELOAD_STR);
2656     // We came here after the beginning of the substring was
2657     // matched but the rest of it was not, so we need to search
2658     // again. Start from the next element after the previous match.
2659 
2660     // cnt2 is the number of remaining substring elements and
2661     // cnt1 is the number of remaining string elements when the compare failed.
2662     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2663     subl(cnt1, cnt2);
2664     addl(cnt1, int_cnt2);
2665     movl(cnt2, int_cnt2); // Now restore cnt2
2666 
2667     decrementl(cnt1);     // Shift to next element
2668     cmpl(cnt1, cnt2);
2669     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2670 
2671     addptr(result, (1<<scale1));
2672 
2673   } // (int_cnt2 > 8)
2674 
2675   // Scan string for start of substr in 16-byte vectors
2676   bind(SCAN_TO_SUBSTR);
2677   pcmpestri(vec, Address(result, 0), mode);
2678   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2679   subl(cnt1, stride);
2680   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2681   cmpl(cnt1, cnt2);
2682   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2683   addptr(result, 16);
2684   jmpb(SCAN_TO_SUBSTR);
2685 
2686   // Found a potential substr
2687   bind(FOUND_CANDIDATE);
2688   // Matched whole vector if first element matched (tmp(rcx) == 0).
2689   if (int_cnt2 == stride) {
2690     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2691   } else { // int_cnt2 > 8
2692     jccb(Assembler::overflow, FOUND_SUBSTR);
2693   }
2694   // After pcmpestri tmp(rcx) contains matched element index
2695   // Compute start addr of substr
2696   lea(result, Address(result, tmp, scale1));
2697 
2698   // Make sure string is still long enough
2699   subl(cnt1, tmp);
2700   cmpl(cnt1, cnt2);
2701   if (int_cnt2 == stride) {
2702     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2703   } else { // int_cnt2 > 8
2704     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2705   }
2706   // Left less than substring.
2707 
2708   bind(RET_NOT_FOUND);
2709   movl(result, -1);
2710   jmp(EXIT);
2711 
2712   if (int_cnt2 > stride) {
2713     // This code is optimized for the case when whole substring
2714     // is matched if its head is matched.
2715     bind(MATCH_SUBSTR_HEAD);
2716     pcmpestri(vec, Address(result, 0), mode);
2717     // Reload only the string if it does not match
2718     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2719 
2720     Label CONT_SCAN_SUBSTR;
2721     // Compare the rest of substring (> 8 chars).
2722     bind(FOUND_SUBSTR);
2723     // First 8 chars are already matched.
2724     negptr(cnt2);
2725     addptr(cnt2, stride);
2726 
2727     bind(SCAN_SUBSTR);
2728     subl(cnt1, stride);
2729     cmpl(cnt2, -stride); // Do not read beyond substring
2730     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2731     // Back-up strings to avoid reading beyond substring:
2732     // cnt1 = cnt1 - cnt2 + 8
2733     addl(cnt1, cnt2); // cnt2 is negative
2734     addl(cnt1, stride);
2735     movl(cnt2, stride); negptr(cnt2);
2736     bind(CONT_SCAN_SUBSTR);
2737     if (int_cnt2 < (int)G) {
2738       int tail_off1 = int_cnt2<<scale1;
2739       int tail_off2 = int_cnt2<<scale2;
2740       if (ae == StrIntrinsicNode::UL) {
2741         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2742       } else {
2743         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2744       }
2745       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2746     } else {
2747       // calculate index in register to avoid integer overflow (int_cnt2*2)
2748       movl(tmp, int_cnt2);
2749       addptr(tmp, cnt2);
2750       if (ae == StrIntrinsicNode::UL) {
2751         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2752       } else {
2753         movdqu(vec, Address(str2, tmp, scale2, 0));
2754       }
2755       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2756     }
2757     // Need to reload string pointers if we did not match the whole vector
2758     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2759     addptr(cnt2, stride);
2760     jcc(Assembler::negative, SCAN_SUBSTR);
2761     // Fall through if found full substring
2762 
2763   } // (int_cnt2 > 8)
2764 
2765   bind(RET_FOUND);
2766   // Found result if we matched full small substring.
2767   // Compute substr offset
2768   subptr(result, str1);
2769   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2770     shrl(result, 1); // index
2771   }
2772   bind(EXIT);
2773 
2774 } // string_indexofC8
2775 
2776 // Small strings are loaded through the stack if they cross a page boundary.
2777 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2778                                        Register cnt1, Register cnt2,
2779                                        int int_cnt2,  Register result,
2780                                        XMMRegister vec, Register tmp,
2781                                        int ae) {
2782   ShortBranchVerifier sbv(this);
2783   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2784   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2785 
2786   //
2787   // int_cnt2 is length of small (< 8 chars) constant substring
2788   // or (-1) for non constant substring in which case its length
2789   // is in cnt2 register.
2790   //
2791   // Note, inline_string_indexOf() generates checks:
2792   // if (substr.count > string.count) return -1;
2793   // if (substr.count == 0) return 0;
2794   //
2795   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2796   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2797   // This method uses the pcmpestri instruction with bound registers
2798   //   inputs:
2799   //     xmm - substring
2800   //     rax - substring length (elements count)
2801   //     mem - scanned string
2802   //     rdx - string length (elements count)
2803   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2804   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2805   //   outputs:
2806   //     rcx - matched index in string
2807   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2808   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2809   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2810   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
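       //
       // Scalar reference of what this stub computes (illustrative Java only;
       // the names below are not actual JDK identifiers):
       //
       //   static int indexOf(char[] str, int strLen, char[] sub, int subLen) {
       //     outer:
       //     for (int i = 0; i <= strLen - subLen; i++) {
       //       for (int j = 0; j < subLen; j++) {
       //         if (str[i + j] != sub[j]) continue outer;
       //       }
       //       return i;   // first match
       //     }
       //     return -1;    // not found
       //   }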
2811 
2812   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2813         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2814         FOUND_CANDIDATE;
2815 
2816   { //========================================================
2817     // We don't know where these strings are located
2818     // and we can't read beyond them. Load them through stack.
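         // (pcmpestri always accesses a full 16-byte memory operand regardless of
         //  the length registers, so reading a short string that ends close to a
         //  page boundary could fault; copying it to the stack guarantees that 16
         //  bytes are readable. See the page-boundary checks below.)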
2819     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2820 
2821     movptr(tmp, rsp); // save old SP
2822 
2823     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2824       if (int_cnt2 == (1>>scale2)) { // One byte
2825         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2826         load_unsigned_byte(result, Address(str2, 0));
2827         movdl(vec, result); // move 32 bits
2828       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2829         // Not enough header space in 32-bit VM: 12+3 = 15.
2830         movl(result, Address(str2, -1));
2831         shrl(result, 8);
2832         movdl(vec, result); // move 32 bits
2833       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2834         load_unsigned_short(result, Address(str2, 0));
2835         movdl(vec, result); // move 32 bits
2836       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2837         movdl(vec, Address(str2, 0)); // move 32 bits
2838       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2839         movq(vec, Address(str2, 0));  // move 64 bits
2840       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2841         // Array header size is 12 bytes in 32-bit VM
2842         // + 6 bytes for 3 chars == 18 bytes,
2843         // enough space to load vec and shift.
2844         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2845         if (ae == StrIntrinsicNode::UL) {
2846           int tail_off = int_cnt2-8;
2847           pmovzxbw(vec, Address(str2, tail_off));
2848           psrldq(vec, -2*tail_off);
2849         }
2850         else {
2851           int tail_off = int_cnt2*(1<<scale2);
2852           movdqu(vec, Address(str2, tail_off-16));
2853           psrldq(vec, 16-tail_off);
2854         }
2855       }
2856     } else { // not constant substring
2857       cmpl(cnt2, stride);
2858       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2859 
2860       // We can read beyond the string if str+16 does not cross a page boundary
2861       // since heaps are aligned and mapped by pages.
2862       assert(os::vm_page_size() < (int)G, "default page should be small");
2863       movl(result, str2); // We need only low 32 bits
2864       andl(result, ((int)os::vm_page_size()-1));
2865       cmpl(result, ((int)os::vm_page_size()-16));
2866       jccb(Assembler::belowEqual, CHECK_STR);
2867 
2868       // Move small strings to the stack to allow loading 16 bytes into vec.
2869       subptr(rsp, 16);
2870       int stk_offset = wordSize-(1<<scale2);
2871       push(cnt2);
2872 
2873       bind(COPY_SUBSTR);
2874       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2875         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2876         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2877       } else if (ae == StrIntrinsicNode::UU) {
2878         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2879         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2880       }
2881       decrement(cnt2);
2882       jccb(Assembler::notZero, COPY_SUBSTR);
2883 
2884       pop(cnt2);
2885       movptr(str2, rsp);  // New substring address
2886     } // non constant
2887 
2888     bind(CHECK_STR);
2889     cmpl(cnt1, stride);
2890     jccb(Assembler::aboveEqual, BIG_STRINGS);
2891 
2892     // Check cross page boundary.
2893     movl(result, str1); // We need only low 32 bits
2894     andl(result, ((int)os::vm_page_size()-1));
2895     cmpl(result, ((int)os::vm_page_size()-16));
2896     jccb(Assembler::belowEqual, BIG_STRINGS);
2897 
2898     subptr(rsp, 16);
2899     int stk_offset = -(1<<scale1);
2900     if (int_cnt2 < 0) { // not constant
2901       push(cnt2);
2902       stk_offset += wordSize;
2903     }
2904     movl(cnt2, cnt1);
2905 
2906     bind(COPY_STR);
2907     if (ae == StrIntrinsicNode::LL) {
2908       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2909       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2910     } else {
2911       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2912       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2913     }
2914     decrement(cnt2);
2915     jccb(Assembler::notZero, COPY_STR);
2916 
2917     if (int_cnt2 < 0) { // not constant
2918       pop(cnt2);
2919     }
2920     movptr(str1, rsp);  // New string address
2921 
2922     bind(BIG_STRINGS);
2923     // Load substring.
2924     if (int_cnt2 < 0) { // -1
2925       if (ae == StrIntrinsicNode::UL) {
2926         pmovzxbw(vec, Address(str2, 0));
2927       } else {
2928         movdqu(vec, Address(str2, 0));
2929       }
2930       push(cnt2);       // substr count
2931       push(str2);       // substr addr
2932       push(str1);       // string addr
2933     } else {
2934       // Small (< 8 chars) constant substrings are loaded already.
2935       movl(cnt2, int_cnt2);
2936     }
2937     push(tmp);  // original SP
2938 
2939   } // Finished loading
2940 
2941   //========================================================
2942   // Start search
2943   //
2944 
2945   movptr(result, str1); // string addr
2946 
2947   if (int_cnt2  < 0) {  // Only for non constant substring
2948     jmpb(SCAN_TO_SUBSTR);
2949 
2950     // SP saved at sp+0
2951     // String saved at sp+1*wordSize
2952     // Substr saved at sp+2*wordSize
2953     // Substr count saved at sp+3*wordSize
2954 
2955     // Reload substr for rescan, this code
2956     // is executed only for large substrings (> 8 chars)
2957     bind(RELOAD_SUBSTR);
2958     movptr(str2, Address(rsp, 2*wordSize));
2959     movl(cnt2, Address(rsp, 3*wordSize));
2960     if (ae == StrIntrinsicNode::UL) {
2961       pmovzxbw(vec, Address(str2, 0));
2962     } else {
2963       movdqu(vec, Address(str2, 0));
2964     }
2965     // We came here after the beginning of the substring was
2966     // matched but the rest of it was not, so we need to search
2967     // again. Start from the next element after the previous match.
2968     subptr(str1, result); // Restore counter
2969     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2970       shrl(str1, 1);
2971     }
2972     addl(cnt1, str1);
2973     decrementl(cnt1);   // Shift to next element
2974     cmpl(cnt1, cnt2);
2975     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2976 
2977     addptr(result, (1<<scale1));
2978   } // non constant
2979 
2980   // Scan string for start of substr in 16-byte vectors
2981   bind(SCAN_TO_SUBSTR);
2982   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2983   pcmpestri(vec, Address(result, 0), mode);
2984   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2985   subl(cnt1, stride);
2986   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2987   cmpl(cnt1, cnt2);
2988   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2989   addptr(result, 16);
2990 
2991   bind(ADJUST_STR);
2992   cmpl(cnt1, stride); // Do not read beyond string
2993   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2994   // Back-up string to avoid reading beyond string.
2995   lea(result, Address(result, cnt1, scale1, -16));
2996   movl(cnt1, stride);
2997   jmpb(SCAN_TO_SUBSTR);
2998 
2999   // Found a potential substr
3000   bind(FOUND_CANDIDATE);
3001   // After pcmpestri tmp(rcx) contains matched element index
3002 
3003   // Make sure string is still long enough
3004   subl(cnt1, tmp);
3005   cmpl(cnt1, cnt2);
3006   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3007   // Left less than substring.
3008 
3009   bind(RET_NOT_FOUND);
3010   movl(result, -1);
3011   jmp(CLEANUP);
3012 
3013   bind(FOUND_SUBSTR);
3014   // Compute start addr of substr
3015   lea(result, Address(result, tmp, scale1));
3016   if (int_cnt2 > 0) { // Constant substring
3017     // Repeat search for small substring (< 8 chars)
3018     // from new point without reloading substring.
3019     // Have to check that we don't read beyond string.
3020     cmpl(tmp, stride-int_cnt2);
3021     jccb(Assembler::greater, ADJUST_STR);
3022     // Fall through if matched whole substring.
3023   } else { // non constant
3024     assert(int_cnt2 == -1, "should be != 0");
3025 
3026     addl(tmp, cnt2);
3027     // Found result if we matched whole substring.
3028     cmpl(tmp, stride);
3029     jcc(Assembler::lessEqual, RET_FOUND);
3030 
3031     // Repeat search for small substring (<= 8 chars)
3032     // from new point 'str1' without reloading substring.
3033     cmpl(cnt2, stride);
3034     // Have to check that we don't read beyond string.
3035     jccb(Assembler::lessEqual, ADJUST_STR);
3036 
3037     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3038     // Compare the rest of substring (> 8 chars).
3039     movptr(str1, result);
3040 
3041     cmpl(tmp, cnt2);
3042     // First 8 chars are already matched.
3043     jccb(Assembler::equal, CHECK_NEXT);
3044 
3045     bind(SCAN_SUBSTR);
3046     pcmpestri(vec, Address(str1, 0), mode);
3047     // Need to reload string pointers if we did not match the whole vector
3048     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3049 
3050     bind(CHECK_NEXT);
3051     subl(cnt2, stride);
3052     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3053     addptr(str1, 16);
3054     if (ae == StrIntrinsicNode::UL) {
3055       addptr(str2, 8);
3056     } else {
3057       addptr(str2, 16);
3058     }
3059     subl(cnt1, stride);
3060     cmpl(cnt2, stride); // Do not read beyond substring
3061     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3062     // Back-up strings to avoid reading beyond substring.
3063 
3064     if (ae == StrIntrinsicNode::UL) {
3065       lea(str2, Address(str2, cnt2, scale2, -8));
3066       lea(str1, Address(str1, cnt2, scale1, -16));
3067     } else {
3068       lea(str2, Address(str2, cnt2, scale2, -16));
3069       lea(str1, Address(str1, cnt2, scale1, -16));
3070     }
3071     subl(cnt1, cnt2);
3072     movl(cnt2, stride);
3073     addl(cnt1, stride);
3074     bind(CONT_SCAN_SUBSTR);
3075     if (ae == StrIntrinsicNode::UL) {
3076       pmovzxbw(vec, Address(str2, 0));
3077     } else {
3078       movdqu(vec, Address(str2, 0));
3079     }
3080     jmp(SCAN_SUBSTR);
3081 
3082     bind(RET_FOUND_LONG);
3083     movptr(str1, Address(rsp, wordSize));
3084   } // non constant
3085 
3086   bind(RET_FOUND);
3087   // Compute substr offset
3088   subptr(result, str1);
3089   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3090     shrl(result, 1); // index
3091   }
3092   bind(CLEANUP);
3093   pop(rsp); // restore SP
3094 
3095 } // string_indexof
3096 
3097 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3098                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3099   ShortBranchVerifier sbv(this);
3100   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3101 
3102   int stride = 8;
3103 
3104   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3105         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3106         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3107         FOUND_SEQ_CHAR, DONE_LABEL;
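       //
       // Scalar reference (illustrative only): return the index of the first
       // occurrence of the char 'ch' in 'str1' of length 'cnt1', or -1.
       //
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;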
3108 
3109   movptr(result, str1);
3110   if (UseAVX >= 2) {
3111     cmpl(cnt1, stride);
3112     jcc(Assembler::less, SCAN_TO_CHAR);
3113     cmpl(cnt1, 2*stride);
3114     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3115     movdl(vec1, ch);
3116     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3117     vpxor(vec2, vec2);
3118     movl(tmp, cnt1);
3119     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3120     andl(cnt1,0x0000000F);  //tail count (in chars)
3121 
3122     bind(SCAN_TO_16_CHAR_LOOP);
3123     vmovdqu(vec3, Address(result, 0));
3124     vpcmpeqw(vec3, vec3, vec1, 1);
3125     vptest(vec2, vec3);
3126     jcc(Assembler::carryClear, FOUND_CHAR);
3127     addptr(result, 32);
3128     subl(tmp, 2*stride);
3129     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3130     jmp(SCAN_TO_8_CHAR);
3131     bind(SCAN_TO_8_CHAR_INIT);
3132     movdl(vec1, ch);
3133     pshuflw(vec1, vec1, 0x00);
3134     pshufd(vec1, vec1, 0);
3135     pxor(vec2, vec2);
3136   }
3137   bind(SCAN_TO_8_CHAR);
3138   cmpl(cnt1, stride);
3139   jcc(Assembler::less, SCAN_TO_CHAR);
3140   if (UseAVX < 2) {
3141     movdl(vec1, ch);
3142     pshuflw(vec1, vec1, 0x00);
3143     pshufd(vec1, vec1, 0);
3144     pxor(vec2, vec2);
3145   }
3146   movl(tmp, cnt1);
3147   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3148   andl(cnt1,0x00000007);  //tail count (in chars)
3149 
3150   bind(SCAN_TO_8_CHAR_LOOP);
3151   movdqu(vec3, Address(result, 0));
3152   pcmpeqw(vec3, vec1);
3153   ptest(vec2, vec3);
3154   jcc(Assembler::carryClear, FOUND_CHAR);
3155   addptr(result, 16);
3156   subl(tmp, stride);
3157   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3158   bind(SCAN_TO_CHAR);
3159   testl(cnt1, cnt1);
3160   jcc(Assembler::zero, RET_NOT_FOUND);
3161   bind(SCAN_TO_CHAR_LOOP);
3162   load_unsigned_short(tmp, Address(result, 0));
3163   cmpl(ch, tmp);
3164   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3165   addptr(result, 2);
3166   subl(cnt1, 1);
3167   jccb(Assembler::zero, RET_NOT_FOUND);
3168   jmp(SCAN_TO_CHAR_LOOP);
3169 
3170   bind(RET_NOT_FOUND);
3171   movl(result, -1);
3172   jmpb(DONE_LABEL);
3173 
3174   bind(FOUND_CHAR);
3175   if (UseAVX >= 2) {
3176     vpmovmskb(tmp, vec3);
3177   } else {
3178     pmovmskb(tmp, vec3);
3179   }
3180   bsfl(ch, tmp);
3181   addptr(result, ch);
3182 
3183   bind(FOUND_SEQ_CHAR);
3184   subptr(result, str1);
3185   shrl(result, 1);
3186 
3187   bind(DONE_LABEL);
3188 } // string_indexof_char
3189 
3190 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3191                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3192   ShortBranchVerifier sbv(this);
3193   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3194 
3195   int stride = 16;
3196 
3197   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3198         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3199         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3200         FOUND_SEQ_CHAR, DONE_LABEL;
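       //
       // Latin-1 (byte) variant of string_indexof_char above: same structure, but
       // the needle is a single byte, comparisons use vpcmpeqb/pcmpeqb, and the
       // byte is broadcast with pshufb against a zeroed shuffle mask instead of
       // pshuflw/pshufd.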
3201 
3202   movptr(result, str1);
3203   if (UseAVX >= 2) {
3204     cmpl(cnt1, stride);
3205     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3206     cmpl(cnt1, stride*2);
3207     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3208     movdl(vec1, ch);
3209     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3210     vpxor(vec2, vec2);
3211     movl(tmp, cnt1);
3212     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3213     andl(cnt1,0x0000001F);  //tail count (in chars)
3214 
3215     bind(SCAN_TO_32_CHAR_LOOP);
3216     vmovdqu(vec3, Address(result, 0));
3217     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3218     vptest(vec2, vec3);
3219     jcc(Assembler::carryClear, FOUND_CHAR);
3220     addptr(result, 32);
3221     subl(tmp, stride*2);
3222     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3223     jmp(SCAN_TO_16_CHAR);
3224 
3225     bind(SCAN_TO_16_CHAR_INIT);
3226     movdl(vec1, ch);
3227     pxor(vec2, vec2);
3228     pshufb(vec1, vec2);
3229   }
3230 
3231   bind(SCAN_TO_16_CHAR);
3232   cmpl(cnt1, stride);
3233   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3234   if (UseAVX < 2) {
3235     movdl(vec1, ch);
3236     pxor(vec2, vec2);
3237     pshufb(vec1, vec2);
3238   }
3239   movl(tmp, cnt1);
3240   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3241   andl(cnt1,0x0000000F);  //tail count (in bytes)
3242 
3243   bind(SCAN_TO_16_CHAR_LOOP);
3244   movdqu(vec3, Address(result, 0));
3245   pcmpeqb(vec3, vec1);
3246   ptest(vec2, vec3);
3247   jcc(Assembler::carryClear, FOUND_CHAR);
3248   addptr(result, 16);
3249   subl(tmp, stride);
3250   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3251 
3252   bind(SCAN_TO_CHAR_INIT);
3253   testl(cnt1, cnt1);
3254   jcc(Assembler::zero, RET_NOT_FOUND);
3255   bind(SCAN_TO_CHAR_LOOP);
3256   load_unsigned_byte(tmp, Address(result, 0));
3257   cmpl(ch, tmp);
3258   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3259   addptr(result, 1);
3260   subl(cnt1, 1);
3261   jccb(Assembler::zero, RET_NOT_FOUND);
3262   jmp(SCAN_TO_CHAR_LOOP);
3263 
3264   bind(RET_NOT_FOUND);
3265   movl(result, -1);
3266   jmpb(DONE_LABEL);
3267 
3268   bind(FOUND_CHAR);
3269   if (UseAVX >= 2) {
3270     vpmovmskb(tmp, vec3);
3271   } else {
3272     pmovmskb(tmp, vec3);
3273   }
3274   bsfl(ch, tmp);
3275   addptr(result, ch);
3276 
3277   bind(FOUND_SEQ_CHAR);
3278   subptr(result, str1);
3279 
3280   bind(DONE_LABEL);
3281 } // stringL_indexof_char
3282 
3283 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3284   switch (eltype) {
3285   case T_BOOLEAN: return sizeof(jboolean);
3286   case T_BYTE:  return sizeof(jbyte);
3287   case T_SHORT: return sizeof(jshort);
3288   case T_CHAR:  return sizeof(jchar);
3289   case T_INT:   return sizeof(jint);
3290   default:
3291     ShouldNotReachHere();
3292     return -1;
3293   }
3294 }
3295 
3296 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3297   switch (eltype) {
3298   // T_BOOLEAN used as surrogate for unsigned byte
3299   case T_BOOLEAN: movzbl(dst, src);   break;
3300   case T_BYTE:    movsbl(dst, src);   break;
3301   case T_SHORT:   movswl(dst, src);   break;
3302   case T_CHAR:    movzwl(dst, src);   break;
3303   case T_INT:     movl(dst, src);     break;
3304   default:
3305     ShouldNotReachHere();
3306   }
3307 }
3308 
3309 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3310   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3311 }
3312 
3313 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3314   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3315 }
3316 
3317 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3318   const int vlen = Assembler::AVX_256bit;
3319   switch (eltype) {
3320   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3321   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3322   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3323   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3324   case T_INT:
3325     // do nothing
3326     break;
3327   default:
3328     ShouldNotReachHere();
3329   }
3330 }
3331 
3332 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3333                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3334                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3335                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3336                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3337                                         BasicType eltype) {
3338   ShortBranchVerifier sbv(this);
3339   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3340   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3341   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3342 
3343   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3344         SHORT_UNROLLED_LOOP_EXIT,
3345         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3346         UNROLLED_VECTOR_LOOP_BEGIN,
3347         END;
3348   switch (eltype) {
3349   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3350   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3351   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3352   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3353   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3354   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3355   }
3356 
3357   // For "renaming" for readibility of the code
3358   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3359                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3360                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3361 
3362   const int elsize = arrays_hashcode_elsize(eltype);
3363 
3364   /*
3365     if (cnt1 >= 2) {
3366       if (cnt1 >= 32) {
3367         UNROLLED VECTOR LOOP
3368       }
3369       UNROLLED SCALAR LOOP
3370     }
3371     SINGLE SCALAR
3372    */
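       //
       // Scalar reference (illustrative only), with 'result' holding the incoming
       // initial hash value:
       //
       //   for (int i = 0; i < cnt1; i++) {
       //     result = 31 * result + ary1[i];   // element widened according to eltype
       //   }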
3373 
3374   cmpl(cnt1, 32);
3375   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3376 
3377   // cnt1 >= 32 && generate_vectorized_loop
3378   xorl(index, index);
3379 
3380   // vresult = IntVector.zero(I256);
3381   for (int idx = 0; idx < 4; idx++) {
3382     vpxor(vresult[idx], vresult[idx]);
3383   }
3384   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3385   Register bound = tmp2;
3386   Register next = tmp3;
3387   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3388   movl(next, Address(tmp2, 0));
3389   movdl(vnext, next);
3390   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3391 
3392   // index = 0;
3393   // bound = cnt1 & ~(32 - 1);
3394   movl(bound, cnt1);
3395   andl(bound, ~(32 - 1));
3396   // for (; index < bound; index += 32) {
3397   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3398   // result *= next;
3399   imull(result, next);
3400   // Loop fission to front-load the cost of fetching from memory; OOO execution
3401   // can then hopefully do a better job of prefetching.
3402   for (int idx = 0; idx < 4; idx++) {
3403     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3404   }
3405   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3406   for (int idx = 0; idx < 4; idx++) {
3407     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3408     arrays_hashcode_elvcast(vtmp[idx], eltype);
3409     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3410   }
3411   // index += 32;
3412   addl(index, 32);
3413   // index < bound;
3414   cmpl(index, bound);
3415   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3416   // }
3417 
3418   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3419   subl(cnt1, bound);
3420   // release bound
3421 
3422   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3423   for (int idx = 0; idx < 4; idx++) {
3424     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3425     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3426     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3427   }
3428   // result += vresult.reduceLanes(ADD);
3429   for (int idx = 0; idx < 4; idx++) {
3430     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3431   }
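       // At this point 'result' should hold the hash of the first 'bound' elements
       // folded onto the incoming value: each iteration multiplied the running
       // scalar by power_of_31_backwards[0] (one factor of 31 per element consumed)
       // while the four 8-lane accumulators collected the new elements, and the
       // multiplication by power_of_31_backwards[1..32] plus the lane reductions
       // above weight every element with its proper power of 31.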
3432 
3433   // } else if (cnt1 < 32) {
3434 
3435   bind(SHORT_UNROLLED_BEGIN);
3436   // int i = 1;
3437   movl(index, 1);
3438   cmpl(index, cnt1);
3439   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3440 
3441   // for (; i < cnt1 ; i += 2) {
3442   bind(SHORT_UNROLLED_LOOP_BEGIN);
3443   movl(tmp3, 961);
3444   imull(result, tmp3);
3445   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3446   movl(tmp3, tmp2);
3447   shll(tmp3, 5);
3448   subl(tmp3, tmp2);
3449   addl(result, tmp3);
3450   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3451   addl(result, tmp3);
3452   addl(index, 2);
3453   cmpl(index, cnt1);
3454   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3455 
3456   // }
3457   // if (i >= cnt1) {
3458   bind(SHORT_UNROLLED_LOOP_EXIT);
3459   jccb(Assembler::greater, END);
3460   movl(tmp2, result);
3461   shll(result, 5);
3462   subl(result, tmp2);
3463   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3464   addl(result, tmp3);
3465   // }
3466   bind(END);
3467 
3468   BLOCK_COMMENT("} // arrays_hashcode");
3469 
3470 } // arrays_hashcode
3471 
3472 // helper function for string_compare
3473 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3474                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3475                                            Address::ScaleFactor scale2, Register index, int ae) {
3476   if (ae == StrIntrinsicNode::LL) {
3477     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3478     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3479   } else if (ae == StrIntrinsicNode::UU) {
3480     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3481     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3482   } else {
3483     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3484     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3485   }
3486 }
3487 
3488 // Compare strings, used for char[] and byte[].
3489 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3490                                        Register cnt1, Register cnt2, Register result,
3491                                        XMMRegister vec1, int ae, KRegister mask) {
3492   ShortBranchVerifier sbv(this);
3493   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3494   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3495   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3496   int stride2x2 = 0x40;
3497   Address::ScaleFactor scale = Address::no_scale;
3498   Address::ScaleFactor scale1 = Address::no_scale;
3499   Address::ScaleFactor scale2 = Address::no_scale;
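       //
       // Scalar reference (illustrative only): compare element by element up to the
       // shorter length, then fall back to the length difference.
       //
       //   int lim = Math.min(len1, len2);
       //   for (int k = 0; k < lim; k++) {
       //     if (str1[k] != str2[k]) return str1[k] - str2[k];
       //   }
       //   return len1 - len2;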
3500 
3501   if (ae != StrIntrinsicNode::LL) {
3502     stride2x2 = 0x20;
3503   }
3504 
3505   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3506     shrl(cnt2, 1);
3507   }
3508   // Compute the minimum of the string lengths and the
3509   // difference of the string lengths (stack).
3510   // Do the conditional move stuff
3511   movl(result, cnt1);
3512   subl(cnt1, cnt2);
3513   push(cnt1);
3514   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3515 
3516   // Is the minimum length zero?
3517   testl(cnt2, cnt2);
3518   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3519   if (ae == StrIntrinsicNode::LL) {
3520     // Load first bytes
3521     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3522     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3523   } else if (ae == StrIntrinsicNode::UU) {
3524     // Load first characters
3525     load_unsigned_short(result, Address(str1, 0));
3526     load_unsigned_short(cnt1, Address(str2, 0));
3527   } else {
3528     load_unsigned_byte(result, Address(str1, 0));
3529     load_unsigned_short(cnt1, Address(str2, 0));
3530   }
3531   subl(result, cnt1);
3532   jcc(Assembler::notZero,  POP_LABEL);
3533 
3534   if (ae == StrIntrinsicNode::UU) {
3535     // Divide length by 2 to get number of chars
3536     shrl(cnt2, 1);
3537   }
3538   cmpl(cnt2, 1);
3539   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3540 
3541   // Check if the strings start at the same location and setup scale and stride
3542   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3543     cmpptr(str1, str2);
3544     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3545     if (ae == StrIntrinsicNode::LL) {
3546       scale = Address::times_1;
3547       stride = 16;
3548     } else {
3549       scale = Address::times_2;
3550       stride = 8;
3551     }
3552   } else {
3553     scale1 = Address::times_1;
3554     scale2 = Address::times_2;
3555     // scale not used
3556     stride = 8;
3557   }
3558 
3559   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3560     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3561     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3562     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3563     Label COMPARE_TAIL_LONG;
3564     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3565 
3566     int pcmpmask = 0x19;
3567     if (ae == StrIntrinsicNode::LL) {
3568       pcmpmask &= ~0x01;
3569     }
3570 
3571     // Setup to compare 16-chars (32-bytes) vectors,
3572     // start from first character again because it has aligned address.
3573     if (ae == StrIntrinsicNode::LL) {
3574       stride2 = 32;
3575     } else {
3576       stride2 = 16;
3577     }
3578     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3579       adr_stride = stride << scale;
3580     } else {
3581       adr_stride1 = 8;  //stride << scale1;
3582       adr_stride2 = 16; //stride << scale2;
3583     }
3584 
3585     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3586     // rax and rdx are used by pcmpestri as element counters
3587     movl(result, cnt2);
3588     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3589     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3590 
3591     // fast path : compare first 2 8-char vectors.
3592     bind(COMPARE_16_CHARS);
3593     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3594       movdqu(vec1, Address(str1, 0));
3595     } else {
3596       pmovzxbw(vec1, Address(str1, 0));
3597     }
3598     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3599     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3600 
3601     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3602       movdqu(vec1, Address(str1, adr_stride));
3603       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3604     } else {
3605       pmovzxbw(vec1, Address(str1, adr_stride1));
3606       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3607     }
3608     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3609     addl(cnt1, stride);
3610 
3611     // Compare the characters at index in cnt1
3612     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3613     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3614     subl(result, cnt2);
3615     jmp(POP_LABEL);
3616 
3617     // Setup the registers to start vector comparison loop
3618     bind(COMPARE_WIDE_VECTORS);
3619     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3620       lea(str1, Address(str1, result, scale));
3621       lea(str2, Address(str2, result, scale));
3622     } else {
3623       lea(str1, Address(str1, result, scale1));
3624       lea(str2, Address(str2, result, scale2));
3625     }
3626     subl(result, stride2);
3627     subl(cnt2, stride2);
3628     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3629     negptr(result);
3630 
3631     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3632     bind(COMPARE_WIDE_VECTORS_LOOP);
3633 
3634 #ifdef _LP64
3635     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3636       cmpl(cnt2, stride2x2);
3637       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3638       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3639       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3640 
3641       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3642       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3643         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3644         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3645       } else {
3646         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3647         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3648       }
3649       kortestql(mask, mask);
3650       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3651       addptr(result, stride2x2);  // update since we already compared at this addr
3652       subl(cnt2, stride2x2);      // and sub the size too
3653       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3654 
3655       vpxor(vec1, vec1);
3656       jmpb(COMPARE_WIDE_TAIL);
3657     }//if (VM_Version::supports_avx512vlbw())
3658 #endif // _LP64
3659 
3660 
3661     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3662     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3663       vmovdqu(vec1, Address(str1, result, scale));
3664       vpxor(vec1, Address(str2, result, scale));
3665     } else {
3666       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3667       vpxor(vec1, Address(str2, result, scale2));
3668     }
3669     vptest(vec1, vec1);
3670     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3671     addptr(result, stride2);
3672     subl(cnt2, stride2);
3673     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3674     // clean upper bits of YMM registers
3675     vpxor(vec1, vec1);
3676 
3677     // compare wide vectors tail
3678     bind(COMPARE_WIDE_TAIL);
3679     testptr(result, result);
3680     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3681 
3682     movl(result, stride2);
3683     movl(cnt2, result);
3684     negptr(result);
3685     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3686 
3687     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3688     bind(VECTOR_NOT_EQUAL);
3689     // clean upper bits of YMM registers
3690     vpxor(vec1, vec1);
3691     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3692       lea(str1, Address(str1, result, scale));
3693       lea(str2, Address(str2, result, scale));
3694     } else {
3695       lea(str1, Address(str1, result, scale1));
3696       lea(str2, Address(str2, result, scale2));
3697     }
3698     jmp(COMPARE_16_CHARS);
3699 
3700     // Compare tail chars, length between 1 and 15 chars
3701     bind(COMPARE_TAIL_LONG);
3702     movl(cnt2, result);
3703     cmpl(cnt2, stride);
3704     jcc(Assembler::less, COMPARE_SMALL_STR);
3705 
3706     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3707       movdqu(vec1, Address(str1, 0));
3708     } else {
3709       pmovzxbw(vec1, Address(str1, 0));
3710     }
3711     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3712     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3713     subptr(cnt2, stride);
3714     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3715     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716       lea(str1, Address(str1, result, scale));
3717       lea(str2, Address(str2, result, scale));
3718     } else {
3719       lea(str1, Address(str1, result, scale1));
3720       lea(str2, Address(str2, result, scale2));
3721     }
3722     negptr(cnt2);
3723     jmpb(WHILE_HEAD_LABEL);
3724 
3725     bind(COMPARE_SMALL_STR);
3726   } else if (UseSSE42Intrinsics) {
3727     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3728     int pcmpmask = 0x19;
3729     // Setup to compare 8-char (16-byte) vectors,
3730     // start from first character again because it has aligned address.
3731     movl(result, cnt2);
3732     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3733     if (ae == StrIntrinsicNode::LL) {
3734       pcmpmask &= ~0x01;
3735     }
3736     jcc(Assembler::zero, COMPARE_TAIL);
3737     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3738       lea(str1, Address(str1, result, scale));
3739       lea(str2, Address(str2, result, scale));
3740     } else {
3741       lea(str1, Address(str1, result, scale1));
3742       lea(str2, Address(str2, result, scale2));
3743     }
3744     negptr(result);
3745 
3746     // pcmpestri
3747     //   inputs:
3748     //     vec1 - substring
3749     //     rax - negative string length (elements count)
3750     //     mem - scanned string
3751     //     rdx - string length (elements count)
3752     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3753     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3754     //   outputs:
3755     //     rcx - first mismatched element index
3756     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3757 
3758     bind(COMPARE_WIDE_VECTORS);
3759     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3760       movdqu(vec1, Address(str1, result, scale));
3761       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3762     } else {
3763       pmovzxbw(vec1, Address(str1, result, scale1));
3764       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3765     }
3766     // After pcmpestri cnt1(rcx) contains mismatched element index
3767 
3768     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3769     addptr(result, stride);
3770     subptr(cnt2, stride);
3771     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3772 
3773     // compare wide vectors tail
3774     testptr(result, result);
3775     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3776 
3777     movl(cnt2, stride);
3778     movl(result, stride);
3779     negptr(result);
3780     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3781       movdqu(vec1, Address(str1, result, scale));
3782       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3783     } else {
3784       pmovzxbw(vec1, Address(str1, result, scale1));
3785       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3786     }
3787     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3788 
3789     // Mismatched characters in the vectors
3790     bind(VECTOR_NOT_EQUAL);
3791     addptr(cnt1, result);
3792     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3793     subl(result, cnt2);
3794     jmpb(POP_LABEL);
3795 
3796     bind(COMPARE_TAIL); // limit is zero
3797     movl(cnt2, result);
3798     // Fallthru to tail compare
3799   }
3800   // Shift str2 and str1 to the end of the arrays, negate min
3801   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3802     lea(str1, Address(str1, cnt2, scale));
3803     lea(str2, Address(str2, cnt2, scale));
3804   } else {
3805     lea(str1, Address(str1, cnt2, scale1));
3806     lea(str2, Address(str2, cnt2, scale2));
3807   }
3808   decrementl(cnt2);  // first character was compared already
3809   negptr(cnt2);
3810 
3811   // Compare the rest of the elements
3812   bind(WHILE_HEAD_LABEL);
3813   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3814   subl(result, cnt1);
3815   jccb(Assembler::notZero, POP_LABEL);
3816   increment(cnt2);
3817   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3818 
3819   // Strings are equal up to min length.  Return the length difference.
3820   bind(LENGTH_DIFF_LABEL);
3821   pop(result);
3822   if (ae == StrIntrinsicNode::UU) {
3823     // Divide diff by 2 to get number of chars
3824     sarl(result, 1);
3825   }
3826   jmpb(DONE_LABEL);
3827 
3828 #ifdef _LP64
3829   if (VM_Version::supports_avx512vlbw()) {
3830 
3831     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3832 
3833     kmovql(cnt1, mask);
3834     notq(cnt1);
3835     bsfq(cnt2, cnt1);
3836     if (ae != StrIntrinsicNode::LL) {
3837       // Divide diff by 2 to get number of chars
3838       sarl(cnt2, 1);
3839     }
3840     addq(result, cnt2);
3841     if (ae == StrIntrinsicNode::LL) {
3842       load_unsigned_byte(cnt1, Address(str2, result));
3843       load_unsigned_byte(result, Address(str1, result));
3844     } else if (ae == StrIntrinsicNode::UU) {
3845       load_unsigned_short(cnt1, Address(str2, result, scale));
3846       load_unsigned_short(result, Address(str1, result, scale));
3847     } else {
3848       load_unsigned_short(cnt1, Address(str2, result, scale2));
3849       load_unsigned_byte(result, Address(str1, result, scale1));
3850     }
3851     subl(result, cnt1);
3852     jmpb(POP_LABEL);
3853   }//if (VM_Version::supports_avx512vlbw())
3854 #endif // _LP64
3855 
3856   // Discard the stored length difference
3857   bind(POP_LABEL);
3858   pop(cnt1);
3859 
3860   // That's it
3861   bind(DONE_LABEL);
3862   if(ae == StrIntrinsicNode::UL) {
3863     negl(result);
3864   }
3865 
3866 }
3867 
3868 // Search for Non-ASCII character (Negative byte value) in a byte array,
3869 // return the index of the first such character, otherwise the length
3870 // of the array segment searched.
3871 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3872 //   @IntrinsicCandidate
3873 //   public static int countPositives(byte[] ba, int off, int len) {
3874 //     for (int i = off; i < off + len; i++) {
3875 //       if (ba[i] < 0) {
3876 //         return i - off;
3877 //       }
3878 //     }
3879 //     return len;
3880 //   }
3881 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3882   Register result, Register tmp1,
3883   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3884   // rsi: byte array
3885   // rcx: len
3886   // rax: result
3887   ShortBranchVerifier sbv(this);
3888   assert_different_registers(ary1, len, result, tmp1);
3889   assert_different_registers(vec1, vec2);
3890   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3891 
3892   movl(result, len); // copy
3893   // len == 0
3894   testl(len, len);
3895   jcc(Assembler::zero, DONE);
3896 
3897   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3898     VM_Version::supports_avx512vlbw() &&
3899     VM_Version::supports_bmi2()) {
3900 
3901     Label test_64_loop, test_tail, BREAK_LOOP;
3902     movl(tmp1, len);
3903     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3904 
3905     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3906     andl(len,  0xffffffc0); // vector count (in chars)
3907     jccb(Assembler::zero, test_tail);
3908 
3909     lea(ary1, Address(ary1, len, Address::times_1));
3910     negptr(len);
3911 
3912     bind(test_64_loop);
3913     // Check whether these 64 byte-sized elements contain any negatives
3914     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3915     kortestql(mask1, mask1);
3916     jcc(Assembler::notZero, BREAK_LOOP);
3917 
3918     addptr(len, 64);
3919     jccb(Assembler::notZero, test_64_loop);
3920 
3921     bind(test_tail);
3922     // bail out when there is nothing to be done
3923     testl(tmp1, -1);
3924     jcc(Assembler::zero, DONE);
3925 
3926 
3927     // check the tail for absence of negatives
3928     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3929 #ifdef _LP64
3930     {
3931       Register tmp3_aliased = len;
3932       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3933       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3934       notq(tmp3_aliased);
3935       kmovql(mask2, tmp3_aliased);
3936     }
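         // Example: for a 5-byte tail, tmp1 == 5, so the mask computed above is
         // ~(~0 << 5) == 0b11111 and mask2 selects exactly the 5 remaining bytes
         // in the masked compare below.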
3937 #else
3938     Label k_init;
3939     jmp(k_init);
3940 
3941     // We cannot read 64 bits from a general purpose register, so we move the
3942     // data required to compose 64 1's into the instruction stream.
3943     // We emit a 64-byte wide series of elements from 0..63 which later on will
3944     // be used as compare targets with the tail count contained in the tmp1 register.
3945     // The result will be a k register having tmp1 consecutive 1's,
3946     // counting from the least significant bit.
3947     address tmp = pc();
3948     emit_int64(0x0706050403020100);
3949     emit_int64(0x0F0E0D0C0B0A0908);
3950     emit_int64(0x1716151413121110);
3951     emit_int64(0x1F1E1D1C1B1A1918);
3952     emit_int64(0x2726252423222120);
3953     emit_int64(0x2F2E2D2C2B2A2928);
3954     emit_int64(0x3736353433323130);
3955     emit_int64(0x3F3E3D3C3B3A3938);
3956 
3957     bind(k_init);
3958     lea(len, InternalAddress(tmp));
3959     // create mask to test for negative byte inside a vector
3960     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3961     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3962 
3963 #endif
3964     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3965     ktestq(mask1, mask2);
3966     jcc(Assembler::zero, DONE);
3967 
3968     // do a full check for negative bytes in the tail
3969     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
3970                      // ary1 already pointing to the right place
3971     jmpb(TAIL_START);
3972 
3973     bind(BREAK_LOOP);
3974     // At least one byte in the last 64 byte block was negative.
3975     // Set up to look at the last 64 bytes as if they were a tail
3976     lea(ary1, Address(ary1, len, Address::times_1));
3977     addptr(result, len);
3978     // Ignore the very last byte: if all others are positive,
3979     // it must be negative, so we can skip right to the 2+1 byte
3980     // end comparison at this point
3981     orl(result, 63);
3982     movl(len, 63);
3983     // Fallthru to tail compare
3984   } else {
3985 
3986     if (UseAVX >= 2 && UseSSE >= 2) {
3987       // With AVX2, use 32-byte vector compare
3988       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3989 
3990       // Compare 32-byte vectors
3991       testl(len, 0xffffffe0);   // vector count (in bytes)
3992       jccb(Assembler::zero, TAIL_START);
3993 
3994       andl(len, 0xffffffe0);
3995       lea(ary1, Address(ary1, len, Address::times_1));
3996       negptr(len);
3997 
3998       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3999       movdl(vec2, tmp1);
4000       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4001 
4002       bind(COMPARE_WIDE_VECTORS);
4003       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4004       vptest(vec1, vec2);
4005       jccb(Assembler::notZero, BREAK_LOOP);
4006       addptr(len, 32);
4007       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4008 
4009       testl(result, 0x0000001f);   // any bytes remaining?
4010       jcc(Assembler::zero, DONE);
4011 
4012       // Quick test using the already prepared vector mask
4013       movl(len, result);
4014       andl(len, 0x0000001f);
4015       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4016       vptest(vec1, vec2);
4017       jcc(Assembler::zero, DONE);
4018       // There are zeros, jump to the tail to determine exactly where
4019       jmpb(TAIL_START);
4020 
4021       bind(BREAK_LOOP);
4022       // At least one byte in the last 32-byte vector is negative.
4023       // Set up to look at the last 32 bytes as if they were a tail
4024       lea(ary1, Address(ary1, len, Address::times_1));
4025       addptr(result, len);
4026       // Ignore the very last byte: if all others are positive,
4027       // it must be negative, so we can skip right to the 2+1 byte
4028       // end comparison at this point
4029       orl(result, 31);
4030       movl(len, 31);
4031       // Fallthru to tail compare
4032     } else if (UseSSE42Intrinsics) {
4033       // With SSE4.2, use double quad vector compare
4034       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4035 
4036       // Compare 16-byte vectors
4037       testl(len, 0xfffffff0);   // vector count (in bytes)
4038       jcc(Assembler::zero, TAIL_START);
4039 
4040       andl(len, 0xfffffff0);
4041       lea(ary1, Address(ary1, len, Address::times_1));
4042       negptr(len);
4043 
4044       movl(tmp1, 0x80808080);
4045       movdl(vec2, tmp1);
4046       pshufd(vec2, vec2, 0);
4047 
4048       bind(COMPARE_WIDE_VECTORS);
4049       movdqu(vec1, Address(ary1, len, Address::times_1));
4050       ptest(vec1, vec2);
4051       jccb(Assembler::notZero, BREAK_LOOP);
4052       addptr(len, 16);
4053       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4054 
4055       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4056       jcc(Assembler::zero, DONE);
4057 
4058       // Quick test using the already prepared vector mask
4059       movl(len, result);
4060       andl(len, 0x0000000f);   // tail count (in bytes)
4061       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4062       ptest(vec1, vec2);
4063       jcc(Assembler::zero, DONE);
4064       jmpb(TAIL_START);
4065 
4066       bind(BREAK_LOOP);
4067       // At least one byte in the last 16-byte vector is negative.
4068       // Set up and look at the last 16 bytes as if they were a tail
4069       lea(ary1, Address(ary1, len, Address::times_1));
4070       addptr(result, len);
4071       // Ignore the very last byte: if all others are positive,
4072       // it must be negative, so we can skip right to the 2+1 byte
4073       // end comparison at this point
4074       orl(result, 15);
4075       movl(len, 15);
4076       // Fallthru to tail compare
4077     }
4078   }
4079 
4080   bind(TAIL_START);
4081   // Compare 4-byte vectors
4082   andl(len, 0xfffffffc); // vector count (in bytes)
4083   jccb(Assembler::zero, COMPARE_CHAR);
4084 
4085   lea(ary1, Address(ary1, len, Address::times_1));
4086   negptr(len);
4087 
4088   bind(COMPARE_VECTORS);
4089   movl(tmp1, Address(ary1, len, Address::times_1));
4090   andl(tmp1, 0x80808080);
4091   jccb(Assembler::notZero, TAIL_ADJUST);
4092   addptr(len, 4);
4093   jccb(Assembler::notZero, COMPARE_VECTORS);
4094 
4095   // Compare trailing char (final 2-3 bytes), if any
4096   bind(COMPARE_CHAR);
4097 
4098   testl(result, 0x2);   // tail  char
4099   jccb(Assembler::zero, COMPARE_BYTE);
4100   load_unsigned_short(tmp1, Address(ary1, 0));
4101   andl(tmp1, 0x00008080);
4102   jccb(Assembler::notZero, CHAR_ADJUST);
4103   lea(ary1, Address(ary1, 2));
4104 
4105   bind(COMPARE_BYTE);
4106   testl(result, 0x1);   // tail  byte
4107   jccb(Assembler::zero, DONE);
4108   load_unsigned_byte(tmp1, Address(ary1, 0));
4109   testl(tmp1, 0x00000080);
4110   jccb(Assembler::zero, DONE);
4111   subptr(result, 1);
4112   jmpb(DONE);
4113 
4114   bind(TAIL_ADJUST);
4115   // There are negative bytes in the last 4-byte block.
4116   // Adjust result and check the next three bytes.
4117   addptr(result, len);
4118   orl(result, 3);
4119   lea(ary1, Address(ary1, len, Address::times_1));
4120   jmpb(COMPARE_CHAR);
4121 
4122   bind(CHAR_ADJUST);
4123   // We are looking at a char + optional byte tail, and found that one
4124   // of the bytes in the char is negative. Adjust the result, check the
4125   // first byte and readjust if needed.
4126   andl(result, 0xfffffffc);
4127   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4128   jccb(Assembler::notZero, DONE);
4129   addptr(result, 1);
4130 
4131   // That's it
4132   bind(DONE);
4133   if (UseAVX >= 2 && UseSSE >= 2) {
4134     // clean upper bits of YMM registers
4135     vpxor(vec1, vec1);
4136     vpxor(vec2, vec2);
4137   }
4138 }
4139 
4140 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4141 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4142                                       Register limit, Register result, Register chr,
4143                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4144                                       KRegister mask, bool expand_ary2) {
4145   // for expand_ary2, limit is the (smaller) size of the second array.
4146   ShortBranchVerifier sbv(this);
4147   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4148 
4149   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4150          "Expansion only implemented for AVX2");
4151 
4152   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4153   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4154 
4155   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4156   int scaleIncr = expand_ary2 ? 8 : 16;
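  // Illustrative scalar equivalent of the expand_ary2 mode (a sketch, assuming
  // ary1 holds 16-bit elements and ary2 holds bytes that are zero-extended):
  //   for (int i = 0; i < limit; i++) {
  //     if (ary1_as_chars[i] != (jchar)(ary2[i] & 0xff)) return false;
  //   }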
4157 
4158   if (is_array_equ) {
4159     // Check the input args
4160     cmpoop(ary1, ary2);
4161     jcc(Assembler::equal, TRUE_LABEL);
4162 
4163     // Need additional checks for arrays_equals.
4164     testptr(ary1, ary1);
4165     jcc(Assembler::zero, FALSE_LABEL);
4166     testptr(ary2, ary2);
4167     jcc(Assembler::zero, FALSE_LABEL);
4168 
4169     // Check the lengths
4170     movl(limit, Address(ary1, length_offset));
4171     cmpl(limit, Address(ary2, length_offset));
4172     jcc(Assembler::notEqual, FALSE_LABEL);
4173   }
4174 
4175   // count == 0
4176   testl(limit, limit);
4177   jcc(Assembler::zero, TRUE_LABEL);
4178 
4179   if (is_array_equ) {
4180     // Load array address
4181     lea(ary1, Address(ary1, base_offset));
4182     lea(ary2, Address(ary2, base_offset));
4183   }
4184 
4185   if (is_array_equ && is_char) {
4186     // arrays_equals when used for char[].
4187     shll(limit, 1);      // byte count != 0
4188   }
4189   movl(result, limit); // copy
4190 
4191   if (UseAVX >= 2) {
4192     // With AVX2, use 32-byte vector compare
4193     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4194 
4195     // Compare 32-byte vectors
4196     if (expand_ary2) {
4197       andl(result, 0x0000000f);  //   tail count (in bytes)
4198       andl(limit, 0xfffffff0);   // vector count (in bytes)
4199       jcc(Assembler::zero, COMPARE_TAIL);
4200     } else {
4201       andl(result, 0x0000001f);  //   tail count (in bytes)
4202       andl(limit, 0xffffffe0);   // vector count (in bytes)
4203       jcc(Assembler::zero, COMPARE_TAIL_16);
4204     }
4205 
4206     lea(ary1, Address(ary1, limit, scaleFactor));
4207     lea(ary2, Address(ary2, limit, Address::times_1));
4208     negptr(limit);
4209 
4210 #ifdef _LP64
4211     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4212       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4213 
4214       cmpl(limit, -64);
4215       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4216 
4217       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4218 
4219       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4220       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4221       kortestql(mask, mask);
4222       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4223       addptr(limit, 64);  // update since we already compared at this addr
4224       cmpl(limit, -64);
4225       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4226 
4227       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4229       //  cmpl(limit, 0);
4230       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4231       // But since we stopped at the points ary{1,2}+limit which are
4232       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4233       // (|limit| <= 32 and result < 32),
4234       // we may just compare the last 64 bytes.
4235       //
4236       addptr(result, -64);   // it is safe, bc we just came from this area
4237       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4238       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4239       kortestql(mask, mask);
4240       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4241 
4242       jmp(TRUE_LABEL);
4243 
4244       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4245 
4246     }//if (VM_Version::supports_avx512vlbw())
4247 #endif //_LP64
4248     bind(COMPARE_WIDE_VECTORS);
4249     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4250     if (expand_ary2) {
4251       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4252     } else {
4253       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4254     }
4255     vpxor(vec1, vec2);
4256 
4257     vptest(vec1, vec1);
4258     jcc(Assembler::notZero, FALSE_LABEL);
4259     addptr(limit, scaleIncr * 2);
4260     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4261 
4262     testl(result, result);
4263     jcc(Assembler::zero, TRUE_LABEL);
4264 
4265     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4266     if (expand_ary2) {
4267       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4268     } else {
4269       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4270     }
4271     vpxor(vec1, vec2);
4272 
4273     vptest(vec1, vec1);
4274     jcc(Assembler::notZero, FALSE_LABEL);
4275     jmp(TRUE_LABEL);
4276 
4277     bind(COMPARE_TAIL_16); // limit is zero
4278     movl(limit, result);
4279 
4280     // Compare 16-byte chunks
4281     andl(result, 0x0000000f);  //   tail count (in bytes)
4282     andl(limit, 0xfffffff0);   // vector count (in bytes)
4283     jcc(Assembler::zero, COMPARE_TAIL);
4284 
4285     lea(ary1, Address(ary1, limit, scaleFactor));
4286     lea(ary2, Address(ary2, limit, Address::times_1));
4287     negptr(limit);
4288 
4289     bind(COMPARE_WIDE_VECTORS_16);
4290     movdqu(vec1, Address(ary1, limit, scaleFactor));
4291     if (expand_ary2) {
4292       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4293     } else {
4294       movdqu(vec2, Address(ary2, limit, Address::times_1));
4295     }
4296     pxor(vec1, vec2);
4297 
4298     ptest(vec1, vec1);
4299     jcc(Assembler::notZero, FALSE_LABEL);
4300     addptr(limit, scaleIncr);
4301     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4302 
4303     bind(COMPARE_TAIL); // limit is zero
4304     movl(limit, result);
4305     // Fallthru to tail compare
4306   } else if (UseSSE42Intrinsics) {
4307     // With SSE4.2, use double quad vector compare
4308     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4309 
4310     // Compare 16-byte vectors
4311     andl(result, 0x0000000f);  //   tail count (in bytes)
4312     andl(limit, 0xfffffff0);   // vector count (in bytes)
4313     jcc(Assembler::zero, COMPARE_TAIL);
4314 
4315     lea(ary1, Address(ary1, limit, Address::times_1));
4316     lea(ary2, Address(ary2, limit, Address::times_1));
4317     negptr(limit);
4318 
4319     bind(COMPARE_WIDE_VECTORS);
4320     movdqu(vec1, Address(ary1, limit, Address::times_1));
4321     movdqu(vec2, Address(ary2, limit, Address::times_1));
4322     pxor(vec1, vec2);
4323 
4324     ptest(vec1, vec1);
4325     jcc(Assembler::notZero, FALSE_LABEL);
4326     addptr(limit, 16);
4327     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4328 
4329     testl(result, result);
4330     jcc(Assembler::zero, TRUE_LABEL);
4331 
4332     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4333     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4334     pxor(vec1, vec2);
4335 
4336     ptest(vec1, vec1);
4337     jccb(Assembler::notZero, FALSE_LABEL);
4338     jmpb(TRUE_LABEL);
4339 
4340     bind(COMPARE_TAIL); // limit is zero
4341     movl(limit, result);
4342     // Fallthru to tail compare
4343   }
4344 
4345   // Compare 4-byte vectors
4346   if (expand_ary2) {
4347     testl(result, result);
4348     jccb(Assembler::zero, TRUE_LABEL);
4349   } else {
4350     andl(limit, 0xfffffffc); // vector count (in bytes)
4351     jccb(Assembler::zero, COMPARE_CHAR);
4352   }
4353 
4354   lea(ary1, Address(ary1, limit, scaleFactor));
4355   lea(ary2, Address(ary2, limit, Address::times_1));
4356   negptr(limit);
4357 
4358   bind(COMPARE_VECTORS);
4359   if (expand_ary2) {
4360     // There are no "vector" operations for bytes to shorts
4361     movzbl(chr, Address(ary2, limit, Address::times_1));
4362     cmpw(Address(ary1, limit, Address::times_2), chr);
4363     jccb(Assembler::notEqual, FALSE_LABEL);
4364     addptr(limit, 1);
4365     jcc(Assembler::notZero, COMPARE_VECTORS);
4366     jmp(TRUE_LABEL);
4367   } else {
4368     movl(chr, Address(ary1, limit, Address::times_1));
4369     cmpl(chr, Address(ary2, limit, Address::times_1));
4370     jccb(Assembler::notEqual, FALSE_LABEL);
4371     addptr(limit, 4);
4372     jcc(Assembler::notZero, COMPARE_VECTORS);
4373   }
4374 
4375   // Compare trailing char (final 2 bytes), if any
4376   bind(COMPARE_CHAR);
4377   testl(result, 0x2);   // tail  char
4378   jccb(Assembler::zero, COMPARE_BYTE);
4379   load_unsigned_short(chr, Address(ary1, 0));
4380   load_unsigned_short(limit, Address(ary2, 0));
4381   cmpl(chr, limit);
4382   jccb(Assembler::notEqual, FALSE_LABEL);
4383 
4384   if (is_array_equ && is_char) {
4385     bind(COMPARE_BYTE);
4386   } else {
4387     lea(ary1, Address(ary1, 2));
4388     lea(ary2, Address(ary2, 2));
4389 
4390     bind(COMPARE_BYTE);
4391     testl(result, 0x1);   // tail  byte
4392     jccb(Assembler::zero, TRUE_LABEL);
4393     load_unsigned_byte(chr, Address(ary1, 0));
4394     load_unsigned_byte(limit, Address(ary2, 0));
4395     cmpl(chr, limit);
4396     jccb(Assembler::notEqual, FALSE_LABEL);
4397   }
4398   bind(TRUE_LABEL);
4399   movl(result, 1);   // return true
4400   jmpb(DONE);
4401 
4402   bind(FALSE_LABEL);
4403   xorl(result, result); // return false
4404 
4405   // That's it
4406   bind(DONE);
4407   if (UseAVX >= 2) {
4408     // clean upper bits of YMM registers
4409     vpxor(vec1, vec1);
4410     vpxor(vec2, vec2);
4411   }
4412 }
4413 
4414 #ifdef _LP64
4415 
4416 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4417 #define __ masm.
4418   Register dst = stub.data<0>();
4419   XMMRegister src = stub.data<1>();
4420   address target = stub.data<2>();
4421   __ bind(stub.entry());
4422   __ subptr(rsp, 8);
4423   __ movdbl(Address(rsp), src);
4424   __ call(RuntimeAddress(target));
4425   __ pop(dst);
4426   __ jmp(stub.continuation());
4427 #undef __
4428 }
4429 
4430 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4431   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4432   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4433 
4434   address slowpath_target;
4435   if (dst_bt == T_INT) {
4436     if (src_bt == T_FLOAT) {
4437       cvttss2sil(dst, src);
4438       cmpl(dst, 0x80000000);
4439       slowpath_target = StubRoutines::x86::f2i_fixup();
4440     } else {
4441       cvttsd2sil(dst, src);
4442       cmpl(dst, 0x80000000);
4443       slowpath_target = StubRoutines::x86::d2i_fixup();
4444     }
4445   } else {
4446     if (src_bt == T_FLOAT) {
4447       cvttss2siq(dst, src);
4448       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4449       slowpath_target = StubRoutines::x86::f2l_fixup();
4450     } else {
4451       cvttsd2siq(dst, src);
4452       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4453       slowpath_target = StubRoutines::x86::d2l_fixup();
4454     }
4455   }
4456 
4457   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4458   jcc(Assembler::equal, stub->entry());
4459   bind(stub->continuation());
4460 }
4461 
4462 #endif // _LP64
4463 
4464 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4465                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4466   switch(ideal_opc) {
4467     case Op_LShiftVS:
4468       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4469     case Op_LShiftVI:
4470       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4471     case Op_LShiftVL:
4472       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4473     case Op_RShiftVS:
4474       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4475     case Op_RShiftVI:
4476       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4477     case Op_RShiftVL:
4478       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4479     case Op_URShiftVS:
4480       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4481     case Op_URShiftVI:
4482       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4483     case Op_URShiftVL:
4484       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4485     case Op_RotateRightV:
4486       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4487     case Op_RotateLeftV:
4488       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4489     default:
4490       fatal("Unsupported masked operation"); break;
4491   }
4492 }
4493 
4494 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4495                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4496                                     bool is_varshift) {
4497   switch (ideal_opc) {
4498     case Op_AddVB:
4499       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4500     case Op_AddVS:
4501       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4502     case Op_AddVI:
4503       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4504     case Op_AddVL:
4505       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4506     case Op_AddVF:
4507       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4508     case Op_AddVD:
4509       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4510     case Op_SubVB:
4511       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4512     case Op_SubVS:
4513       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4514     case Op_SubVI:
4515       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4516     case Op_SubVL:
4517       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4518     case Op_SubVF:
4519       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4520     case Op_SubVD:
4521       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4522     case Op_MulVS:
4523       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4524     case Op_MulVI:
4525       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4526     case Op_MulVL:
4527       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4528     case Op_MulVF:
4529       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4530     case Op_MulVD:
4531       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4532     case Op_DivVF:
4533       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4534     case Op_DivVD:
4535       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4536     case Op_SqrtVF:
4537       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4538     case Op_SqrtVD:
4539       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4540     case Op_AbsVB:
4541       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4542     case Op_AbsVS:
4543       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4544     case Op_AbsVI:
4545       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4546     case Op_AbsVL:
4547       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4548     case Op_FmaVF:
4549       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4550     case Op_FmaVD:
4551       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4552     case Op_VectorRearrange:
4553       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4554     case Op_LShiftVS:
4555       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4556     case Op_LShiftVI:
4557       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4558     case Op_LShiftVL:
4559       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4560     case Op_RShiftVS:
4561       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4562     case Op_RShiftVI:
4563       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4564     case Op_RShiftVL:
4565       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4566     case Op_URShiftVS:
4567       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4568     case Op_URShiftVI:
4569       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4570     case Op_URShiftVL:
4571       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4572     case Op_RotateLeftV:
4573       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4574     case Op_RotateRightV:
4575       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4576     case Op_MaxV:
4577       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4578     case Op_MinV:
4579       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4580     case Op_XorV:
4581       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4582     case Op_OrV:
4583       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4584     case Op_AndV:
4585       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4586     default:
4587       fatal("Unsupported masked operation"); break;
4588   }
4589 }
4590 
4591 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4592                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4593   switch (ideal_opc) {
4594     case Op_AddVB:
4595       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4596     case Op_AddVS:
4597       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4598     case Op_AddVI:
4599       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4600     case Op_AddVL:
4601       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4602     case Op_AddVF:
4603       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4604     case Op_AddVD:
4605       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4606     case Op_SubVB:
4607       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4608     case Op_SubVS:
4609       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4610     case Op_SubVI:
4611       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4612     case Op_SubVL:
4613       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4614     case Op_SubVF:
4615       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4616     case Op_SubVD:
4617       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4618     case Op_MulVS:
4619       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4620     case Op_MulVI:
4621       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4622     case Op_MulVL:
4623       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4624     case Op_MulVF:
4625       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4626     case Op_MulVD:
4627       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4628     case Op_DivVF:
4629       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4630     case Op_DivVD:
4631       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4632     case Op_FmaVF:
4633       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4634     case Op_FmaVD:
4635       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4636     case Op_MaxV:
4637       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4638     case Op_MinV:
4639       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4640     case Op_XorV:
4641       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4642     case Op_OrV:
4643       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4644     case Op_AndV:
4645       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4646     default:
4647       fatal("Unsupported masked operation"); break;
4648   }
4649 }
4650 
4651 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4652                                   KRegister src1, KRegister src2) {
4653   BasicType etype = T_ILLEGAL;
4654   switch(mask_len) {
4655     case 2:
4656     case 4:
4657     case 8:  etype = T_BYTE; break;
4658     case 16: etype = T_SHORT; break;
4659     case 32: etype = T_INT; break;
4660     case 64: etype = T_LONG; break;
4661     default: fatal("Unsupported type"); break;
4662   }
4663   assert(etype != T_ILLEGAL, "");
4664   switch(ideal_opc) {
4665     case Op_AndVMask:
4666       kand(etype, dst, src1, src2); break;
4667     case Op_OrVMask:
4668       kor(etype, dst, src1, src2); break;
4669     case Op_XorVMask:
4670       kxor(etype, dst, src1, src2); break;
4671     default:
4672       fatal("Unsupported masked operation"); break;
4673   }
4674 }
4675 
4676 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4678  * If src is NaN, the result is 0.
4679  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4680  * the result is equal to the value of Integer.MIN_VALUE.
4681  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4682  * the result is equal to the value of Integer.MAX_VALUE.
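 *
 * As an illustrative reference point (standard Java float-to-int cast semantics):
 *   (int) Float.NaN               == 0
 *   (int) Float.NEGATIVE_INFINITY == Integer.MIN_VALUE
 *   (int) Float.POSITIVE_INFINITY == Integer.MAX_VALUE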
4683  */
4684 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4685                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4686                                                                    Register rscratch, AddressLiteral float_sign_flip,
4687                                                                    int vec_enc) {
4688   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4689   Label done;
4690   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4691   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4692   vptest(xtmp2, xtmp2, vec_enc);
4693   jccb(Assembler::equal, done);
4694 
4695   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4696   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4697 
4698   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4699   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4700   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4701 
  // Recompute the mask for the remaining special values.
4703   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4704   // Extract SRC values corresponding to TRUE mask lanes.
4705   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of the MASK lanes corresponding to +ve
  // special values is set.
4708   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4709 
4710   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4711   bind(done);
4712 }
4713 
4714 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4715                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4716                                                                     Register rscratch, AddressLiteral float_sign_flip,
4717                                                                     int vec_enc) {
4718   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4719   Label done;
4720   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4721   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4722   kortestwl(ktmp1, ktmp1);
4723   jccb(Assembler::equal, done);
4724 
4725   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4726   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4727   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4728 
4729   kxorwl(ktmp1, ktmp1, ktmp2);
4730   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4731   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4732   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4733   bind(done);
4734 }
4735 
4736 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4737                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4738                                                                      Register rscratch, AddressLiteral double_sign_flip,
4739                                                                      int vec_enc) {
4740   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4741 
4742   Label done;
4743   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4744   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4745   kortestwl(ktmp1, ktmp1);
4746   jccb(Assembler::equal, done);
4747 
4748   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4749   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4750   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4751 
4752   kxorwl(ktmp1, ktmp1, ktmp2);
4753   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4754   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4755   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4756   bind(done);
4757 }
4758 
4759 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4760                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4761                                                                      Register rscratch, AddressLiteral float_sign_flip,
4762                                                                      int vec_enc) {
4763   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4764   Label done;
4765   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4766   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4767   kortestwl(ktmp1, ktmp1);
4768   jccb(Assembler::equal, done);
4769 
4770   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4771   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4772   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4773 
4774   kxorwl(ktmp1, ktmp1, ktmp2);
4775   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4776   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4777   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4778   bind(done);
4779 }
4780 
4781 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4783  * If src is NaN, the result is 0.
4784  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4785  * the result is equal to the value of Long.MIN_VALUE.
4786  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4787  * the result is equal to the value of Long.MAX_VALUE.
4788  */
4789 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4790                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4791                                                                       Register rscratch, AddressLiteral double_sign_flip,
4792                                                                       int vec_enc) {
4793   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4794 
4795   Label done;
4796   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4797   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4798   kortestwl(ktmp1, ktmp1);
4799   jccb(Assembler::equal, done);
4800 
4801   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4802   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4803   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4804 
4805   kxorwl(ktmp1, ktmp1, ktmp2);
4806   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4807   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4808   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4809   bind(done);
4810 }
4811 
4812 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4813                                                              XMMRegister xtmp, int index, int vec_enc) {
4814    assert(vec_enc < Assembler::AVX_512bit, "");
4815    if (vec_enc == Assembler::AVX_256bit) {
4816      vextractf128_high(xtmp, src);
4817      vshufps(dst, src, xtmp, index, vec_enc);
4818    } else {
4819      vshufps(dst, src, zero, index, vec_enc);
4820    }
4821 }
4822 
4823 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4824                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4825                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4826   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4827 
4828   Label done;
4829   // Compare the destination lanes with float_sign_flip
4830   // value to get mask for all special values.
4831   movdqu(xtmp1, float_sign_flip, rscratch);
4832   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4833   ptest(xtmp2, xtmp2);
4834   jccb(Assembler::equal, done);
4835 
4836   // Flip float_sign_flip to get max integer value.
4837   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4838   pxor(xtmp1, xtmp4);
4839 
  // Set destination lanes corresponding to unordered source lanes to zero.
4841   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4842   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4843 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
4845   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4846   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4847 
  // Recompute the mask for the remaining special values.
4849   pxor(xtmp2, xtmp3);
4850   // Extract mask corresponding to non-negative source lanes.
4851   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4852 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
4854   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4855   pand(xtmp3, xtmp2);
4856 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
4859   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4860   bind(done);
4861 }
4862 
4863 
4864 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4865                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4866   switch(to_elem_bt) {
4867     case T_SHORT:
4868       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4869       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4870       vpackusdw(dst, dst, zero, vec_enc);
4871       if (vec_enc == Assembler::AVX_256bit) {
4872         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4873       }
4874       break;
4875     case  T_BYTE:
4876       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4877       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4878       vpackusdw(dst, dst, zero, vec_enc);
4879       if (vec_enc == Assembler::AVX_256bit) {
4880         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4881       }
4882       vpackuswb(dst, dst, zero, vec_enc);
4883       break;
4884     default: assert(false, "%s", type2name(to_elem_bt));
4885   }
4886 }
4887 
4888 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value
 *    0x80000000, which would signify that the source value could be one of the
 *    special floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4896  */
4897 
4898 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4899                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4900                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4901   int to_elem_sz = type2aelembytes(to_elem_bt);
4902   assert(to_elem_sz <= 4, "");
4903   vcvttps2dq(dst, src, vec_enc);
4904   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4905   if (to_elem_sz < 4) {
4906     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4907     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4908   }
4909 }
4910 
4911 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4912                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4913                                             Register rscratch, int vec_enc) {
4914   int to_elem_sz = type2aelembytes(to_elem_bt);
4915   assert(to_elem_sz <= 4, "");
4916   vcvttps2dq(dst, src, vec_enc);
4917   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4918   switch(to_elem_bt) {
4919     case T_INT:
4920       break;
4921     case T_SHORT:
4922       evpmovdw(dst, dst, vec_enc);
4923       break;
4924     case T_BYTE:
4925       evpmovdb(dst, dst, vec_enc);
4926       break;
4927     default: assert(false, "%s", type2name(to_elem_bt));
4928   }
4929 }
4930 
4931 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4932                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4933                                             Register rscratch, int vec_enc) {
4934   evcvttps2qq(dst, src, vec_enc);
4935   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4936 }
4937 
4938 // Handling for downcasting from double to integer or sub-word types on AVX2.
4939 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4940                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4941                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4942   int to_elem_sz = type2aelembytes(to_elem_bt);
4943   assert(to_elem_sz < 8, "");
4944   vcvttpd2dq(dst, src, vec_enc);
4945   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4946                                               float_sign_flip, vec_enc);
4947   if (to_elem_sz < 4) {
4948     // xtmp4 holds all zero lanes.
4949     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4950   }
4951 }
4952 
4953 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4954                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4955                                             KRegister ktmp2, AddressLiteral sign_flip,
4956                                             Register rscratch, int vec_enc) {
4957   if (VM_Version::supports_avx512dq()) {
4958     evcvttpd2qq(dst, src, vec_enc);
4959     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4960     switch(to_elem_bt) {
4961       case T_LONG:
4962         break;
4963       case T_INT:
4964         evpmovsqd(dst, dst, vec_enc);
4965         break;
4966       case T_SHORT:
4967         evpmovsqd(dst, dst, vec_enc);
4968         evpmovdw(dst, dst, vec_enc);
4969         break;
4970       case T_BYTE:
4971         evpmovsqd(dst, dst, vec_enc);
4972         evpmovdb(dst, dst, vec_enc);
4973         break;
4974       default: assert(false, "%s", type2name(to_elem_bt));
4975     }
4976   } else {
4977     assert(type2aelembytes(to_elem_bt) <= 4, "");
4978     vcvttpd2dq(dst, src, vec_enc);
4979     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4980     switch(to_elem_bt) {
4981       case T_INT:
4982         break;
4983       case T_SHORT:
4984         evpmovdw(dst, dst, vec_enc);
4985         break;
4986       case T_BYTE:
4987         evpmovdb(dst, dst, vec_enc);
4988         break;
4989       default: assert(false, "%s", type2name(to_elem_bt));
4990     }
4991   }
4992 }
4993 
4994 #ifdef _LP64
4995 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4996                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4997                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
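  // Under round-toward -inf this computes floor(val + 0.5), which matches the
  // Math.round contract, e.g. 2.5 -> 3.0 -> 3 and -2.5 -> -2.0 -> -2.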
5000   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5001 
5002   mov64(tmp, julong_cast(0.5L));
5003   evpbroadcastq(xtmp1, tmp, vec_enc);
5004   vaddpd(xtmp1, src , xtmp1, vec_enc);
5005   evcvtpd2qq(dst, xtmp1, vec_enc);
5006   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5008 
5009   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5010 }
5011 
5012 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5013                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5014                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5017   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5018 
5019   movl(tmp, jint_cast(0.5));
5020   movq(xtmp1, tmp);
5021   vbroadcastss(xtmp1, xtmp1, vec_enc);
5022   vaddps(xtmp1, src , xtmp1, vec_enc);
5023   vcvtps2dq(dst, xtmp1, vec_enc);
5024   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5025                                               float_sign_flip, vec_enc);
5026 
5027   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5028 }
5029 
5030 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5031                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5032                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5035   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5036 
5037   movl(tmp, jint_cast(0.5));
5038   movq(xtmp1, tmp);
5039   vbroadcastss(xtmp1, xtmp1, vec_enc);
5040   vaddps(xtmp1, src , xtmp1, vec_enc);
5041   vcvtps2dq(dst, xtmp1, vec_enc);
5042   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5043 
5044   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5045 }
5046 #endif // _LP64
5047 
5048 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5049                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5050   switch (from_elem_bt) {
5051     case T_BYTE:
5052       switch (to_elem_bt) {
5053         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5054         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5055         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5056         default: ShouldNotReachHere();
5057       }
5058       break;
5059     case T_SHORT:
5060       switch (to_elem_bt) {
5061         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5062         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5063         default: ShouldNotReachHere();
5064       }
5065       break;
5066     case T_INT:
5067       assert(to_elem_bt == T_LONG, "");
5068       vpmovzxdq(dst, src, vlen_enc);
5069       break;
5070     default:
5071       ShouldNotReachHere();
5072   }
5073 }
5074 
5075 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5076                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5077   switch (from_elem_bt) {
5078     case T_BYTE:
5079       switch (to_elem_bt) {
5080         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5081         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5082         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5083         default: ShouldNotReachHere();
5084       }
5085       break;
5086     case T_SHORT:
5087       switch (to_elem_bt) {
5088         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5089         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5090         default: ShouldNotReachHere();
5091       }
5092       break;
5093     case T_INT:
5094       assert(to_elem_bt == T_LONG, "");
5095       vpmovsxdq(dst, src, vlen_enc);
5096       break;
5097     default:
5098       ShouldNotReachHere();
5099   }
5100 }
5101 
5102 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5103                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5104   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5105   assert(vlen_enc != AVX_512bit, "");
5106 
5107   int dst_bt_size = type2aelembytes(dst_bt);
5108   int src_bt_size = type2aelembytes(src_bt);
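  // Mask lanes are all-ones (-1) or all-zero, so narrowing with signed
  // saturation preserves them exactly: packssdw/packsswb map -1 -> -1 and
  // 0 -> 0, e.g. a 0/-1 int mask packs down to a 0/-1 byte mask.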
5109   if (dst_bt_size > src_bt_size) {
5110     switch (dst_bt_size / src_bt_size) {
5111       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5112       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5113       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5114       default: ShouldNotReachHere();
5115     }
5116   } else {
5117     assert(dst_bt_size < src_bt_size, "");
5118     switch (src_bt_size / dst_bt_size) {
5119       case 2: {
5120         if (vlen_enc == AVX_128bit) {
5121           vpacksswb(dst, src, src, vlen_enc);
5122         } else {
5123           vpacksswb(dst, src, src, vlen_enc);
5124           vpermq(dst, dst, 0x08, vlen_enc);
5125         }
5126         break;
5127       }
5128       case 4: {
5129         if (vlen_enc == AVX_128bit) {
5130           vpackssdw(dst, src, src, vlen_enc);
5131           vpacksswb(dst, dst, dst, vlen_enc);
5132         } else {
5133           vpackssdw(dst, src, src, vlen_enc);
5134           vpermq(dst, dst, 0x08, vlen_enc);
5135           vpacksswb(dst, dst, dst, AVX_128bit);
5136         }
5137         break;
5138       }
5139       case 8: {
5140         if (vlen_enc == AVX_128bit) {
5141           vpshufd(dst, src, 0x08, vlen_enc);
5142           vpackssdw(dst, dst, dst, vlen_enc);
5143           vpacksswb(dst, dst, dst, vlen_enc);
5144         } else {
5145           vpshufd(dst, src, 0x08, vlen_enc);
5146           vpermq(dst, dst, 0x08, vlen_enc);
5147           vpackssdw(dst, dst, dst, AVX_128bit);
5148           vpacksswb(dst, dst, dst, AVX_128bit);
5149         }
5150         break;
5151       }
5152       default: ShouldNotReachHere();
5153     }
5154   }
5155 }
5156 
5157 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5158                                    bool merge, BasicType bt, int vlen_enc) {
5159   if (bt == T_INT) {
5160     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5161   } else {
5162     assert(bt == T_LONG, "");
5163     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5164   }
5165 }
5166 
5167 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5168                                    bool merge, BasicType bt, int vlen_enc) {
5169   if (bt == T_INT) {
5170     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5171   } else {
5172     assert(bt == T_LONG, "");
5173     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5174   }
5175 }
5176 
5177 #ifdef _LP64
5178 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5179                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5180                                                int vec_enc) {
5181   int index = 0;
5182   int vindex = 0;
5183   mov64(rtmp1, 0x0101010101010101L);
5184   pdepq(rtmp1, src, rtmp1);
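  // PDEP deposits bit i of 'src' into byte lane i of the 0x0101... pattern,
  // e.g. (illustrative) mask bits 0b1011 become the bytes 01 01 00 01 in
  // lanes 0..3, turning each mask bit into a 0x01/0x00 byte.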
5185   if (mask_len > 8) {
5186     movq(rtmp2, src);
5187     vpxor(xtmp, xtmp, xtmp, vec_enc);
5188     movq(xtmp, rtmp1);
5189   }
5190   movq(dst, rtmp1);
5191 
5192   mask_len -= 8;
5193   while (mask_len > 0) {
5194     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5195     index++;
5196     if ((index % 2) == 0) {
5197       pxor(xtmp, xtmp);
5198     }
5199     mov64(rtmp1, 0x0101010101010101L);
5200     shrq(rtmp2, 8);
5201     pdepq(rtmp1, rtmp2, rtmp1);
5202     pinsrq(xtmp, rtmp1, index % 2);
5203     vindex = index / 2;
5204     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes are updated, to save redundant instructions.
5207       if (index % 2) {
5208         vinsertf128(dst, dst, xtmp, vindex);
5209       }
5210     } else {
5211       vmovdqu(dst, xtmp);
5212     }
5213     mask_len -= 8;
5214   }
5215 }
5216 
5217 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5218   switch(opc) {
5219     case Op_VectorMaskTrueCount:
5220       popcntq(dst, tmp);
5221       break;
5222     case Op_VectorMaskLastTrue:
5223       if (VM_Version::supports_lzcnt()) {
5224         lzcntq(tmp, tmp);
5225         movl(dst, 63);
5226         subl(dst, tmp);
5227       } else {
5228         movl(dst, -1);
5229         bsrq(tmp, tmp);
5230         cmov32(Assembler::notZero, dst, tmp);
5231       }
5232       break;
5233     case Op_VectorMaskFirstTrue:
5234       if (VM_Version::supports_bmi1()) {
5235         if (masklen < 32) {
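          // Set a sentinel bit at position 'masklen' so an all-zero mask yields
          // 'masklen', e.g. masklen == 8 and tmp == 0: tzcnt(0x100) == 8.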
5236           orl(tmp, 1 << masklen);
5237           tzcntl(dst, tmp);
5238         } else if (masklen == 32) {
5239           tzcntl(dst, tmp);
5240         } else {
5241           assert(masklen == 64, "");
5242           tzcntq(dst, tmp);
5243         }
5244       } else {
5245         if (masklen < 32) {
5246           orl(tmp, 1 << masklen);
5247           bsfl(dst, tmp);
5248         } else {
5249           assert(masklen == 32 || masklen == 64, "");
5250           movl(dst, masklen);
5251           if (masklen == 32)  {
5252             bsfl(tmp, tmp);
5253           } else {
5254             bsfq(tmp, tmp);
5255           }
5256           cmov32(Assembler::notZero, dst, tmp);
5257         }
5258       }
5259       break;
5260     case Op_VectorMaskToLong:
5261       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5262       break;
5263     default: assert(false, "Unhandled mask operation");
5264   }
5265 }
5266 
5267 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5268                                               int masklen, int masksize, int vec_enc) {
5269   assert(VM_Version::supports_popcnt(), "");
5270 
5271   if(VM_Version::supports_avx512bw()) {
5272     kmovql(tmp, mask);
5273   } else {
5274     assert(masklen <= 16, "");
5275     kmovwl(tmp, mask);
5276   }
5277 
  // Masks generated by partial vector comparison/replicate/mask-manipulation
  // operations need to be clipped.
5280   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5281     andq(tmp, (1 << masklen) - 1);
5282   }
5283 
5284   vector_mask_operation_helper(opc, dst, tmp, masklen);
5285 }
5286 
5287 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5288                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5289   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5290          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5291   assert(VM_Version::supports_popcnt(), "");
5292 
5293   bool need_clip = false;
5294   switch(bt) {
5295     case T_BOOLEAN:
      // While masks of other types contain 0 and -1 lanes, boolean masks contain lane values of 0 and 1.
5297       vpxor(xtmp, xtmp, xtmp, vec_enc);
5298       vpsubb(xtmp, xtmp, mask, vec_enc);
5299       vpmovmskb(tmp, xtmp, vec_enc);
5300       need_clip = masklen < 16;
5301       break;
5302     case T_BYTE:
5303       vpmovmskb(tmp, mask, vec_enc);
5304       need_clip = masklen < 16;
5305       break;
5306     case T_SHORT:
5307       vpacksswb(xtmp, mask, mask, vec_enc);
5308       if (masklen >= 16) {
5309         vpermpd(xtmp, xtmp, 8, vec_enc);
5310       }
5311       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5312       need_clip = masklen < 16;
5313       break;
5314     case T_INT:
5315     case T_FLOAT:
5316       vmovmskps(tmp, mask, vec_enc);
5317       need_clip = masklen < 4;
5318       break;
5319     case T_LONG:
5320     case T_DOUBLE:
5321       vmovmskpd(tmp, mask, vec_enc);
5322       need_clip = masklen < 2;
5323       break;
5324     default: assert(false, "Unhandled type, %s", type2name(bt));
5325   }
5326 
  // Masks generated by partial vector comparison/replicate/mask-manipulation
  // operations need to be clipped.
5329   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5330     // need_clip implies masklen < 32
5331     andq(tmp, (1 << masklen) - 1);
5332   }
5333 
5334   vector_mask_operation_helper(opc, dst, tmp, masklen);
5335 }
5336 
5337 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5338                                              Register rtmp2, int mask_len) {
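  // PEXT of an all-ones source with 'src' (clipped to mask_len) as the selector
  // yields a dense prefix mask with popcount(src) low bits set, e.g.
  // (illustrative) src == 0b1010 -> dst == 0b0011.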
5339   kmov(rtmp1, src);
5340   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5341   mov64(rtmp2, -1L);
5342   pextq(rtmp2, rtmp2, rtmp1);
5343   kmov(dst, rtmp2);
5344 }
5345 
5346 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5347                                                     XMMRegister mask, Register rtmp, Register rscratch,
5348                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5349                                                     int vec_enc) {
5350   assert(type2aelembytes(bt) >= 4, "");
5351   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5352   address compress_perm_table = nullptr;
5353   address expand_perm_table = nullptr;
5354   if (type2aelembytes(bt) == 8) {
5355     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5356     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5357     vmovmskpd(rtmp, mask, vec_enc);
5358   } else {
5359     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5360     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5361     vmovmskps(rtmp, mask, vec_enc);
5362   }
5363   shlq(rtmp, 5); // for 32 byte permute row.
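  // Each 32-byte table row holds eight permute indices and is indexed by the
  // mask value itself; illustratively (layout assumption), for Op_CompressV a
  // mask of 0b0101 would select a row such as {0, 2, -1, -1, -1, -1, -1, -1}.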
5364   if (opcode == Op_CompressV) {
5365     lea(rscratch, ExternalAddress(compress_perm_table));
5366   } else {
5367     lea(rscratch, ExternalAddress(expand_perm_table));
5368   }
5369   addptr(rtmp, rscratch);
5370   vmovdqu(permv, Address(rtmp));
5371   vpermps(dst, permv, src, Assembler::AVX_256bit);
5372   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
5377   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5378 }
5379 
5380 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5381                                                bool merge, BasicType bt, int vec_enc) {
5382   if (opcode == Op_CompressV) {
5383     switch(bt) {
5384     case T_BYTE:
5385       evpcompressb(dst, mask, src, merge, vec_enc);
5386       break;
5387     case T_CHAR:
5388     case T_SHORT:
5389       evpcompressw(dst, mask, src, merge, vec_enc);
5390       break;
5391     case T_INT:
5392       evpcompressd(dst, mask, src, merge, vec_enc);
5393       break;
5394     case T_FLOAT:
5395       evcompressps(dst, mask, src, merge, vec_enc);
5396       break;
5397     case T_LONG:
5398       evpcompressq(dst, mask, src, merge, vec_enc);
5399       break;
5400     case T_DOUBLE:
5401       evcompresspd(dst, mask, src, merge, vec_enc);
5402       break;
5403     default:
5404       fatal("Unsupported type %s", type2name(bt));
5405       break;
5406     }
5407   } else {
5408     assert(opcode == Op_ExpandV, "");
5409     switch(bt) {
5410     case T_BYTE:
5411       evpexpandb(dst, mask, src, merge, vec_enc);
5412       break;
5413     case T_CHAR:
5414     case T_SHORT:
5415       evpexpandw(dst, mask, src, merge, vec_enc);
5416       break;
5417     case T_INT:
5418       evpexpandd(dst, mask, src, merge, vec_enc);
5419       break;
5420     case T_FLOAT:
5421       evexpandps(dst, mask, src, merge, vec_enc);
5422       break;
5423     case T_LONG:
5424       evpexpandq(dst, mask, src, merge, vec_enc);
5425       break;
5426     case T_DOUBLE:
5427       evexpandpd(dst, mask, src, merge, vec_enc);
5428       break;
5429     default:
5430       fatal("Unsupported type %s", type2name(bt));
5431       break;
5432     }
5433   }
5434 }
5435 #endif
5436 
5437 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5438                                            KRegister ktmp1, int vec_enc) {
5439   if (opcode == Op_SignumVD) {
5440     vsubpd(dst, zero, one, vec_enc);
5441     // if src < 0 ? -1 : 1
5442     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5443     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5444     // if src == NaN, -0.0 or 0.0 return src.
5445     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5446     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5447   } else {
5448     assert(opcode == Op_SignumVF, "");
5449     vsubps(dst, zero, one, vec_enc);
5450     // if src < 0 ? -1 : 1
5451     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5452     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5454     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5455     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5456   }
5457 }
5458 
5459 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5460                                           XMMRegister xtmp1, int vec_enc) {
5461   if (opcode == Op_SignumVD) {
5462     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5464     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5466     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5467     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5468   } else {
5469     assert(opcode == Op_SignumVF, "");
5470     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5472     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5474     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5475     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5476   }
5477 }
5478 
5479 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5480   if (VM_Version::supports_avx512bw()) {
5481     if (mask_len > 32) {
5482       kmovql(dst, src);
5483     } else {
5484       kmovdl(dst, src);
5485       if (mask_len != 32) {
5486         kshiftrdl(dst, dst, 32 - mask_len);
5487       }
5488     }
5489   } else {
5490     assert(mask_len <= 16, "");
5491     kmovwl(dst, src);
5492     if (mask_len != 16) {
5493       kshiftrwl(dst, dst, 16 - mask_len);
5494     }
5495   }
5496 }
5497 
5498 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5499   int lane_size = type2aelembytes(bt);
5500   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5501   if ((is_LP64 || lane_size < 8) &&
5502       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5503        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5504     movptr(rtmp, imm32);
5505     switch(lane_size) {
5506       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5507       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5508       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5509       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5512     }
5513   } else {
5514     movptr(rtmp, imm32);
5515     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5516     switch(lane_size) {
5517       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5518       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5519       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5520       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5523     }
5524   }
5525 }
5526 
5527 //
// The following is a lookup-table based popcount computation algorithm:
5529 //       Index   Bit set count
5530 //     [ 0000 ->   0,
5531 //       0001 ->   1,
5532 //       0010 ->   1,
5533 //       0011 ->   2,
5534 //       0100 ->   1,
5535 //       0101 ->   2,
5536 //       0110 ->   2,
5537 //       0111 ->   3,
5538 //       1000 ->   1,
5539 //       1001 ->   2,
5540 //       1010 ->   3,
5541 //       1011 ->   3,
5542 //       1100 ->   2,
5543 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5545 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5546 //     shuffle indices for lookup table access.
5547 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5549 //     shuffle indices for lookup table access.
5550 //  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences
//     of the bitset counts of all the bytes of a quadword.
5553 //  f. Perform step e. for upper 128bit vector lane.
5554 //  g. Pack the bitset count of quadwords back to double word.
5555 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
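//
// Illustrative scalar sketch of steps a-d for a single byte (not part of the
// generated code; the table name is hypothetical):
//   static const uint8_t NIBBLE_POPCNT_LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     // e.g. b = 0xB3: LUT[0x3] + LUT[0xB] = 2 + 3 = 5
//     return NIBBLE_POPCNT_LUT[b & 0x0F] + NIBBLE_POPCNT_LUT[b >> 4];
//   }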
5556 
5557 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5558                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5559   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5560   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5561   vpsrlw(dst, src, 4, vec_enc);
5562   vpand(dst, dst, xtmp1, vec_enc);
5563   vpand(xtmp1, src, xtmp1, vec_enc);
5564   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5565   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5566   vpshufb(dst, xtmp2, dst, vec_enc);
5567   vpaddb(dst, dst, xtmp1, vec_enc);
5568 }
5569 
5570 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5571                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5572   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5574   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5575   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5576   vpsadbw(dst, dst, xtmp2, vec_enc);
5577   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5578   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5579   vpackuswb(dst, xtmp1, dst, vec_enc);
5580 }
5581 
5582 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5583                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5584   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5585   // Add the popcount of upper and lower bytes of word.
5586   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5587   vpsrlw(dst, xtmp1, 8, vec_enc);
5588   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5589   vpaddw(dst, dst, xtmp1, vec_enc);
5590 }
5591 
5592 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5593                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5594   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5595   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5596   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5597 }
5598 
5599 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5600                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5601   switch(bt) {
5602     case T_LONG:
5603       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5604       break;
5605     case T_INT:
5606       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5607       break;
5608     case T_CHAR:
5609     case T_SHORT:
5610       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5611       break;
5612     case T_BYTE:
5613     case T_BOOLEAN:
5614       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5615       break;
5616     default:
5617       fatal("Unsupported type %s", type2name(bt));
5618       break;
5619   }
5620 }
5621 
5622 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5623                                                       KRegister mask, bool merge, int vec_enc) {
5624   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5625   switch(bt) {
5626     case T_LONG:
5627       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5628       evpopcntq(dst, mask, src, merge, vec_enc);
5629       break;
5630     case T_INT:
5631       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5632       evpopcntd(dst, mask, src, merge, vec_enc);
5633       break;
5634     case T_CHAR:
5635     case T_SHORT:
5636       assert(VM_Version::supports_avx512_bitalg(), "");
5637       evpopcntw(dst, mask, src, merge, vec_enc);
5638       break;
5639     case T_BYTE:
5640     case T_BOOLEAN:
5641       assert(VM_Version::supports_avx512_bitalg(), "");
5642       evpopcntb(dst, mask, src, merge, vec_enc);
5643       break;
5644     default:
5645       fatal("Unsupported type %s", type2name(bt));
5646       break;
5647   }
5648 }
5649 
5650 #ifndef _LP64
5651 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5652   assert(VM_Version::supports_avx512bw(), "");
5653   kmovdl(tmp, src);
5654   kunpckdql(dst, tmp, tmp);
5655 }
5656 #endif
5657 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value, so the reversed bit sequence of a byte is
// obtained by swapping the reversed bit sequences of its upper and lower nibbles.
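//
// Illustrative scalar sketch of the per-byte step (not part of the generated
// code; the table name is hypothetical):
//   static const uint8_t NIBBLE_REVERSE_LUT[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                                  0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//   uint8_t reverse_bits_byte(uint8_t b) {
//     // e.g. b = 0x01 -> 0x80
//     return (uint8_t)((NIBBLE_REVERSE_LUT[b & 0x0F] << 4) | NIBBLE_REVERSE_LUT[b >> 4]);
//   }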
5664 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5665                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5666   if (VM_Version::supports_avx512vlbw()) {
5667 
5668     // Get the reverse bit sequence of lower nibble of each byte.
5669     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5670     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5671     evpandq(dst, xtmp2, src, vec_enc);
5672     vpshufb(dst, xtmp1, dst, vec_enc);
5673     vpsllq(dst, dst, 4, vec_enc);
5674 
5675     // Get the reverse bit sequence of upper nibble of each byte.
5676     vpandn(xtmp2, xtmp2, src, vec_enc);
5677     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5678     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5679 
    // OR the left shifted reversed bit sequence of the lower nibble with the right shifted
    // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5682     evporq(xtmp2, dst, xtmp2, vec_enc);
5683     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5684 
  } else if (vec_enc == Assembler::AVX_512bit) {
5686     // Shift based bit reversal.
5687     assert(bt == T_LONG || bt == T_INT, "");
5688 
5689     // Swap lower and upper nibble of each byte.
5690     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5691 
5692     // Swap two least and most significant bits of each nibble.
5693     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5694 
5695     // Swap adjacent pair of bits.
5696     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5697     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5698 
5699     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5700     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5701   } else {
5702     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5703     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5704 
5705     // Get the reverse bit sequence of lower nibble of each byte.
5706     vpand(dst, xtmp2, src, vec_enc);
5707     vpshufb(dst, xtmp1, dst, vec_enc);
5708     vpsllq(dst, dst, 4, vec_enc);
5709 
5710     // Get the reverse bit sequence of upper nibble of each byte.
5711     vpandn(xtmp2, xtmp2, src, vec_enc);
5712     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5713     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5714 
    // OR the left shifted reversed bit sequence of the lower nibble with the right shifted
    // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5717     vpor(xtmp2, dst, xtmp2, vec_enc);
5718     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5719   }
5720 }
5721 
5722 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5723                                                 XMMRegister xtmp, Register rscratch) {
5724   assert(VM_Version::supports_gfni(), "");
5725   assert(rscratch != noreg || always_reachable(mask), "missing");
5726 
5727   // Galois field instruction based bit reversal based on following algorithm.
5728   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5729   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5730   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5731   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5732 }
5733 
5734 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5735                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5736   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5737   evpandq(dst, xtmp1, src, vec_enc);
5738   vpsllq(dst, dst, nbits, vec_enc);
5739   vpandn(xtmp1, xtmp1, src, vec_enc);
5740   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5741   evporq(dst, dst, xtmp1, vec_enc);
5742 }
5743 
5744 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5745                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5746   // Shift based bit reversal.
5747   assert(VM_Version::supports_evex(), "");
5748   switch(bt) {
5749     case T_LONG:
5750       // Swap upper and lower double word of each quad word.
5751       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5752       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5753       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5754       break;
5755     case T_INT:
5756       // Swap upper and lower word of each double word.
5757       evprord(xtmp1, k0, src, 16, true, vec_enc);
5758       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5759       break;
5760     case T_CHAR:
5761     case T_SHORT:
5762       // Swap upper and lower byte of each word.
5763       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5764       break;
5765     case T_BYTE:
5766       evmovdquq(dst, k0, src, true, vec_enc);
5767       break;
5768     default:
5769       fatal("Unsupported type %s", type2name(bt));
5770       break;
5771   }
5772 }
5773 
5774 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5775   if (bt == T_BYTE) {
5776     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5777       evmovdquq(dst, k0, src, true, vec_enc);
5778     } else {
5779       vmovdqu(dst, src);
5780     }
5781     return;
5782   }
5783   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5784   // pre-computed shuffle indices.
5785   switch(bt) {
5786     case T_LONG:
5787       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5788       break;
5789     case T_INT:
5790       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5791       break;
5792     case T_CHAR:
5793     case T_SHORT:
5794       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5795       break;
5796     default:
5797       fatal("Unsupported type %s", type2name(bt));
5798       break;
5799   }
5800   vpshufb(dst, src, dst, vec_enc);
5801 }
5802 
5803 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5804                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5805                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5806   assert(is_integral_type(bt), "");
5807   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5808   assert(VM_Version::supports_avx512cd(), "");
5809   switch(bt) {
5810     case T_LONG:
5811       evplzcntq(dst, ktmp, src, merge, vec_enc);
5812       break;
5813     case T_INT:
5814       evplzcntd(dst, ktmp, src, merge, vec_enc);
5815       break;
5816     case T_SHORT:
5817       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5818       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5819       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5820       vpunpckhwd(dst, xtmp1, src, vec_enc);
5821       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5822       vpackusdw(dst, xtmp2, dst, vec_enc);
5823       break;
5824     case T_BYTE:
5825       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5826       // accessing the lookup table.
5827       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5828       // accessing the lookup table.
5829       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
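      // Worked example (illustrative only): for the byte 0x0B the upper nibble is 0,
      // so T2 = 4 and T1 = lzcnt(0xB within a nibble) = 0, giving CLZ(0x0B) = 4 + 0 = 4.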
5830       assert(VM_Version::supports_avx512bw(), "");
5831       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5832       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5833       vpand(xtmp2, dst, src, vec_enc);
5834       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5835       vpsrlw(xtmp3, src, 4, vec_enc);
5836       vpand(xtmp3, dst, xtmp3, vec_enc);
5837       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5838       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5839       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5840       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5841       break;
5842     default:
5843       fatal("Unsupported type %s", type2name(bt));
5844       break;
5845   }
5846 }
5847 
5848 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5849                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5850   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5851   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5852   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5853   // accessing the lookup table.
5854   vpand(dst, xtmp2, src, vec_enc);
5855   vpshufb(dst, xtmp1, dst, vec_enc);
5856   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5857   // accessing the lookup table.
5858   vpsrlw(xtmp3, src, 4, vec_enc);
5859   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5860   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5861   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5862   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5863   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5864   vpaddb(dst, dst, xtmp2, vec_enc);
5865   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5866 }
5867 
5868 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5869                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5870   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5871   // Add zero counts of lower byte and upper byte of a word if
5872   // upper byte holds a zero value.
5873   vpsrlw(xtmp3, src, 8, vec_enc);
5874   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5875   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5876   vpsllw(xtmp2, dst, 8, vec_enc);
5877   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5878   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5879   vpsrlw(dst, dst, 8, vec_enc);
5880 }
5881 
5882 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5883                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 single precision format stores the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count:
  //   LZCNT = 32 - (biased_exp - 127 + 1)
  // where (biased_exp - 127 + 1) is the number of significant bits of the value.
  // Special handling is needed for zero, max_int and negative source values.
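  // Worked example (illustrative only): for a source lane of 8, (float)8 = 1.0 * 2^3
  // has a biased exponent of 130, so 130 - 127 + 1 = 4 significant bits and
  // LZCNT = 32 - 4 = 28, which matches Integer.numberOfLeadingZeros(8).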
5889 
5890   // Broadcast 0xFF
5891   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5892   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5893 
5894   // Extract biased exponent.
5895   vcvtdq2ps(dst, src, vec_enc);
5896   vpsrld(dst, dst, 23, vec_enc);
5897   vpand(dst, dst, xtmp1, vec_enc);
5898 
5899   // Broadcast 127.
5900   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5901   // Exponent = biased_exp - 127
5902   vpsubd(dst, dst, xtmp1, vec_enc);
5903 
  // Exponent = Exponent + 1
5905   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5906   vpaddd(dst, dst, xtmp3, vec_enc);
5907 
  // Replace a negative exponent with zero; the exponent is negative when the
  // src lane contains a zero value.
5910   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5911   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5912 
5913   // Rematerialize broadcast 32.
5914   vpslld(xtmp1, xtmp3, 5, vec_enc);
5915   // Exponent is 32 if corresponding source lane contains max_int value.
5916   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5917   // LZCNT = 32 - exponent
5918   vpsubd(dst, xtmp1, dst, vec_enc);
5919 
5920   // Replace LZCNT with a value 1 if corresponding source lane
5921   // contains max_int value.
5922   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5923 
  // Replace LZCNT with 0 if the source lane value is less than zero.
5925   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5926   vblendvps(dst, dst, xtmp2, src, vec_enc);
5927 }
5928 
5929 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5930                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5931   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5932   // Add zero counts of lower word and upper word of a double word if
5933   // upper word holds a zero value.
5934   vpsrld(xtmp3, src, 16, vec_enc);
5935   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5936   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5937   vpslld(xtmp2, dst, 16, vec_enc);
5938   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5939   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5940   vpsrld(dst, dst, 16, vec_enc);
5941   // Add zero counts of lower doubleword and upper doubleword of a
5942   // quadword if upper doubleword holds a zero value.
5943   vpsrlq(xtmp3, src, 32, vec_enc);
5944   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5945   vpsllq(xtmp2, dst, 32, vec_enc);
5946   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5947   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5948   vpsrlq(dst, dst, 32, vec_enc);
5949 }
5950 
5951 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5952                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5953                                                        Register rtmp, int vec_enc) {
5954   assert(is_integral_type(bt), "unexpected type");
5955   assert(vec_enc < Assembler::AVX_512bit, "");
5956   switch(bt) {
5957     case T_LONG:
5958       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5959       break;
5960     case T_INT:
5961       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5962       break;
5963     case T_SHORT:
5964       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5965       break;
5966     case T_BYTE:
5967       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5968       break;
5969     default:
5970       fatal("Unsupported type %s", type2name(bt));
5971       break;
5972   }
5973 }
5974 
5975 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5976   switch(bt) {
5977     case T_BYTE:
5978       vpsubb(dst, src1, src2, vec_enc);
5979       break;
5980     case T_SHORT:
5981       vpsubw(dst, src1, src2, vec_enc);
5982       break;
5983     case T_INT:
5984       vpsubd(dst, src1, src2, vec_enc);
5985       break;
5986     case T_LONG:
5987       vpsubq(dst, src1, src2, vec_enc);
5988       break;
5989     default:
5990       fatal("Unsupported type %s", type2name(bt));
5991       break;
5992   }
5993 }
5994 
// Trailing zero count is computed from the leading zero count as per the following
// equation. All AVX3 targets support the AVX512CD feature, which offers a direct
// vector instruction for computing the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
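// Worked example (illustrative only), for a 32 bit lane x = 24 (0b11000):
//   (x - 1) & ~x = 0b10111 & ~0b11000 = 0b00111, CLZ(0b00111) = 29,
//   so CTZ = 32 - 29 = 3, matching Integer.numberOfTrailingZeros(24).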
5999 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6000                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6001                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6002   assert(is_integral_type(bt), "");
6003   // xtmp = -1
6004   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6005   // xtmp = xtmp + src
6006   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6007   // xtmp = xtmp & ~src
6008   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6009   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6010   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6011   vpsub(bt, dst, xtmp4, dst, vec_enc);
6012 }
6013 
// Trailing zero count computation for AVX2 targets is based on the popcount
// operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
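// Worked example (illustrative only), for a 32 bit lane x = 24 (0b11000):
//   x | -x = 0xFFFFFFF8, POPC(0xFFFFFFF8) = 29, so CTZ = 32 - 29 = 3.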
6016 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6017                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6018   assert(is_integral_type(bt), "");
6019   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6021   // xtmp = 0 - src
6022   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6023   // xtmp = xtmp | src
6024   vpor(xtmp3, xtmp3, src, vec_enc);
6025   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6026   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6027   vpsub(bt, dst, xtmp1, dst, vec_enc);
6028 }
6029 
6030 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6031   Label done;
6032   Label neg_divisor_fastpath;
6033   cmpl(divisor, 0);
6034   jccb(Assembler::less, neg_divisor_fastpath);
6035   xorl(rdx, rdx);
6036   divl(divisor);
6037   jmpb(done);
6038   bind(neg_divisor_fastpath);
6039   // Fastpath for divisor < 0:
6040   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Integer.divideUnsigned()
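  // Worked example (illustrative only): dividend = 0xF0000000, divisor = 0xC0000000
  // (divisor < 0 when treated as signed). dividend - divisor = 0x30000000, its
  // complement has the sign bit set, and so does the AND with the dividend, hence
  // the logical shift yields quotient = 1, matching Integer.divideUnsigned.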
6042   movl(rdx, rax);
6043   subl(rdx, divisor);
6044   if (VM_Version::supports_bmi1()) {
6045     andnl(rax, rdx, rax);
6046   } else {
6047     notl(rdx);
6048     andl(rax, rdx);
6049   }
6050   shrl(rax, 31);
6051   bind(done);
6052 }
6053 
6054 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6055   Label done;
6056   Label neg_divisor_fastpath;
6057   cmpl(divisor, 0);
6058   jccb(Assembler::less, neg_divisor_fastpath);
6059   xorl(rdx, rdx);
6060   divl(divisor);
6061   jmpb(done);
6062   bind(neg_divisor_fastpath);
6063   // Fastpath when divisor < 0:
6064   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Integer.remainderUnsigned()
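  // Worked example (illustrative only): dividend = 0xF0000000, divisor = 0xC0000000.
  // The arithmetic shift of the masked difference yields all ones (the unsigned
  // quotient is 1), the AND selects the divisor, and the remainder becomes
  // 0xF0000000 - 0xC0000000 = 0x30000000, matching Integer.remainderUnsigned.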
6066   movl(rdx, rax);
6067   subl(rax, divisor);
6068   if (VM_Version::supports_bmi1()) {
6069     andnl(rax, rax, rdx);
6070   } else {
6071     notl(rax);
6072     andl(rax, rdx);
6073   }
6074   sarl(rax, 31);
6075   andl(rax, divisor);
6076   subl(rdx, rax);
6077   bind(done);
6078 }
6079 
6080 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6081   Label done;
6082   Label neg_divisor_fastpath;
6083 
6084   cmpl(divisor, 0);
6085   jccb(Assembler::less, neg_divisor_fastpath);
6086   xorl(rdx, rdx);
6087   divl(divisor);
6088   jmpb(done);
6089   bind(neg_divisor_fastpath);
6090   // Fastpath for divisor < 0:
6091   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6092   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Integer.divideUnsigned() and java.lang.Integer.remainderUnsigned()
6095   movl(rdx, rax);
6096   subl(rax, divisor);
6097   if (VM_Version::supports_bmi1()) {
6098     andnl(rax, rax, rdx);
6099   } else {
6100     notl(rax);
6101     andl(rax, rdx);
6102   }
6103   movl(tmp, rax);
6104   shrl(rax, 31); // quotient
6105   sarl(tmp, 31);
6106   andl(tmp, divisor);
6107   subl(rdx, tmp); // remainder
6108   bind(done);
6109 }
6110 
6111 #ifdef _LP64
6112 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6113                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6115     // Galois field instruction based bit reversal based on following algorithm.
6116     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
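    // The 64-bit constant below encodes the 8x8 bit matrix that reverses the bit
    // order within each byte; the bswapl at the end of the method then reverses
    // the byte order, yielding a full 32-bit bit reversal.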
6117     mov64(rtmp, 0x8040201008040201L);
6118     movq(xtmp1, src);
6119     movq(xtmp2, rtmp);
6120     gf2p8affineqb(xtmp1, xtmp2, 0);
6121     movq(dst, xtmp1);
6122   } else {
6123     // Swap even and odd numbered bits.
6124     movl(rtmp, src);
6125     andl(rtmp, 0x55555555);
6126     shll(rtmp, 1);
6127     movl(dst, src);
6128     andl(dst, 0xAAAAAAAA);
6129     shrl(dst, 1);
6130     orl(dst, rtmp);
6131 
6132     // Swap LSB and MSB 2 bits of each nibble.
6133     movl(rtmp, dst);
6134     andl(rtmp, 0x33333333);
6135     shll(rtmp, 2);
6136     andl(dst, 0xCCCCCCCC);
6137     shrl(dst, 2);
6138     orl(dst, rtmp);
6139 
6140     // Swap LSB and MSB 4 bits of each byte.
6141     movl(rtmp, dst);
6142     andl(rtmp, 0x0F0F0F0F);
6143     shll(rtmp, 4);
6144     andl(dst, 0xF0F0F0F0);
6145     shrl(dst, 4);
6146     orl(dst, rtmp);
6147   }
6148   bswapl(dst);
6149 }
6150 
6151 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6152                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6154     // Galois field instruction based bit reversal based on following algorithm.
6155     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6156     mov64(rtmp1, 0x8040201008040201L);
6157     movq(xtmp1, src);
6158     movq(xtmp2, rtmp1);
6159     gf2p8affineqb(xtmp1, xtmp2, 0);
6160     movq(dst, xtmp1);
6161   } else {
6162     // Swap even and odd numbered bits.
6163     movq(rtmp1, src);
6164     mov64(rtmp2, 0x5555555555555555L);
6165     andq(rtmp1, rtmp2);
6166     shlq(rtmp1, 1);
6167     movq(dst, src);
6168     notq(rtmp2);
6169     andq(dst, rtmp2);
6170     shrq(dst, 1);
6171     orq(dst, rtmp1);
6172 
6173     // Swap LSB and MSB 2 bits of each nibble.
6174     movq(rtmp1, dst);
6175     mov64(rtmp2, 0x3333333333333333L);
6176     andq(rtmp1, rtmp2);
6177     shlq(rtmp1, 2);
6178     notq(rtmp2);
6179     andq(dst, rtmp2);
6180     shrq(dst, 2);
6181     orq(dst, rtmp1);
6182 
6183     // Swap LSB and MSB 4 bits of each byte.
6184     movq(rtmp1, dst);
6185     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6186     andq(rtmp1, rtmp2);
6187     shlq(rtmp1, 4);
6188     notq(rtmp2);
6189     andq(dst, rtmp2);
6190     shrq(dst, 4);
6191     orq(dst, rtmp1);
6192   }
6193   bswapq(dst);
6194 }
6195 
6196 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6197   Label done;
6198   Label neg_divisor_fastpath;
6199   cmpq(divisor, 0);
6200   jccb(Assembler::less, neg_divisor_fastpath);
6201   xorl(rdx, rdx);
6202   divq(divisor);
6203   jmpb(done);
6204   bind(neg_divisor_fastpath);
6205   // Fastpath for divisor < 0:
6206   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6207   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6208   movq(rdx, rax);
6209   subq(rdx, divisor);
6210   if (VM_Version::supports_bmi1()) {
6211     andnq(rax, rdx, rax);
6212   } else {
6213     notq(rdx);
6214     andq(rax, rdx);
6215   }
6216   shrq(rax, 63);
6217   bind(done);
6218 }
6219 
6220 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6221   Label done;
6222   Label neg_divisor_fastpath;
6223   cmpq(divisor, 0);
6224   jccb(Assembler::less, neg_divisor_fastpath);
6225   xorq(rdx, rdx);
6226   divq(divisor);
6227   jmp(done);
6228   bind(neg_divisor_fastpath);
6229   // Fastpath when divisor < 0:
6230   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6231   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6232   movq(rdx, rax);
6233   subq(rax, divisor);
6234   if (VM_Version::supports_bmi1()) {
6235     andnq(rax, rax, rdx);
6236   } else {
6237     notq(rax);
6238     andq(rax, rdx);
6239   }
6240   sarq(rax, 63);
6241   andq(rax, divisor);
6242   subq(rdx, rax);
6243   bind(done);
6244 }
6245 
6246 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6247   Label done;
6248   Label neg_divisor_fastpath;
6249   cmpq(divisor, 0);
6250   jccb(Assembler::less, neg_divisor_fastpath);
6251   xorq(rdx, rdx);
6252   divq(divisor);
6253   jmp(done);
6254   bind(neg_divisor_fastpath);
6255   // Fastpath for divisor < 0:
6256   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6257   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6258   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6259   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6260   movq(rdx, rax);
6261   subq(rax, divisor);
6262   if (VM_Version::supports_bmi1()) {
6263     andnq(rax, rax, rdx);
6264   } else {
6265     notq(rax);
6266     andq(rax, rdx);
6267   }
6268   movq(tmp, rax);
6269   shrq(rax, 63); // quotient
6270   sarq(tmp, 63);
6271   andq(tmp, divisor);
6272   subq(rdx, tmp); // remainder
6273   bind(done);
6274 }
6275 #endif
6276 
6277 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6278                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6279                                         int vlen_enc) {
6280   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. Indices that differ by a multiple of 16
  // therefore refer to the same relative position within a 128 bit lane,
  // e.g. shuffle indices 16, 32 and 48 all select the first element of their
  // respective 128 bit lanes.
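  // For example (illustrative only), a shuffle index of 37 = 2 * 16 + 5 is
  // normalized to 5; its mask bit is set only during the step below that
  // broadcasts the third 128 bit lane (INDEX >= 32 && INDEX < 48), so the
  // shuffle picks byte 5 of that lane, i.e. byte 37 of src.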
6287   movl(rtmp, 16);
6288   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6289 
6290   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6291   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6292   // original shuffle indices and move the shuffled lanes corresponding to true
6293   // mask to destination vector.
6294   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6295   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6296   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6297 
6298   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6299   // and broadcasting second 128 bit lane.
6300   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6301   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6302   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6303   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6304   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6305 
6306   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6307   // and broadcasting third 128 bit lane.
6308   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6309   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6310   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6311   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6312   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6313 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6316   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6317   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6318   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6319   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6320   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6321 }
6322 
6323 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6324                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6325   if (vlen_enc == AVX_128bit) {
6326     vpermilps(dst, src, shuffle, vlen_enc);
6327   } else if (bt == T_INT) {
6328     vpermd(dst, shuffle, src, vlen_enc);
6329   } else {
6330     assert(bt == T_FLOAT, "");
6331     vpermps(dst, shuffle, src, vlen_enc);
6332   }
6333 }