1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone, however, should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 198 // to those specialized methods.  That'd give us a mostly platform-independent
 199 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
 228 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
 244 //    FailureLabel
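//
// In short, the current contract is:
//   fast_lock(obj, box, ...)   -> ZF == 1 : lock acquired on the fast path
//                                  ZF == 0 : control passes to the slow path
//   fast_unlock(obj, box, tmp) -> same ZF protocol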
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 281     jcc(Assembler::notZero, DONE_LABEL);
 282   }
 283 
 284   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 285   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 286   jcc(Assembler::notZero, IsInflated);
 287 
 288   if (LockingMode == LM_MONITOR) {
 289     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 290     testptr(objReg, objReg);
 291   } else {
 292     assert(LockingMode == LM_LEGACY, "must be");
 293     // Attempt stack-locking ...
 294     orptr (tmpReg, markWord::unlocked_value);
 295     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 296     lock();
 297     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 298     jcc(Assembler::equal, COUNT);           // Success
 299 
 300     // Recursive locking.
 301     // The object is stack-locked: markword contains stack pointer to BasicLock.
 302     // Locked by current thread if difference with current SP is less than one page.
 303     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 305     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 306     movptr(Address(boxReg, 0), tmpReg);
 307   }
 308   jmp(DONE_LABEL);
 309 
 310   bind(IsInflated);
 311   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 312 
 313 #ifndef _LP64
 314   // The object is inflated.
 315 
 316   // boxReg refers to the on-stack BasicLock in the current frame.
 317   // We'd like to write:
 318   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 320   // additional latency as we have another ST in the store buffer that must drain.
 321 
 322   // avoid ST-before-CAS
 323   // register juggle because we need tmpReg for cmpxchgptr below
 324   movptr(scrReg, boxReg);
 325   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 326 
 327   // Optimistic form: consider XORL tmpReg,tmpReg
 328   movptr(tmpReg, NULL_WORD);
 329 
 330   // Appears unlocked - try to swing _owner from null to non-null.
 331   // Ideally, I'd manifest "Self" with get_thread and then attempt
 332   // to CAS the register containing Self into m->Owner.
 333   // But we don't have enough registers, so instead we can either try to CAS
 334   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 335   // we later store "Self" into m->Owner.  Transiently storing a stack address
 336   // (rsp or the address of the box) into  m->owner is harmless.
 337   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 338   lock();
 339   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 340   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 341   // If we weren't able to swing _owner from null to the BasicLock
 342   // then take the slow path.
 343   jccb  (Assembler::notZero, NO_COUNT);
 344   // update _owner from BasicLock to thread
 345   get_thread (scrReg);                    // beware: clobbers ICCs
 346   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 347   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 348 
 349   // If the CAS fails we can either retry or pass control to the slow path.
 350   // We use the latter tactic.
 351   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 352   // If the CAS was successful ...
 353   //   Self has acquired the lock
 354   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 355   // Intentional fall-through into DONE_LABEL ...
 356 #else // _LP64
 357   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 358   movq(scrReg, tmpReg);
 359   xorq(tmpReg, tmpReg);
 360   lock();
 361   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 362   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 363   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 364   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 365   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 366   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 367 
 368   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 369   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 370   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 371   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 372 #endif // _LP64
 373   bind(DONE_LABEL);
 374 
 375   // ZFlag == 1 count in fast path
 376   // ZFlag == 0 count in slow path
 377   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 378 
 379   bind(COUNT);
 380   // Count monitors in fast path
 381   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 382 
 383   xorl(tmpReg, tmpReg); // Set ZF == 1
 384 
 385   bind(NO_COUNT);
 386 
 387   // At NO_COUNT the icc ZFlag is set as follows ...
 388   // fast_unlock uses the same protocol.
 389   // ZFlag == 1 -> Success
 390   // ZFlag == 0 -> Failure - force control through the slow path
 391 }
 392 
 393 // obj: object to unlock
 394 // box: box address (displaced header location), killed.  Must be EAX.
 395 // tmp: killed, cannot be obj nor box.
 396 //
 397 // Some commentary on balanced locking:
 398 //
 399 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 400 // Methods that don't have provably balanced locking are forced to run in the
 401 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 402 // The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
 409 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 411 // B() doesn't have provably balanced locking so it runs in the interpreter.
 412 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 413 // is still locked by A().
 414 //
 415 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 416 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 417 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 418 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 422 // A perfectly viable alternative is to elide the owner check except when
 423 // Xcheck:jni is enabled.
 424 
 425 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 426   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 427   assert(boxReg == rax, "");
 428   assert_different_registers(objReg, boxReg, tmpReg);
 429 
 430   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 431 
 432   if (LockingMode == LM_LEGACY) {
 433     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 434     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 435   }
 436   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 437   if (LockingMode != LM_MONITOR) {
 438     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 439     jcc(Assembler::zero, Stacked);
 440   }
 441 
 442   // It's inflated.
 443 
 444   // Despite our balanced locking property we still check that m->_owner == Self
 445   // as java routines or native JNI code called by this thread might
 446   // have released the lock.
 447   // Refer to the comments in synchronizer.cpp for how we might encode extra
 448   // state in _succ so we can avoid fetching EntryList|cxq.
 449   //
 450   // If there's no contention try a 1-0 exit.  That is, exit without
 451   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 452   // we detect and recover from the race that the 1-0 exit admits.
 453   //
 454   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 455   // before it STs null into _owner, releasing the lock.  Updates
 456   // to data protected by the critical section must be visible before
 457   // we drop the lock (and thus before any other thread could acquire
 458   // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
 460   // each other and there's no need for an explicit barrier (fence).
 461   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 462   Label LSuccess, LNotRecursive;
 463 
 464   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 465   jccb(Assembler::equal, LNotRecursive);
 466 
 467   // Recursive inflated unlock
 468   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 469   jmpb(LSuccess);
 470 
 471   bind(LNotRecursive);
 472 
 473   // Set owner to null.
 474   // Release to satisfy the JMM
 475   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 476   // We need a full fence after clearing owner to avoid stranding.
 477   // StoreLoad achieves this.
 478   membar(StoreLoad);
 479 
 480   // Check if the entry lists are empty (EntryList first - by convention).
 481   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 482   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 483   jccb(Assembler::zero, LSuccess);    // If so we are done.
 484 
 485   // Check if there is a successor.
 486   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 487   jccb(Assembler::notZero, LSuccess); // If so we are done.
 488 
 489   // Save the monitor pointer in the current thread, so we can try to
 490   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 491   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 492 #ifndef _LP64
 493   get_thread(boxReg);
 494   movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 495 #else // _LP64
 496   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 497 #endif
 498 
 499   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 500   jmpb  (DONE_LABEL);
 501 
 502   bind  (LSuccess);
 503   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 504   jmpb  (DONE_LABEL);
 505 
 506   if (LockingMode == LM_LEGACY) {
 507     bind  (Stacked);
 508     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 509     lock();
 510     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 511     // Intentional fall-thru into DONE_LABEL
 512   }
 513 
 514   bind(DONE_LABEL);
 515 
 516   // ZFlag == 1 count in fast path
 517   // ZFlag == 0 count in slow path
 518   jccb(Assembler::notZero, NO_COUNT);
 519 
 520   bind(COUNT);
 521   // Count monitors in fast path
 522 #ifndef _LP64
 523   get_thread(tmpReg);
 524   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 525 #else // _LP64
 526   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 527 #endif
 528 
 529   xorl(tmpReg, tmpReg); // Set ZF == 1
 530 
 531   bind(NO_COUNT);
 532 }
 533 
 534 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 535                                               Register t, Register thread) {
 536   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 537   assert(rax_reg == rax, "Used for CAS");
 538   assert_different_registers(obj, box, rax_reg, t, thread);
 539 
 540   // Handle inflated monitor.
 541   Label inflated;
 542   // Finish fast lock successfully. ZF value is irrelevant.
 543   Label locked;
 544   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 545   Label slow_path;
 546 
 547   if (UseObjectMonitorTable) {
 548     // Clear cache in case fast locking succeeds.
 549     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 550   }
 551 
 552   if (DiagnoseSyncOnValueBasedClasses != 0) {
 553     load_klass(rax_reg, obj, t);
 554     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 555     jcc(Assembler::notZero, slow_path);
 556   }
 557 
 558   const Register mark = t;
 559 
 560   { // Lightweight Lock
 561 
 562     Label push;
 563 
 564     const Register top = UseObjectMonitorTable ? rax_reg : box;
 565 
 566     // Load the mark.
 567     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 568 
 569     // Prefetch top.
 570     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 571 
 572     // Check for monitor (0b10).
 573     testptr(mark, markWord::monitor_value);
 574     jcc(Assembler::notZero, inflated);
 575 
 576     // Check if lock-stack is full.
 577     cmpl(top, LockStack::end_offset() - 1);
 578     jcc(Assembler::greater, slow_path);
 579 
 580     // Check if recursive.
 581     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 582     jccb(Assembler::equal, push);
 583 
 584     // Try to lock. Transition lock bits 0b01 => 0b00
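    // rax holds the expected (unlocked) mark word, mark holds the desired (locked) one for the CAS.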
 585     movptr(rax_reg, mark);
 586     orptr(rax_reg, markWord::unlocked_value);
 587     andptr(mark, ~(int32_t)markWord::unlocked_value);
 588     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 589     jcc(Assembler::notEqual, slow_path);
 590 
 591     if (UseObjectMonitorTable) {
 592       // Need to reload top, clobbered by CAS.
 593       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 594     }
 595     bind(push);
 596     // After successful lock, push object on lock-stack.
 597     movptr(Address(thread, top), obj);
 598     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 599     jmpb(locked);
 600   }
 601 
 602   { // Handle inflated monitor.
 603     bind(inflated);
 604 
 605     const Register monitor = t;
 606 
 607     if (!UseObjectMonitorTable) {
 608       assert(mark == monitor, "should be the same here");
 609     } else {
 610       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 611       // Fetch ObjectMonitor* from the cache or take the slow-path.
 612       Label monitor_found;
 613 
 614       // Load cache address
 615       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 616 
 617       const int num_unrolled = 2;
 618       for (int i = 0; i < num_unrolled; i++) {
 619         cmpptr(obj, Address(t));
 620         jccb(Assembler::equal, monitor_found);
 621         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 622       }
 623 
 624       Label loop;
 625 
 626       // Search for obj in cache.
 627       bind(loop);
 628 
 629       // Check for match.
 630       cmpptr(obj, Address(t));
 631       jccb(Assembler::equal, monitor_found);
 632 
 633       // Search until null encountered, guaranteed _null_sentinel at end.
 634       cmpptr(Address(t), 1);
 635       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 636       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 637       jmpb(loop);
 638 
 639       // Cache hit.
 640       bind(monitor_found);
 641       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 642     }
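    // Without the table, 'monitor' still carries the markWord::monitor_value tag from the
    // mark word; fold the tag into the displacement below instead of clearing it.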
 643     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 644     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 645     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 646 
 647     Label monitor_locked;
 648     // Lock the monitor.
 649 
 650     // CAS owner (null => current thread).
 651     xorptr(rax_reg, rax_reg);
 652     lock(); cmpxchgptr(thread, owner_address);
 653     jccb(Assembler::equal, monitor_locked);
 654 
 655     // Check if recursive.
 656     cmpptr(thread, rax_reg);
 657     jccb(Assembler::notEqual, slow_path);
 658 
 659     // Recursive.
 660     increment(recursions_address);
 661 
 662     bind(monitor_locked);
 663     if (UseObjectMonitorTable) {
 664       // Cache the monitor for unlock
 665       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 666     }
 667   }
 668 
 669   bind(locked);
 670   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 671   // Set ZF = 1
 672   xorl(rax_reg, rax_reg);
 673 
 674 #ifdef ASSERT
 675   // Check that locked label is reached with ZF set.
 676   Label zf_correct;
 677   Label zf_bad_zero;
 678   jcc(Assembler::zero, zf_correct);
 679   jmp(zf_bad_zero);
 680 #endif
 681 
 682   bind(slow_path);
 683 #ifdef ASSERT
 684   // Check that slow_path label is reached with ZF not set.
 685   jcc(Assembler::notZero, zf_correct);
 686   stop("Fast Lock ZF != 0");
 687   bind(zf_bad_zero);
 688   stop("Fast Lock ZF != 1");
 689   bind(zf_correct);
 690 #endif
 691   // C2 uses the value of ZF to determine the continuation.
 692 }
 693 
 694 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 695   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 696   assert(reg_rax == rax, "Used for CAS");
 697   assert_different_registers(obj, reg_rax, t);
 698 
 699   // Handle inflated monitor.
 700   Label inflated, inflated_check_lock_stack;
 701   // Finish fast unlock successfully.  MUST jump with ZF == 1
 702   Label unlocked, slow_path;
 703 
 704   const Register mark = t;
 705   const Register monitor = t;
 706   const Register top = UseObjectMonitorTable ? t : reg_rax;
 707   const Register box = reg_rax;
 708 
 709   Label dummy;
 710   C2FastUnlockLightweightStub* stub = nullptr;
 711 
 712   if (!Compile::current()->output()->in_scratch_emit_size()) {
 713     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 714     Compile::current()->output()->add_stub(stub);
 715   }
 716 
 717   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 718 
 719   { // Lightweight Unlock
 720 
 721     // Load top.
 722     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 723 
 724     if (!UseObjectMonitorTable) {
 725       // Prefetch mark.
 726       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 727     }
 728 
 729     // Check if obj is top of lock-stack.
 730     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 731     // Top of lock stack was not obj. Must be monitor.
 732     jcc(Assembler::notEqual, inflated_check_lock_stack);
 733 
 734     // Pop lock-stack.
 735     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 736     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 737 
 738     // Check if recursive.
 739     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 740     jcc(Assembler::equal, unlocked);
 741 
 742     // We elide the monitor check, let the CAS fail instead.
 743 
 744     if (UseObjectMonitorTable) {
 745       // Load mark.
 746       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 747     }
 748 
 749     // Try to unlock. Transition lock bits 0b00 => 0b01
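    // reg_rax holds the expected (locked) mark word, mark holds the desired (unlocked) one for the CAS.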
 750     movptr(reg_rax, mark);
 751     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 752     orptr(mark, markWord::unlocked_value);
 753     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 754     jcc(Assembler::notEqual, push_and_slow_path);
 755     jmp(unlocked);
 756   }
 757 
 758 
 759   { // Handle inflated monitor.
 760     bind(inflated_check_lock_stack);
 761 #ifdef ASSERT
 762     Label check_done;
 763     subl(top, oopSize);
 764     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 765     jcc(Assembler::below, check_done);
 766     cmpptr(obj, Address(thread, top));
 767     jccb(Assembler::notEqual, inflated_check_lock_stack);
 768     stop("Fast Unlock lock on stack");
 769     bind(check_done);
 770     if (UseObjectMonitorTable) {
 771       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 772     }
 773     testptr(mark, markWord::monitor_value);
 774     jccb(Assembler::notZero, inflated);
 775     stop("Fast Unlock not monitor");
 776 #endif
 777 
 778     bind(inflated);
 779 
 780     if (!UseObjectMonitorTable) {
 781       assert(mark == monitor, "should be the same here");
 782     } else {
 783       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 784       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 785       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 786       cmpptr(monitor, alignof(ObjectMonitor*));
 787       jcc(Assembler::below, slow_path);
 788     }
 789     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 790     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 791     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 792     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 793     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 794     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 795 
 796     Label recursive;
 797 
 798     // Check if recursive.
 799     cmpptr(recursions_address, 0);
 800     jccb(Assembler::notZero, recursive);
 801 
 802     // Set owner to null.
 803     // Release to satisfy the JMM
 804     movptr(owner_address, NULL_WORD);
 805     // We need a full fence after clearing owner to avoid stranding.
 806     // StoreLoad achieves this.
 807     membar(StoreLoad);
 808 
 809     // Check if the entry lists are empty (EntryList first - by convention).
 810     movptr(reg_rax, EntryList_address);
 811     orptr(reg_rax, cxq_address);
 812     jccb(Assembler::zero, unlocked);    // If so we are done.
 813 
 814     // Check if there is a successor.
 815     cmpptr(succ_address, NULL_WORD);
 816     jccb(Assembler::notZero, unlocked); // If so we are done.
 817 
 818     // Save the monitor pointer in the current thread, so we can try to
 819     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 820     if (!UseObjectMonitorTable) {
 821       andptr(monitor, ~(int32_t)markWord::monitor_value);
 822     }
 823     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 824 
 825     orl(t, 1); // Fast Unlock ZF = 0
 826     jmpb(slow_path);
 827 
 828     // Recursive unlock.
 829     bind(recursive);
 830     decrement(recursions_address);
 831   }
 832 
 833   bind(unlocked);
 834   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
 835   xorl(t, t); // Fast Unlock ZF = 1
 836 
 837 #ifdef ASSERT
 838   // Check that unlocked label is reached with ZF set.
 839   Label zf_correct;
 840   Label zf_bad_zero;
 841   jcc(Assembler::zero, zf_correct);
 842   jmp(zf_bad_zero);
 843 #endif
 844 
 845   bind(slow_path);
 846   if (stub != nullptr) {
 847     bind(stub->slow_path_continuation());
 848   }
 849 #ifdef ASSERT
  // Check that the slow_path label is reached with ZF not set.
 851   jcc(Assembler::notZero, zf_correct);
 852   stop("Fast Unlock ZF != 0");
 853   bind(zf_bad_zero);
 854   stop("Fast Unlock ZF != 1");
 855   bind(zf_correct);
 856 #endif
 857   // C2 uses the value of ZF to determine the continuation.
 858 }
 859 
 860 //-------------------------------------------------------------------------------------------
 861 // Generic instructions support for use in .ad files C2 code generation
 862 
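// Absolute value clears the IEEE sign bit with an AND mask; negation flips it with an XOR mask.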
 863 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 864   if (dst != src) {
 865     movdqu(dst, src);
 866   }
 867   if (opcode == Op_AbsVD) {
 868     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 869   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 871     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 872   }
 873 }
 874 
 875 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 876   if (opcode == Op_AbsVD) {
 877     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 878   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 880     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 881   }
 882 }
 883 
 884 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 885   if (dst != src) {
 886     movdqu(dst, src);
 887   }
 888   if (opcode == Op_AbsVF) {
 889     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 890   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 892     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 893   }
 894 }
 895 
 896 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 897   if (opcode == Op_AbsVF) {
 898     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 899   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 901     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 902   }
 903 }
 904 
 905 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 906   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 907   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 908 
 909   if (opcode == Op_MinV) {
 910     if (elem_bt == T_BYTE) {
 911       pminsb(dst, src);
 912     } else if (elem_bt == T_SHORT) {
 913       pminsw(dst, src);
 914     } else if (elem_bt == T_INT) {
 915       pminsd(dst, src);
 916     } else {
 917       assert(elem_bt == T_LONG, "required");
 918       assert(tmp == xmm0, "required");
 919       assert_different_registers(dst, src, tmp);
 920       movdqu(xmm0, dst);
 921       pcmpgtq(xmm0, src);
 922       blendvpd(dst, src);  // xmm0 as mask
 923     }
 924   } else { // opcode == Op_MaxV
 925     if (elem_bt == T_BYTE) {
 926       pmaxsb(dst, src);
 927     } else if (elem_bt == T_SHORT) {
 928       pmaxsw(dst, src);
 929     } else if (elem_bt == T_INT) {
 930       pmaxsd(dst, src);
 931     } else {
 932       assert(elem_bt == T_LONG, "required");
 933       assert(tmp == xmm0, "required");
 934       assert_different_registers(dst, src, tmp);
 935       movdqu(xmm0, src);
 936       pcmpgtq(xmm0, dst);
 937       blendvpd(dst, src);  // xmm0 as mask
 938     }
 939   }
 940 }
 941 
 942 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 943                                   XMMRegister src1, Address src2, int vlen_enc) {
 944   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 945   if (opcode == Op_UMinV) {
 946     switch(elem_bt) {
 947       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 948       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 949       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 950       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 951       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 952     }
 953   } else {
 954     assert(opcode == Op_UMaxV, "required");
 955     switch(elem_bt) {
 956       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 957       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 958       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 959       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 960       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 961     }
 962   }
 963 }
 964 
 965 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 966   // For optimality, leverage a full vector width of 512 bits
 967   // for operations over smaller vector sizes on AVX512 targets.
 968   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 969     if (opcode == Op_UMaxV) {
 970       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 971     } else {
 972       assert(opcode == Op_UMinV, "required");
 973       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 974     }
 975   } else {
 976     // T1 = -1
 977     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 978     // T1 = -1 << 63
 979     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 980     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 981     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 982     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 983     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 984     // Mask = T2 > T1
 985     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 986     if (opcode == Op_UMaxV) {
 987       // Res = Mask ? Src2 : Src1
 988       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 989     } else {
 990       // Res = Mask ? Src1 : Src2
 991       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 992     }
 993   }
 994 }
 995 
 996 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 997                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 998   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 999   if (opcode == Op_UMinV) {
1000     switch(elem_bt) {
1001       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1002       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1003       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1004       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1005       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1006     }
1007   } else {
1008     assert(opcode == Op_UMaxV, "required");
1009     switch(elem_bt) {
1010       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1011       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1012       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1013       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1014       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1015     }
1016   }
1017 }
1018 
1019 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1020                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1021                                  int vlen_enc) {
1022   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1023 
1024   if (opcode == Op_MinV) {
1025     if (elem_bt == T_BYTE) {
1026       vpminsb(dst, src1, src2, vlen_enc);
1027     } else if (elem_bt == T_SHORT) {
1028       vpminsw(dst, src1, src2, vlen_enc);
1029     } else if (elem_bt == T_INT) {
1030       vpminsd(dst, src1, src2, vlen_enc);
1031     } else {
1032       assert(elem_bt == T_LONG, "required");
1033       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1034         vpminsq(dst, src1, src2, vlen_enc);
1035       } else {
1036         assert_different_registers(dst, src1, src2);
1037         vpcmpgtq(dst, src1, src2, vlen_enc);
1038         vblendvpd(dst, src1, src2, dst, vlen_enc);
1039       }
1040     }
1041   } else { // opcode == Op_MaxV
1042     if (elem_bt == T_BYTE) {
1043       vpmaxsb(dst, src1, src2, vlen_enc);
1044     } else if (elem_bt == T_SHORT) {
1045       vpmaxsw(dst, src1, src2, vlen_enc);
1046     } else if (elem_bt == T_INT) {
1047       vpmaxsd(dst, src1, src2, vlen_enc);
1048     } else {
1049       assert(elem_bt == T_LONG, "required");
1050       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1051         vpmaxsq(dst, src1, src2, vlen_enc);
1052       } else {
1053         assert_different_registers(dst, src1, src2);
1054         vpcmpgtq(dst, src1, src2, vlen_enc);
1055         vblendvpd(dst, src2, src1, dst, vlen_enc);
1056       }
1057     }
1058   }
1059 }
1060 
1061 // Float/Double min max
1062 
1063 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1064                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1065                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1066                                    int vlen_enc) {
1067   assert(UseAVX > 0, "required");
1068   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1069          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1070   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1071   assert_different_registers(a, tmp, atmp, btmp);
1072   assert_different_registers(b, tmp, atmp, btmp);
1073 
1074   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1075   bool is_double_word = is_double_word_type(elem_bt);
1076 
1077   /* Note on 'non-obvious' assembly sequence:
1078    *
1079    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1080    * and Java on how they handle floats:
1081    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1083    *
1084    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1085    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1086    *                (only useful when signs differ, noop otherwise)
1087    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
1089    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1090    *   btmp = (b < +0.0) ? a : b
1091    *   atmp = (b < +0.0) ? b : a
1092    *   Tmp  = Max_Float(atmp , btmp)
1093    *   Res  = (atmp == NaN) ? atmp : Tmp
1094    */
1095 
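  // Select the packed-single vs. packed-double flavors of blend/min-max/compare up front
  // (and which input provides the sign mask), then emit one common sequence below.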
1096   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1097   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1098   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1099   XMMRegister mask;
1100 
1101   if (!is_double_word && is_min) {
1102     mask = a;
1103     vblend = &MacroAssembler::vblendvps;
1104     vmaxmin = &MacroAssembler::vminps;
1105     vcmp = &MacroAssembler::vcmpps;
1106   } else if (!is_double_word && !is_min) {
1107     mask = b;
1108     vblend = &MacroAssembler::vblendvps;
1109     vmaxmin = &MacroAssembler::vmaxps;
1110     vcmp = &MacroAssembler::vcmpps;
1111   } else if (is_double_word && is_min) {
1112     mask = a;
1113     vblend = &MacroAssembler::vblendvpd;
1114     vmaxmin = &MacroAssembler::vminpd;
1115     vcmp = &MacroAssembler::vcmppd;
1116   } else {
1117     assert(is_double_word && !is_min, "sanity");
1118     mask = b;
1119     vblend = &MacroAssembler::vblendvpd;
1120     vmaxmin = &MacroAssembler::vmaxpd;
1121     vcmp = &MacroAssembler::vcmppd;
1122   }
1123 
1124   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1125   XMMRegister maxmin, scratch;
1126   if (dst == btmp) {
1127     maxmin = btmp;
1128     scratch = tmp;
1129   } else {
1130     maxmin = tmp;
1131     scratch = btmp;
1132   }
1133 
1134   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1135   if (precompute_mask && !is_double_word) {
1136     vpsrad(tmp, mask, 32, vlen_enc);
1137     mask = tmp;
1138   } else if (precompute_mask && is_double_word) {
1139     vpxor(tmp, tmp, tmp, vlen_enc);
1140     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1141     mask = tmp;
1142   }
1143 
1144   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1145   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1146   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1147   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1148   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1149 }
1150 
1151 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1152                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1153                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1154                                     int vlen_enc) {
1155   assert(UseAVX > 2, "required");
1156   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1157          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1158   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1159   assert_different_registers(dst, a, atmp, btmp);
1160   assert_different_registers(dst, b, atmp, btmp);
1161 
1162   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1163   bool is_double_word = is_double_word_type(elem_bt);
1164   bool merge = true;
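  // Same algorithm as vminmax_fp above, but using AVX-512 mask registers: the sign bits
  // (evpmovd2m/evpmovq2m) provide the +/-0.0 bias mask and the unordered self-compare
  // drives the final NaN-propagating merge.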
1165 
1166   if (!is_double_word && is_min) {
1167     evpmovd2m(ktmp, a, vlen_enc);
1168     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1169     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1170     vminps(dst, atmp, btmp, vlen_enc);
1171     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1172     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1173   } else if (!is_double_word && !is_min) {
1174     evpmovd2m(ktmp, b, vlen_enc);
1175     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1176     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1177     vmaxps(dst, atmp, btmp, vlen_enc);
1178     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1179     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1180   } else if (is_double_word && is_min) {
1181     evpmovq2m(ktmp, a, vlen_enc);
1182     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1183     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1184     vminpd(dst, atmp, btmp, vlen_enc);
1185     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1186     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1187   } else {
1188     assert(is_double_word && !is_min, "sanity");
1189     evpmovq2m(ktmp, b, vlen_enc);
1190     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1191     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1192     vmaxpd(dst, atmp, btmp, vlen_enc);
1193     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1194     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1195   }
1196 }
1197 
1198 // Float/Double signum
1199 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1200   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1201 
1202   Label DONE_LABEL;
1203 
1204   if (opcode == Op_SignumF) {
1205     assert(UseSSE > 0, "required");
1206     ucomiss(dst, zero);
1207     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1208     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1209     movflt(dst, one);
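    // movflt does not modify EFLAGS, so the branch below still tests the ucomiss result:
    // 'above' (argument > +0.0) keeps +1.0, otherwise the sign flip yields -1.0.
    // The Op_SignumD path below follows the same pattern.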
1210     jcc(Assembler::above, DONE_LABEL);
1211     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1212   } else if (opcode == Op_SignumD) {
1213     assert(UseSSE > 1, "required");
1214     ucomisd(dst, zero);
1215     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1216     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1217     movdbl(dst, one);
1218     jcc(Assembler::above, DONE_LABEL);
1219     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1220   }
1221 
1222   bind(DONE_LABEL);
1223 }
1224 
1225 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1226   if (sign) {
1227     pmovsxbw(dst, src);
1228   } else {
1229     pmovzxbw(dst, src);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1234   if (sign) {
1235     vpmovsxbw(dst, src, vector_len);
1236   } else {
1237     vpmovzxbw(dst, src, vector_len);
1238   }
1239 }
1240 
1241 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1242   if (sign) {
1243     vpmovsxbd(dst, src, vector_len);
1244   } else {
1245     vpmovzxbd(dst, src, vector_len);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1250   if (sign) {
1251     vpmovsxwd(dst, src, vector_len);
1252   } else {
1253     vpmovzxwd(dst, src, vector_len);
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1258                                      int shift, int vector_len) {
1259   if (opcode == Op_RotateLeftV) {
1260     if (etype == T_INT) {
1261       evprold(dst, src, shift, vector_len);
1262     } else {
1263       assert(etype == T_LONG, "expected type T_LONG");
1264       evprolq(dst, src, shift, vector_len);
1265     }
1266   } else {
1267     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1268     if (etype == T_INT) {
1269       evprord(dst, src, shift, vector_len);
1270     } else {
1271       assert(etype == T_LONG, "expected type T_LONG");
1272       evprorq(dst, src, shift, vector_len);
1273     }
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1278                                      XMMRegister shift, int vector_len) {
1279   if (opcode == Op_RotateLeftV) {
1280     if (etype == T_INT) {
1281       evprolvd(dst, src, shift, vector_len);
1282     } else {
1283       assert(etype == T_LONG, "expected type T_LONG");
1284       evprolvq(dst, src, shift, vector_len);
1285     }
1286   } else {
1287     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1288     if (etype == T_INT) {
1289       evprorvd(dst, src, shift, vector_len);
1290     } else {
1291       assert(etype == T_LONG, "expected type T_LONG");
1292       evprorvq(dst, src, shift, vector_len);
1293     }
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1298   if (opcode == Op_RShiftVI) {
1299     psrad(dst, shift);
1300   } else if (opcode == Op_LShiftVI) {
1301     pslld(dst, shift);
1302   } else {
1303     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1304     psrld(dst, shift);
1305   }
1306 }
1307 
1308 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1309   switch (opcode) {
1310     case Op_RShiftVI:  psrad(dst, shift); break;
1311     case Op_LShiftVI:  pslld(dst, shift); break;
1312     case Op_URShiftVI: psrld(dst, shift); break;
1313 
1314     default: assert(false, "%s", NodeClassNames[opcode]);
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1319   if (opcode == Op_RShiftVI) {
1320     vpsrad(dst, nds, shift, vector_len);
1321   } else if (opcode == Op_LShiftVI) {
1322     vpslld(dst, nds, shift, vector_len);
1323   } else {
1324     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1325     vpsrld(dst, nds, shift, vector_len);
1326   }
1327 }
1328 
1329 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1330   switch (opcode) {
1331     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1332     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1333     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1334 
1335     default: assert(false, "%s", NodeClassNames[opcode]);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1340   switch (opcode) {
1341     case Op_RShiftVB:  // fall-through
1342     case Op_RShiftVS:  psraw(dst, shift); break;
1343 
1344     case Op_LShiftVB:  // fall-through
1345     case Op_LShiftVS:  psllw(dst, shift);   break;
1346 
1347     case Op_URShiftVS: // fall-through
1348     case Op_URShiftVB: psrlw(dst, shift);  break;
1349 
1350     default: assert(false, "%s", NodeClassNames[opcode]);
1351   }
1352 }
1353 
1354 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1355   switch (opcode) {
1356     case Op_RShiftVB:  // fall-through
1357     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1358 
1359     case Op_LShiftVB:  // fall-through
1360     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1361 
1362     case Op_URShiftVS: // fall-through
1363     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1364 
1365     default: assert(false, "%s", NodeClassNames[opcode]);
1366   }
1367 }
1368 
1369 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1370   switch (opcode) {
1371     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1372     case Op_LShiftVL:  psllq(dst, shift); break;
1373     case Op_URShiftVL: psrlq(dst, shift); break;
1374 
1375     default: assert(false, "%s", NodeClassNames[opcode]);
1376   }
1377 }
1378 
1379 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1380   if (opcode == Op_RShiftVL) {
1381     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1382   } else if (opcode == Op_LShiftVL) {
1383     psllq(dst, shift);
1384   } else {
1385     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1386     psrlq(dst, shift);
1387   }
1388 }
1389 
1390 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1391   switch (opcode) {
1392     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1393     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1394     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1395 
1396     default: assert(false, "%s", NodeClassNames[opcode]);
1397   }
1398 }
1399 
1400 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1401   if (opcode == Op_RShiftVL) {
1402     evpsraq(dst, nds, shift, vector_len);
1403   } else if (opcode == Op_LShiftVL) {
1404     vpsllq(dst, nds, shift, vector_len);
1405   } else {
1406     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1407     vpsrlq(dst, nds, shift, vector_len);
1408   }
1409 }
1410 
1411 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1412   switch (opcode) {
1413     case Op_RShiftVB:  // fall-through
1414     case Op_RShiftVS:  // fall-through
1415     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1416 
1417     case Op_LShiftVB:  // fall-through
1418     case Op_LShiftVS:  // fall-through
1419     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1420 
1421     case Op_URShiftVB: // fall-through
1422     case Op_URShiftVS: // fall-through
1423     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1424 
1425     default: assert(false, "%s", NodeClassNames[opcode]);
1426   }
1427 }
1428 
1429 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1430   switch (opcode) {
1431     case Op_RShiftVB:  // fall-through
1432     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1433 
1434     case Op_LShiftVB:  // fall-through
1435     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1436 
1437     case Op_URShiftVB: // fall-through
1438     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1439 
1440     default: assert(false, "%s", NodeClassNames[opcode]);
1441   }
1442 }
1443 
1444 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1445   assert(UseAVX >= 2, "required");
1446   switch (opcode) {
1447     case Op_RShiftVL: {
1448       if (UseAVX > 2) {
1449         assert(tmp == xnoreg, "not used");
1450         if (!VM_Version::supports_avx512vl()) {
1451           vlen_enc = Assembler::AVX_512bit;
1452         }
1453         evpsravq(dst, src, shift, vlen_enc);
1454       } else {
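             // AVX2 has no variable arithmetic right shift for 64-bit lanes, so
             // emulate it with logical shifts:
             //   sra(x, s) == ((x >>> s) ^ (m >>> s)) - (m >>> s)
             // where m (vector_long_sign_mask) has only the sign bit set per lane.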
1455         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1456         vpsrlvq(dst, src, shift, vlen_enc);
1457         vpsrlvq(tmp, tmp, shift, vlen_enc);
1458         vpxor(dst, dst, tmp, vlen_enc);
1459         vpsubq(dst, dst, tmp, vlen_enc);
1460       }
1461       break;
1462     }
1463     case Op_LShiftVL: {
1464       assert(tmp == xnoreg, "not used");
1465       vpsllvq(dst, src, shift, vlen_enc);
1466       break;
1467     }
1468     case Op_URShiftVL: {
1469       assert(tmp == xnoreg, "not used");
1470       vpsrlvq(dst, src, shift, vlen_enc);
1471       break;
1472     }
1473     default: assert(false, "%s", NodeClassNames[opcode]);
1474   }
1475 }
1476 
1477 // Variable shift of src by shift, using vtmp as a TEMP, giving a word result in dst
1478 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1479   assert(opcode == Op_LShiftVB ||
1480          opcode == Op_RShiftVB ||
1481          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1482   bool sign = (opcode != Op_URShiftVB);
1483   assert(vector_len == 0, "required");
1484   vextendbd(sign, dst, src, 1);
1485   vpmovzxbd(vtmp, shift, 1);
1486   varshiftd(opcode, dst, dst, vtmp, 1);
1487   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1488   vextracti128_high(vtmp, dst);
1489   vpackusdw(dst, dst, vtmp, 0);
1490 }
1491 
1492 // Variable shift of src by shift, using vtmp as a TEMP, giving a byte result in dst
1493 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1494   assert(opcode == Op_LShiftVB ||
1495          opcode == Op_RShiftVB ||
1496          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1497   bool sign = (opcode != Op_URShiftVB);
1498   int ext_vector_len = vector_len + 1;
1499   vextendbw(sign, dst, src, ext_vector_len);
1500   vpmovzxbw(vtmp, shift, ext_vector_len);
1501   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1502   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1503   if (vector_len == 0) {
1504     vextracti128_high(vtmp, dst);
1505     vpackuswb(dst, dst, vtmp, vector_len);
1506   } else {
1507     vextracti64x4_high(vtmp, dst);
1508     vpackuswb(dst, dst, vtmp, vector_len);
1509     vpermq(dst, dst, 0xD8, vector_len);
1510   }
1511 }
1512 
1513 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1514   switch(typ) {
1515     case T_BYTE:
1516       pinsrb(dst, val, idx);
1517       break;
1518     case T_SHORT:
1519       pinsrw(dst, val, idx);
1520       break;
1521     case T_INT:
1522       pinsrd(dst, val, idx);
1523       break;
1524     case T_LONG:
1525       pinsrq(dst, val, idx);
1526       break;
1527     default:
1528       assert(false,"Should not reach here.");
1529       break;
1530   }
1531 }
1532 
1533 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1534   switch(typ) {
1535     case T_BYTE:
1536       vpinsrb(dst, src, val, idx);
1537       break;
1538     case T_SHORT:
1539       vpinsrw(dst, src, val, idx);
1540       break;
1541     case T_INT:
1542       vpinsrd(dst, src, val, idx);
1543       break;
1544     case T_LONG:
1545       vpinsrq(dst, src, val, idx);
1546       break;
1547     default:
1548       assert(false,"Should not reach here.");
1549       break;
1550   }
1551 }
1552 
1553 #ifdef _LP64
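     // Scalar-gather one 64-bit slice (8 bytes or 4 shorts) from
     // base[offset + idx_base[i]] under the bit mask in 'mask': lanes whose
     // mask bit is clear are left as zero.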
1554 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1555                                                 XMMRegister dst, Register base,
1556                                                 Register idx_base,
1557                                                 Register offset, Register mask,
1558                                                 Register mask_idx, Register rtmp,
1559                                                 int vlen_enc) {
1560   vpxor(dst, dst, dst, vlen_enc);
1561   if (elem_bt == T_SHORT) {
1562     for (int i = 0; i < 4; i++) {
1563       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1564       Label skip_load;
1565       btq(mask, mask_idx);
1566       jccb(Assembler::carryClear, skip_load);
1567       movl(rtmp, Address(idx_base, i * 4));
1568       if (offset != noreg) {
1569         addl(rtmp, offset);
1570       }
1571       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1572       bind(skip_load);
1573       incq(mask_idx);
1574     }
1575   } else {
1576     assert(elem_bt == T_BYTE, "");
1577     for (int i = 0; i < 8; i++) {
1578       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1579       Label skip_load;
1580       btq(mask, mask_idx);
1581       jccb(Assembler::carryClear, skip_load);
1582       movl(rtmp, Address(idx_base, i * 4));
1583       if (offset != noreg) {
1584         addl(rtmp, offset);
1585       }
1586       pinsrb(dst, Address(base, rtmp), i);
1587       bind(skip_load);
1588       incq(mask_idx);
1589     }
1590   }
1591 }
1592 #endif // _LP64
1593 
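     // Unmasked variant: scalar-gather one 64-bit slice (8 bytes or 4 shorts)
     // from base[offset + idx_base[i]] into the low quadword of dst.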
1594 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1595                                          Register base, Register idx_base,
1596                                          Register offset, Register rtmp,
1597                                          int vlen_enc) {
1598   vpxor(dst, dst, dst, vlen_enc);
1599   if (elem_bt == T_SHORT) {
1600     for (int i = 0; i < 4; i++) {
1601       // dst[i] = src[offset + idx_base[i]]
1602       movl(rtmp, Address(idx_base, i * 4));
1603       if (offset != noreg) {
1604         addl(rtmp, offset);
1605       }
1606       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1607     }
1608   } else {
1609     assert(elem_bt == T_BYTE, "");
1610     for (int i = 0; i < 8; i++) {
1611       // dst[i] = src[offset + idx_base[i]]
1612       movl(rtmp, Address(idx_base, i * 4));
1613       if (offset != noreg) {
1614         addl(rtmp, offset);
1615       }
1616       pinsrb(dst, Address(base, rtmp), i);
1617     }
1618   }
1619 }
1620 
1621 /*
1622  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1623  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1624  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1625  * permutation to place the slice into the appropriate vector lane
1626  * locations in the destination vector. The following pseudo code describes the
1627  * algorithm in detail:
1628  *
1629  * DST_VEC = ZERO_VEC
1630  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1631  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1632  * FOREACH_ITER:
1633  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1634  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1635  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1636  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1637  *
1638  * With each iteration, the doubleword permute indices (0, 1) corresponding
1639  * to the gathered quadword are shifted right by two lane positions.
1640  *
1641  */
1642 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1643                                         Register base, Register idx_base,
1644                                         Register offset, Register mask,
1645                                         XMMRegister xtmp1, XMMRegister xtmp2,
1646                                         XMMRegister temp_dst, Register rtmp,
1647                                         Register mask_idx, Register length,
1648                                         int vector_len, int vlen_enc) {
1649   Label GATHER8_LOOP;
1650   assert(is_subword_type(elem_ty), "");
1651   movl(length, vector_len);
1652   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1653   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1654   vallones(xtmp2, vlen_enc);
1655   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1656   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1657   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1658 
1659   bind(GATHER8_LOOP);
1660     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1661     if (mask == noreg) {
1662       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1663     } else {
1664       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1665     }
1666     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1667     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1668     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1669     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1670     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1671     vpor(dst, dst, temp_dst, vlen_enc);
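         // Advance idx_base past the 32-bit indices consumed this iteration
         // (8 for byte elements, 4 for short elements) and decrement the
         // remaining element count by the same amount.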
1672     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1673     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1674     jcc(Assembler::notEqual, GATHER8_LOOP);
1675 }
1676 
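     // AVX2 gathers: load elements from base plus 32-bit indices scaled by the
     // element size, predicated by the vector 'mask'.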
1677 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1678   switch(typ) {
1679     case T_INT:
1680       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1681       break;
1682     case T_FLOAT:
1683       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1684       break;
1685     case T_LONG:
1686       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1687       break;
1688     case T_DOUBLE:
1689       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1690       break;
1691     default:
1692       assert(false,"Should not reach here.");
1693       break;
1694   }
1695 }
1696 
1697 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1698   switch(typ) {
1699     case T_INT:
1700       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1701       break;
1702     case T_FLOAT:
1703       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1704       break;
1705     case T_LONG:
1706       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1707       break;
1708     case T_DOUBLE:
1709       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1710       break;
1711     default:
1712       assert(false,"Should not reach here.");
1713       break;
1714   }
1715 }
1716 
1717 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1718   switch(typ) {
1719     case T_INT:
1720       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1721       break;
1722     case T_FLOAT:
1723       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1724       break;
1725     case T_LONG:
1726       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1727       break;
1728     case T_DOUBLE:
1729       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1730       break;
1731     default:
1732       assert(false,"Should not reach here.");
1733       break;
1734   }
1735 }
1736 
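     // Expand a vector of booleans (one byte per element) into a vector mask:
     // negate the bytes (0 -> 0, 1 -> -1) and sign-extend the result to the
     // requested element width.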
1737 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1738   if (vlen_in_bytes <= 16) {
1739     pxor (dst, dst);
1740     psubb(dst, src);
1741     switch (elem_bt) {
1742       case T_BYTE:   /* nothing to do */ break;
1743       case T_SHORT:  pmovsxbw(dst, dst); break;
1744       case T_INT:    pmovsxbd(dst, dst); break;
1745       case T_FLOAT:  pmovsxbd(dst, dst); break;
1746       case T_LONG:   pmovsxbq(dst, dst); break;
1747       case T_DOUBLE: pmovsxbq(dst, dst); break;
1748 
1749       default: assert(false, "%s", type2name(elem_bt));
1750     }
1751   } else {
1752     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1753     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1754 
1755     vpxor (dst, dst, dst, vlen_enc);
1756     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1757 
1758     switch (elem_bt) {
1759       case T_BYTE:   /* nothing to do */            break;
1760       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1761       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1762       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1763       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1764       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1765 
1766       default: assert(false, "%s", type2name(elem_bt));
1767     }
1768   }
1769 }
1770 
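     // Same conversion, but producing an opmask register. Without AVX512BW/VL
     // (novlbwdq) the booleans are widened to dwords and compared against the
     // expected mask bit pattern; otherwise they are negated and converted with
     // evpmovb2m.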
1771 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1772   if (novlbwdq) {
1773     vpmovsxbd(xtmp, src, vlen_enc);
1774     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1775             Assembler::eq, true, vlen_enc, noreg);
1776   } else {
1777     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1778     vpsubb(xtmp, xtmp, src, vlen_enc);
1779     evpmovb2m(dst, xtmp, vlen_enc);
1780   }
1781 }
1782 
1783 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1784   switch (vlen_in_bytes) {
1785     case 4:  movdl(dst, src);   break;
1786     case 8:  movq(dst, src);    break;
1787     case 16: movdqu(dst, src);  break;
1788     case 32: vmovdqu(dst, src); break;
1789     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1790     default: ShouldNotReachHere();
1791   }
1792 }
1793 
1794 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1795   assert(rscratch != noreg || always_reachable(src), "missing");
1796 
1797   if (reachable(src)) {
1798     load_vector(dst, as_Address(src), vlen_in_bytes);
1799   } else {
1800     lea(rscratch, src);
1801     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1802   }
1803 }
1804 
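     // Replicate a constant loaded from 'src' across all lanes of dst, picking
     // the widest broadcast form the CPU supports; pre-AVX it falls back to
     // movddup (SSE3) or movq + punpcklqdq for 128-bit vectors.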
1805 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1806   int vlen_enc = vector_length_encoding(vlen);
1807   if (VM_Version::supports_avx()) {
1808     if (bt == T_LONG) {
1809       if (VM_Version::supports_avx2()) {
1810         vpbroadcastq(dst, src, vlen_enc);
1811       } else {
1812         vmovddup(dst, src, vlen_enc);
1813       }
1814     } else if (bt == T_DOUBLE) {
1815       if (vlen_enc != Assembler::AVX_128bit) {
1816         vbroadcastsd(dst, src, vlen_enc, noreg);
1817       } else {
1818         vmovddup(dst, src, vlen_enc);
1819       }
1820     } else {
1821       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1822         vpbroadcastd(dst, src, vlen_enc);
1823       } else {
1824         vbroadcastss(dst, src, vlen_enc);
1825       }
1826     }
1827   } else if (VM_Version::supports_sse3()) {
1828     movddup(dst, src);
1829   } else {
1830     movq(dst, src);
1831     if (vlen == 16) {
1832       punpcklqdq(dst, dst);
1833     }
1834   }
1835 }
1836 
1837 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1838   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1839   int offset = exact_log2(type2aelembytes(bt)) << 6;
1840   if (is_floating_point_type(bt)) {
1841     offset += 128;
1842   }
1843   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1844   load_vector(dst, addr, vlen_in_bytes);
1845 }
1846 
1847 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1848 
1849 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1850   int vector_len = Assembler::AVX_128bit;
1851 
1852   switch (opcode) {
1853     case Op_AndReductionV:  pand(dst, src); break;
1854     case Op_OrReductionV:   por (dst, src); break;
1855     case Op_XorReductionV:  pxor(dst, src); break;
1856     case Op_MinReductionV:
1857       switch (typ) {
1858         case T_BYTE:        pminsb(dst, src); break;
1859         case T_SHORT:       pminsw(dst, src); break;
1860         case T_INT:         pminsd(dst, src); break;
1861         case T_LONG:        assert(UseAVX > 2, "required");
1862                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1863         default:            assert(false, "wrong type");
1864       }
1865       break;
1866     case Op_MaxReductionV:
1867       switch (typ) {
1868         case T_BYTE:        pmaxsb(dst, src); break;
1869         case T_SHORT:       pmaxsw(dst, src); break;
1870         case T_INT:         pmaxsd(dst, src); break;
1871         case T_LONG:        assert(UseAVX > 2, "required");
1872                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1873         default:            assert(false, "wrong type");
1874       }
1875       break;
1876     case Op_AddReductionVF: addss(dst, src); break;
1877     case Op_AddReductionVD: addsd(dst, src); break;
1878     case Op_AddReductionVI:
1879       switch (typ) {
1880         case T_BYTE:        paddb(dst, src); break;
1881         case T_SHORT:       paddw(dst, src); break;
1882         case T_INT:         paddd(dst, src); break;
1883         default:            assert(false, "wrong type");
1884       }
1885       break;
1886     case Op_AddReductionVL: paddq(dst, src); break;
1887     case Op_MulReductionVF: mulss(dst, src); break;
1888     case Op_MulReductionVD: mulsd(dst, src); break;
1889     case Op_MulReductionVI:
1890       switch (typ) {
1891         case T_SHORT:       pmullw(dst, src); break;
1892         case T_INT:         pmulld(dst, src); break;
1893         default:            assert(false, "wrong type");
1894       }
1895       break;
1896     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1897                             evpmullq(dst, dst, src, vector_len); break;
1898     default:                assert(false, "wrong opcode");
1899   }
1900 }
1901 
1902 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1903   switch (opcode) {
1904     case Op_AddReductionVF: addps(dst, src); break;
1905     case Op_AddReductionVD: addpd(dst, src); break;
1906     case Op_MulReductionVF: mulps(dst, src); break;
1907     case Op_MulReductionVD: mulpd(dst, src); break;
1908     default:                assert(false, "%s", NodeClassNames[opcode]);
1909   }
1910 }
1911 
1912 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1913   int vector_len = Assembler::AVX_256bit;
1914 
1915   switch (opcode) {
1916     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1917     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1918     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1919     case Op_MinReductionV:
1920       switch (typ) {
1921         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1922         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1923         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1924         case T_LONG:        assert(UseAVX > 2, "required");
1925                             vpminsq(dst, src1, src2, vector_len); break;
1926         default:            assert(false, "wrong type");
1927       }
1928       break;
1929     case Op_MaxReductionV:
1930       switch (typ) {
1931         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1932         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1933         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1934         case T_LONG:        assert(UseAVX > 2, "required");
1935                             vpmaxsq(dst, src1, src2, vector_len); break;
1936         default:            assert(false, "wrong type");
1937       }
1938       break;
1939     case Op_AddReductionVI:
1940       switch (typ) {
1941         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1942         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1943         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1944         default:            assert(false, "wrong type");
1945       }
1946       break;
1947     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1948     case Op_MulReductionVI:
1949       switch (typ) {
1950         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1951         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1952         default:            assert(false, "wrong type");
1953       }
1954       break;
1955     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1956     default:                assert(false, "wrong opcode");
1957   }
1958 }
1959 
1960 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1961   int vector_len = Assembler::AVX_256bit;
1962 
1963   switch (opcode) {
1964     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1965     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1966     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1967     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1968     default:                assert(false, "%s", NodeClassNames[opcode]);
1969   }
1970 }
1971 
1972 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1973                                   XMMRegister dst, XMMRegister src,
1974                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1975   switch (opcode) {
1976     case Op_AddReductionVF:
1977     case Op_MulReductionVF:
1978       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1979       break;
1980 
1981     case Op_AddReductionVD:
1982     case Op_MulReductionVD:
1983       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1984       break;
1985 
1986     default: assert(false, "wrong opcode");
1987   }
1988 }
1989 
1990 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1991                                             XMMRegister dst, XMMRegister src,
1992                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1993   switch (opcode) {
1994     case Op_AddReductionVF:
1995     case Op_MulReductionVF:
1996       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1997       break;
1998 
1999     case Op_AddReductionVD:
2000     case Op_MulReductionVD:
2001       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2002       break;
2003 
2004     default: assert(false, "%s", NodeClassNames[opcode]);
2005   }
2006 }
2007 
2008 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2009                              Register dst, Register src1, XMMRegister src2,
2010                              XMMRegister vtmp1, XMMRegister vtmp2) {
2011   switch (vlen) {
2012     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2013     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2014     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2016 
2017     default: assert(false, "wrong vector length");
2018   }
2019 }
2020 
2021 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2022                              Register dst, Register src1, XMMRegister src2,
2023                              XMMRegister vtmp1, XMMRegister vtmp2) {
2024   switch (vlen) {
2025     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2026     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2027     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2028     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2029 
2030     default: assert(false, "wrong vector length");
2031   }
2032 }
2033 
2034 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2035                              Register dst, Register src1, XMMRegister src2,
2036                              XMMRegister vtmp1, XMMRegister vtmp2) {
2037   switch (vlen) {
2038     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2039     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2040     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2041     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2042 
2043     default: assert(false, "wrong vector length");
2044   }
2045 }
2046 
2047 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2048                              Register dst, Register src1, XMMRegister src2,
2049                              XMMRegister vtmp1, XMMRegister vtmp2) {
2050   switch (vlen) {
2051     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2052     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2053     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2054     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2055 
2056     default: assert(false, "wrong vector length");
2057   }
2058 }
2059 
2060 #ifdef _LP64
2061 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2062                              Register dst, Register src1, XMMRegister src2,
2063                              XMMRegister vtmp1, XMMRegister vtmp2) {
2064   switch (vlen) {
2065     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2066     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2067     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2068 
2069     default: assert(false, "wrong vector length");
2070   }
2071 }
2072 #endif // _LP64
2073 
2074 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2075   switch (vlen) {
2076     case 2:
2077       assert(vtmp2 == xnoreg, "");
2078       reduce2F(opcode, dst, src, vtmp1);
2079       break;
2080     case 4:
2081       assert(vtmp2 == xnoreg, "");
2082       reduce4F(opcode, dst, src, vtmp1);
2083       break;
2084     case 8:
2085       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2086       break;
2087     case 16:
2088       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2089       break;
2090     default: assert(false, "wrong vector length");
2091   }
2092 }
2093 
2094 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2095   switch (vlen) {
2096     case 2:
2097       assert(vtmp2 == xnoreg, "");
2098       reduce2D(opcode, dst, src, vtmp1);
2099       break;
2100     case 4:
2101       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2102       break;
2103     case 8:
2104       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2105       break;
2106     default: assert(false, "wrong vector length");
2107   }
2108 }
2109 
2110 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   switch (vlen) {
2112     case 2:
2113       assert(vtmp1 == xnoreg, "");
2114       assert(vtmp2 == xnoreg, "");
2115       unorderedReduce2F(opcode, dst, src);
2116       break;
2117     case 4:
2118       assert(vtmp2 == xnoreg, "");
2119       unorderedReduce4F(opcode, dst, src, vtmp1);
2120       break;
2121     case 8:
2122       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2123       break;
2124     case 16:
2125       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2126       break;
2127     default: assert(false, "wrong vector length");
2128   }
2129 }
2130 
2131 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   switch (vlen) {
2133     case 2:
2134       assert(vtmp1 == xnoreg, "");
2135       assert(vtmp2 == xnoreg, "");
2136       unorderedReduce2D(opcode, dst, src);
2137       break;
2138     case 4:
2139       assert(vtmp2 == xnoreg, "");
2140       unorderedReduce4D(opcode, dst, src, vtmp1);
2141       break;
2142     case 8:
2143       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2144       break;
2145     default: assert(false, "wrong vector length");
2146   }
2147 }
2148 
2149 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150   if (opcode == Op_AddReductionVI) {
2151     if (vtmp1 != src2) {
2152       movdqu(vtmp1, src2);
2153     }
2154     phaddd(vtmp1, vtmp1);
2155   } else {
2156     pshufd(vtmp1, src2, 0x1);
2157     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2158   }
2159   movdl(vtmp2, src1);
2160   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2161   movdl(dst, vtmp1);
2162 }
2163 
2164 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2165   if (opcode == Op_AddReductionVI) {
2166     if (vtmp1 != src2) {
2167       movdqu(vtmp1, src2);
2168     }
2169     phaddd(vtmp1, src2);
2170     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2171   } else {
2172     pshufd(vtmp2, src2, 0xE);
2173     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2174     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2175   }
2176 }
2177 
2178 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2179   if (opcode == Op_AddReductionVI) {
2180     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2181     vextracti128_high(vtmp2, vtmp1);
2182     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2183     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2184   } else {
2185     vextracti128_high(vtmp1, src2);
2186     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2187     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2188   }
2189 }
2190 
2191 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2192   vextracti64x4_high(vtmp2, src2);
2193   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2194   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2195 }
2196 
2197 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2198   pshufd(vtmp2, src2, 0x1);
2199   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2200   movdqu(vtmp1, vtmp2);
2201   psrldq(vtmp1, 2);
2202   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2203   movdqu(vtmp2, vtmp1);
2204   psrldq(vtmp2, 1);
2205   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2206   movdl(vtmp2, src1);
2207   pmovsxbd(vtmp1, vtmp1);
2208   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2209   pextrb(dst, vtmp1, 0x0);
2210   movsbl(dst, dst);
2211 }
2212 
2213 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   pshufd(vtmp1, src2, 0xE);
2215   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2216   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2217 }
2218 
2219 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2220   vextracti128_high(vtmp2, src2);
2221   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2222   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2223 }
2224 
2225 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2226   vextracti64x4_high(vtmp1, src2);
2227   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2228   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2229 }
2230 
2231 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2232   pmovsxbw(vtmp2, src2);
2233   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2234 }
2235 
2236 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2237   if (UseAVX > 1) {
2238     int vector_len = Assembler::AVX_256bit;
2239     vpmovsxbw(vtmp1, src2, vector_len);
2240     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2241   } else {
2242     pmovsxbw(vtmp2, src2);
2243     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2244     pshufd(vtmp2, src2, 0x1);
2245     pmovsxbw(vtmp2, src2);
2246     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2247   }
2248 }
2249 
2250 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2251   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2252     int vector_len = Assembler::AVX_512bit;
2253     vpmovsxbw(vtmp1, src2, vector_len);
2254     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2255   } else {
2256     assert(UseAVX >= 2, "required");
2257     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2258     vextracti128_high(vtmp2, src2);
2259     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2260   }
2261 }
2262 
2263 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2264   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2265   vextracti64x4_high(vtmp2, src2);
2266   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2267 }
2268 
2269 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2270   if (opcode == Op_AddReductionVI) {
2271     if (vtmp1 != src2) {
2272       movdqu(vtmp1, src2);
2273     }
2274     phaddw(vtmp1, vtmp1);
2275     phaddw(vtmp1, vtmp1);
2276   } else {
2277     pshufd(vtmp2, src2, 0x1);
2278     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2279     movdqu(vtmp1, vtmp2);
2280     psrldq(vtmp1, 2);
2281     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2282   }
2283   movdl(vtmp2, src1);
2284   pmovsxwd(vtmp1, vtmp1);
2285   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2286   pextrw(dst, vtmp1, 0x0);
2287   movswl(dst, dst);
2288 }
2289 
2290 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2291   if (opcode == Op_AddReductionVI) {
2292     if (vtmp1 != src2) {
2293       movdqu(vtmp1, src2);
2294     }
2295     phaddw(vtmp1, src2);
2296   } else {
2297     pshufd(vtmp1, src2, 0xE);
2298     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2299   }
2300   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2301 }
2302 
2303 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2304   if (opcode == Op_AddReductionVI) {
2305     int vector_len = Assembler::AVX_256bit;
2306     vphaddw(vtmp2, src2, src2, vector_len);
2307     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2308   } else {
2309     vextracti128_high(vtmp2, src2);
2310     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2311   }
2312   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2313 }
2314 
2315 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2316   int vector_len = Assembler::AVX_256bit;
2317   vextracti64x4_high(vtmp1, src2);
2318   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2319   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2320 }
2321 
2322 #ifdef _LP64
2323 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2324   pshufd(vtmp2, src2, 0xE);
2325   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2326   movdq(vtmp1, src1);
2327   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2328   movdq(dst, vtmp1);
2329 }
2330 
2331 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2332   vextracti128_high(vtmp1, src2);
2333   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2334   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2335 }
2336 
2337 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2338   vextracti64x4_high(vtmp2, src2);
2339   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2340   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2341 }
2342 
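     // Build an opmask with the low 'len' bits set: start from all ones and let
     // BZHI clear every bit at position >= len.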
2343 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2344   mov64(temp, -1L);
2345   bzhiq(temp, temp, len);
2346   kmovql(dst, temp);
2347 }
2348 #endif // _LP64
2349 
2350 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2351   reduce_operation_128(T_FLOAT, opcode, dst, src);
2352   pshufd(vtmp, src, 0x1);
2353   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2354 }
2355 
2356 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2357   reduce2F(opcode, dst, src, vtmp);
2358   pshufd(vtmp, src, 0x2);
2359   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2360   pshufd(vtmp, src, 0x3);
2361   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2362 }
2363 
2364 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2365   reduce4F(opcode, dst, src, vtmp2);
2366   vextractf128_high(vtmp2, src);
2367   reduce4F(opcode, dst, vtmp2, vtmp1);
2368 }
2369 
2370 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2371   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2372   vextracti64x4_high(vtmp1, src);
2373   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2374 }
2375 
2376 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2377   pshufd(dst, src, 0x1);
2378   reduce_operation_128(T_FLOAT, opcode, dst, src);
2379 }
2380 
2381 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2382   pshufd(vtmp, src, 0xE);
2383   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2384   unorderedReduce2F(opcode, dst, vtmp);
2385 }
2386 
2387 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2388   vextractf128_high(vtmp1, src);
2389   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2390   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2391 }
2392 
2393 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2394   vextractf64x4_high(vtmp2, src);
2395   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2396   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2397 }
2398 
2399 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2400   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2401   pshufd(vtmp, src, 0xE);
2402   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2403 }
2404 
2405 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2406   reduce2D(opcode, dst, src, vtmp2);
2407   vextractf128_high(vtmp2, src);
2408   reduce2D(opcode, dst, vtmp2, vtmp1);
2409 }
2410 
2411 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2412   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2413   vextracti64x4_high(vtmp1, src);
2414   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2415 }
2416 
2417 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2418   pshufd(dst, src, 0xE);
2419   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2420 }
2421 
2422 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2423   vextractf128_high(vtmp, src);
2424   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2425   unorderedReduce2D(opcode, dst, vtmp);
2426 }
2427 
2428 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2429   vextractf64x4_high(vtmp2, src);
2430   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2431   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2432 }
2433 
2434 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2435   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2436 }
2437 
2438 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2439   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2440 }
2441 
2442 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2443   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2444 }
2445 
2446 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2447                                  int vec_enc) {
2448   switch(elem_bt) {
2449     case T_INT:
2450     case T_FLOAT:
2451       vmaskmovps(dst, src, mask, vec_enc);
2452       break;
2453     case T_LONG:
2454     case T_DOUBLE:
2455       vmaskmovpd(dst, src, mask, vec_enc);
2456       break;
2457     default:
2458       fatal("Unsupported type %s", type2name(elem_bt));
2459       break;
2460   }
2461 }
2462 
2463 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2464                                  int vec_enc) {
2465   switch(elem_bt) {
2466     case T_INT:
2467     case T_FLOAT:
2468       vmaskmovps(dst, src, mask, vec_enc);
2469       break;
2470     case T_LONG:
2471     case T_DOUBLE:
2472       vmaskmovpd(dst, src, mask, vec_enc);
2473       break;
2474     default:
2475       fatal("Unsupported type %s", type2name(elem_bt));
2476       break;
2477   }
2478 }
2479 
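     // Min/max reduction over float lanes: repeatedly fold the upper half of the
     // live width onto the lower half (extract the high 256/128 bits, or permute
     // within a 128-bit lane), then fold in dst when it carries an initial value.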
2480 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2481                                           XMMRegister dst, XMMRegister src,
2482                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2483                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2484   const int permconst[] = {1, 14};
2485   XMMRegister wsrc = src;
2486   XMMRegister wdst = xmm_0;
2487   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2488 
2489   int vlen_enc = Assembler::AVX_128bit;
2490   if (vlen == 16) {
2491     vlen_enc = Assembler::AVX_256bit;
2492   }
2493 
2494   for (int i = log2(vlen) - 1; i >=0; i--) {
2495     if (i == 0 && !is_dst_valid) {
2496       wdst = dst;
2497     }
2498     if (i == 3) {
2499       vextracti64x4_high(wtmp, wsrc);
2500     } else if (i == 2) {
2501       vextracti128_high(wtmp, wsrc);
2502     } else { // i = [0,1]
2503       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2504     }
2505     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2506     wsrc = wdst;
2507     vlen_enc = Assembler::AVX_128bit;
2508   }
2509   if (is_dst_valid) {
2510     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2511   }
2512 }
2513 
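     // Same folding scheme as above, for double lanes.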
2514 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2515                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2516                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2517   XMMRegister wsrc = src;
2518   XMMRegister wdst = xmm_0;
2519   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2520   int vlen_enc = Assembler::AVX_128bit;
2521   if (vlen == 8) {
2522     vlen_enc = Assembler::AVX_256bit;
2523   }
2524   for (int i = log2(vlen) - 1; i >=0; i--) {
2525     if (i == 0 && !is_dst_valid) {
2526       wdst = dst;
2527     }
2528     if (i == 1) {
2529       vextracti128_high(wtmp, wsrc);
2530     } else if (i == 2) {
2531       vextracti64x4_high(wtmp, wsrc);
2532     } else {
2533       assert(i == 0, "%d", i);
2534       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2535     }
2536     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2537     wsrc = wdst;
2538     vlen_enc = Assembler::AVX_128bit;
2539   }
2540   if (is_dst_valid) {
2541     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2542   }
2543 }
2544 
2545 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2546   switch (bt) {
2547     case T_BYTE:  pextrb(dst, src, idx); break;
2548     case T_SHORT: pextrw(dst, src, idx); break;
2549     case T_INT:   pextrd(dst, src, idx); break;
2550     case T_LONG:  pextrq(dst, src, idx); break;
2551 
2552     default:
2553       assert(false,"Should not reach here.");
2554       break;
2555   }
2556 }
2557 
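     // Return the register holding the 128-bit lane that contains element
     // 'elemindex', extracting that lane into dst when it is not the lowest one.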
2558 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2559   int esize =  type2aelembytes(typ);
2560   int elem_per_lane = 16/esize;
2561   int lane = elemindex / elem_per_lane;
2562   int eindex = elemindex % elem_per_lane;
2563 
2564   if (lane >= 2) {
2565     assert(UseAVX > 2, "required");
2566     vextractf32x4(dst, src, lane & 3);
2567     return dst;
2568   } else if (lane > 0) {
2569     assert(UseAVX > 0, "required");
2570     vextractf128(dst, src, lane);
2571     return dst;
2572   } else {
2573     return src;
2574   }
2575 }
2576 
2577 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2578   if (typ == T_BYTE) {
2579     movsbl(dst, dst);
2580   } else if (typ == T_SHORT) {
2581     movswl(dst, dst);
2582   }
2583 }
2584 
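     // Extract the integral element at 'elemindex' (modulo the 128-bit lane size)
     // into a GPR, sign-extending sub-word types to 32 bits.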
2585 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2586   int esize =  type2aelembytes(typ);
2587   int elem_per_lane = 16/esize;
2588   int eindex = elemindex % elem_per_lane;
2589   assert(is_integral_type(typ),"required");
2590 
2591   if (eindex == 0) {
2592     if (typ == T_LONG) {
2593       movq(dst, src);
2594     } else {
2595       movdl(dst, src);
2596       movsxl(typ, dst);
2597     }
2598   } else {
2599     extract(typ, dst, src, eindex);
2600     movsxl(typ, dst);
2601   }
2602 }
2603 
2604 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2605   int esize =  type2aelembytes(typ);
2606   int elem_per_lane = 16/esize;
2607   int eindex = elemindex % elem_per_lane;
2608   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2609 
2610   if (eindex == 0) {
2611     movq(dst, src);
2612   } else {
2613     if (typ == T_FLOAT) {
2614       if (UseAVX == 0) {
2615         movdqu(dst, src);
2616         shufps(dst, dst, eindex);
2617       } else {
2618         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2619       }
2620     } else {
2621       if (UseAVX == 0) {
2622         movdqu(dst, src);
2623         psrldq(dst, eindex*esize);
2624       } else {
2625         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2626       }
2627       movq(dst, dst);
2628     }
2629   }
2630   // Zero upper bits
2631   if (typ == T_FLOAT) {
2632     if (UseAVX == 0) {
2633       assert(vtmp != xnoreg, "required.");
2634       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2635       pand(dst, vtmp);
2636     } else {
2637       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2638     }
2639   }
2640 }
2641 
2642 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2643   switch(typ) {
2644     case T_BYTE:
2645     case T_BOOLEAN:
2646       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2647       break;
2648     case T_SHORT:
2649     case T_CHAR:
2650       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2651       break;
2652     case T_INT:
2653     case T_FLOAT:
2654       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2655       break;
2656     case T_LONG:
2657     case T_DOUBLE:
2658       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2659       break;
2660     default:
2661       assert(false,"Should not reach here.");
2662       break;
2663   }
2664 }
2665 
2666 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2667   assert(rscratch != noreg || always_reachable(src2), "missing");
2668 
2669   switch(typ) {
2670     case T_BOOLEAN:
2671     case T_BYTE:
2672       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2673       break;
2674     case T_CHAR:
2675     case T_SHORT:
2676       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2677       break;
2678     case T_INT:
2679     case T_FLOAT:
2680       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2681       break;
2682     case T_LONG:
2683     case T_DOUBLE:
2684       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2685       break;
2686     default:
2687       assert(false,"Should not reach here.");
2688       break;
2689   }
2690 }
2691 
2692 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2693   switch(typ) {
2694     case T_BYTE:
2695       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2696       break;
2697     case T_SHORT:
2698       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2699       break;
2700     case T_INT:
2701     case T_FLOAT:
2702       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2703       break;
2704     case T_LONG:
2705     case T_DOUBLE:
2706       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2707       break;
2708     default:
2709       assert(false,"Should not reach here.");
2710       break;
2711   }
2712 }
2713 
2714 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2715   assert(vlen_in_bytes <= 32, "");
2716   int esize = type2aelembytes(bt);
2717   if (vlen_in_bytes == 32) {
2718     assert(vtmp == xnoreg, "required.");
2719     if (esize >= 4) {
2720       vtestps(src1, src2, AVX_256bit);
2721     } else {
2722       vptest(src1, src2, AVX_256bit);
2723     }
2724     return;
2725   }
2726   if (vlen_in_bytes < 16) {
2727     // Duplicate the lower part to fill the whole register;
2728     // there is no need to do so for src2.
2729     assert(vtmp != xnoreg, "required");
2730     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
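         // pshufd selectors: 0x00 broadcasts dword 0; 0x04 = {0,1,0,0} keeps dwords 0-1 and fills the rest with dword 0.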
2731     pshufd(vtmp, src1, shuffle_imm);
2732   } else {
2733     assert(vtmp == xnoreg, "required");
2734     vtmp = src1;
2735   }
2736   if (esize >= 4 && VM_Version::supports_avx()) {
2737     vtestps(vtmp, src2, AVX_128bit);
2738   } else {
2739     ptest(vtmp, src2);
2740   }
2741 }
2742 
2743 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2744 #ifdef ASSERT
2745   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2746   bool is_bw_supported = VM_Version::supports_avx512bw();
2747   if (is_bw && !is_bw_supported) {
2748     assert(vlen_enc != Assembler::AVX_512bit, "required");
2749     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2750            "XMM register should be 0-15");
2751   }
2752 #endif // ASSERT
2753   switch (elem_bt) {
2754     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2755     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2756     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2757     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2758     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2759     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2760     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2761   }
2762 }
2763 
2764 #ifdef _LP64
2765 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2766   assert(UseAVX >= 2, "required");
2767   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2768   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2769   if ((UseAVX > 2) &&
2770       (!is_bw || VM_Version::supports_avx512bw()) &&
2771       (!is_vl || VM_Version::supports_avx512vl())) {
2772     switch (elem_bt) {
2773       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2774       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2775       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2776       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2777       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2778     }
2779   } else {
2780     assert(vlen_enc != Assembler::AVX_512bit, "required");
2781     assert((dst->encoding() < 16),"XMM register should be 0-15");
2782     switch (elem_bt) {
2783       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2784       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2785       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2786       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2787       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2788       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2789       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2790     }
2791   }
2792 }
2793 #endif
2794 
2795 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2796   switch (to_elem_bt) {
2797     case T_SHORT:
2798       vpmovsxbw(dst, src, vlen_enc);
2799       break;
2800     case T_INT:
2801       vpmovsxbd(dst, src, vlen_enc);
2802       break;
2803     case T_FLOAT:
2804       vpmovsxbd(dst, src, vlen_enc);
2805       vcvtdq2ps(dst, dst, vlen_enc);
2806       break;
2807     case T_LONG:
2808       vpmovsxbq(dst, src, vlen_enc);
2809       break;
2810     case T_DOUBLE: {
2811       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2812       vpmovsxbd(dst, src, mid_vlen_enc);
2813       vcvtdq2pd(dst, dst, vlen_enc);
2814       break;
2815     }
2816     default:
2817       fatal("Unsupported type %s", type2name(to_elem_bt));
2818       break;
2819   }
2820 }
2821 
2822 //-------------------------------------------------------------------------------------------
2823 
2824 // IndexOf for constant substrings with size >= 8 chars
2825 // which don't need to be loaded through stack.
2826 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2827                                          Register cnt1, Register cnt2,
2828                                          int int_cnt2,  Register result,
2829                                          XMMRegister vec, Register tmp,
2830                                          int ae) {
2831   ShortBranchVerifier sbv(this);
2832   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2833   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2834 
2835   // This method uses the pcmpestri instruction with bound registers
2836   //   inputs:
2837   //     xmm - substring
2838   //     rax - substring length (elements count)
2839   //     mem - scanned string
2840   //     rdx - string length (elements count)
2841   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2842   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2843   //   outputs:
2844   //     rcx - matched index in string
2845   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2846   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2847   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2848   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2849   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2850 
2851   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2852         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2853         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2854 
2855   // Note, inline_string_indexOf() generates checks:
2856   // if (substr.count > string.count) return -1;
2857   // if (substr.count == 0) return 0;
2858   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2859 
2860   // Load substring.
2861   if (ae == StrIntrinsicNode::UL) {
2862     pmovzxbw(vec, Address(str2, 0));
2863   } else {
2864     movdqu(vec, Address(str2, 0));
2865   }
2866   movl(cnt2, int_cnt2);
2867   movptr(result, str1); // string addr
2868 
2869   if (int_cnt2 > stride) {
2870     jmpb(SCAN_TO_SUBSTR);
2871 
2872     // Reload substr for rescan; this code
2873     // is executed only for large substrings (> 8 chars).
2874     bind(RELOAD_SUBSTR);
2875     if (ae == StrIntrinsicNode::UL) {
2876       pmovzxbw(vec, Address(str2, 0));
2877     } else {
2878       movdqu(vec, Address(str2, 0));
2879     }
2880     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2881 
2882     bind(RELOAD_STR);
2883     // We came here after the beginning of the substring was
2884     // matched but the rest of it was not, so we need to search
2885     // again. Start from the next element after the previous match.
2886 
2887     // cnt2 is the number of remaining substring elements and
2888     // cnt1 is the number of remaining string elements when the compare failed.
2889     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2890     subl(cnt1, cnt2);
2891     addl(cnt1, int_cnt2);
2892     movl(cnt2, int_cnt2); // Now restore cnt2
2893 
2894     decrementl(cnt1);     // Shift to next element
2895     cmpl(cnt1, cnt2);
2896     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2897 
2898     addptr(result, (1<<scale1));
2899 
2900   } // (int_cnt2 > 8)
2901 
2902   // Scan string for start of substr in 16-byte vectors
2903   bind(SCAN_TO_SUBSTR);
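       // pcmpestri (equal-ordered mode) sets CF when a match of the substring (possibly
       // partial at the window end) starts in this 16-byte chunk, with its index in rcx;
       // OF is set when the match starts at element 0.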
2904   pcmpestri(vec, Address(result, 0), mode);
2905   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2906   subl(cnt1, stride);
2907   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2908   cmpl(cnt1, cnt2);
2909   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2910   addptr(result, 16);
2911   jmpb(SCAN_TO_SUBSTR);
2912 
2913   // Found a potential substr
2914   bind(FOUND_CANDIDATE);
2915   // Matched whole vector if first element matched (tmp(rcx) == 0).
2916   if (int_cnt2 == stride) {
2917     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2918   } else { // int_cnt2 > 8
2919     jccb(Assembler::overflow, FOUND_SUBSTR);
2920   }
2921   // After pcmpestri tmp(rcx) contains matched element index
2922   // Compute start addr of substr
2923   lea(result, Address(result, tmp, scale1));
2924 
2925   // Make sure string is still long enough
2926   subl(cnt1, tmp);
2927   cmpl(cnt1, cnt2);
2928   if (int_cnt2 == stride) {
2929     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2930   } else { // int_cnt2 > 8
2931     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2932   }
2933   // Left less than substring.
2934 
2935   bind(RET_NOT_FOUND);
2936   movl(result, -1);
2937   jmp(EXIT);
2938 
2939   if (int_cnt2 > stride) {
2940     // This code is optimized for the case when whole substring
2941     // is matched if its head is matched.
2942     bind(MATCH_SUBSTR_HEAD);
2943     pcmpestri(vec, Address(result, 0), mode);
2944     // Reload only the string if it does not match
2945     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2946 
2947     Label CONT_SCAN_SUBSTR;
2948     // Compare the rest of substring (> 8 chars).
2949     bind(FOUND_SUBSTR);
2950     // First 8 chars are already matched.
2951     negptr(cnt2);
2952     addptr(cnt2, stride);
2953 
2954     bind(SCAN_SUBSTR);
2955     subl(cnt1, stride);
2956     cmpl(cnt2, -stride); // Do not read beyond substring
2957     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2958     // Back-up strings to avoid reading beyond substring:
2959     // cnt1 = cnt1 - cnt2 + 8
2960     addl(cnt1, cnt2); // cnt2 is negative
2961     addl(cnt1, stride);
2962     movl(cnt2, stride); negptr(cnt2);
2963     bind(CONT_SCAN_SUBSTR);
2964     if (int_cnt2 < (int)G) {
2965       int tail_off1 = int_cnt2<<scale1;
2966       int tail_off2 = int_cnt2<<scale2;
2967       if (ae == StrIntrinsicNode::UL) {
2968         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2969       } else {
2970         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2971       }
2972       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2973     } else {
2974       // calculate index in register to avoid integer overflow (int_cnt2*2)
2975       movl(tmp, int_cnt2);
2976       addptr(tmp, cnt2);
2977       if (ae == StrIntrinsicNode::UL) {
2978         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2979       } else {
2980         movdqu(vec, Address(str2, tmp, scale2, 0));
2981       }
2982       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2983     }
2984     // Need to reload string pointers if we did not match the whole vector
2985     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2986     addptr(cnt2, stride);
2987     jcc(Assembler::negative, SCAN_SUBSTR);
2988     // Fall through if found full substring
2989 
2990   } // (int_cnt2 > 8)
2991 
2992   bind(RET_FOUND);
2993   // Found result if we matched full small substring.
2994   // Compute substr offset
2995   subptr(result, str1);
2996   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2997     shrl(result, 1); // index
2998   }
2999   bind(EXIT);
3000 
3001 } // string_indexofC8
3002 
3003 // Small strings are loaded through stack if they cross page boundary.
3004 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3005                                        Register cnt1, Register cnt2,
3006                                        int int_cnt2,  Register result,
3007                                        XMMRegister vec, Register tmp,
3008                                        int ae) {
3009   ShortBranchVerifier sbv(this);
3010   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3011   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3012 
3013   //
3014   // int_cnt2 is length of small (< 8 chars) constant substring
3015   // or (-1) for non constant substring in which case its length
3016   // is in cnt2 register.
3017   //
3018   // Note, inline_string_indexOf() generates checks:
3019   // if (substr.count > string.count) return -1;
3020   // if (substr.count == 0) return 0;
3021   //
3022   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3023   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3024   // This method uses the pcmpestri instruction with bound registers
3025   //   inputs:
3026   //     xmm - substring
3027   //     rax - substring length (elements count)
3028   //     mem - scanned string
3029   //     rdx - string length (elements count)
3030   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3031   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3032   //   outputs:
3033   //     rcx - matched index in string
3034   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3035   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3036   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3037   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3038 
3039   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3040         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3041         FOUND_CANDIDATE;
3042 
3043   { //========================================================
3044     // We don't know where these strings are located
3045     // and we can't read beyond them. Load them through stack.
3046     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3047 
3048     movptr(tmp, rsp); // save old SP
3049 
3050     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3051       if (int_cnt2 == (1>>scale2)) { // One byte
3052         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3053         load_unsigned_byte(result, Address(str2, 0));
3054         movdl(vec, result); // move 32 bits
3055       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3056         // Not enough header space in 32-bit VM: 12+3 = 15.
3057         movl(result, Address(str2, -1));
3058         shrl(result, 8);
3059         movdl(vec, result); // move 32 bits
3060       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3061         load_unsigned_short(result, Address(str2, 0));
3062         movdl(vec, result); // move 32 bits
3063       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3064         movdl(vec, Address(str2, 0)); // move 32 bits
3065       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3066         movq(vec, Address(str2, 0));  // move 64 bits
3067       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3068         // Array header size is 12 bytes in 32-bit VM
3069         // + 6 bytes for 3 chars == 18 bytes,
3070         // enough space to load vec and shift.
3071         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3072         if (ae == StrIntrinsicNode::UL) {
3073           int tail_off = int_cnt2-8;
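               // tail_off is negative here (int_cnt2 < 8): the load starts 8 - int_cnt2 bytes
               // before the substring, and the byte shift right by 2*(8 - int_cnt2) drops the
               // unwanted leading words.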
3074           pmovzxbw(vec, Address(str2, tail_off));
3075           psrldq(vec, -2*tail_off);
3076         }
3077         else {
3078           int tail_off = int_cnt2*(1<<scale2);
3079           movdqu(vec, Address(str2, tail_off-16));
3080           psrldq(vec, 16-tail_off);
3081         }
3082       }
3083     } else { // not constant substring
3084       cmpl(cnt2, stride);
3085       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3086 
3087       // We can read beyond the string if str+16 does not cross a page boundary
3088       // since heaps are aligned and mapped by pages.
3089       assert(os::vm_page_size() < (int)G, "default page should be small");
3090       movl(result, str2); // We need only low 32 bits
3091       andl(result, ((int)os::vm_page_size()-1));
3092       cmpl(result, ((int)os::vm_page_size()-16));
3093       jccb(Assembler::belowEqual, CHECK_STR);
3094 
3095       // Move small strings to stack to allow load 16 bytes into vec.
3096       subptr(rsp, 16);
3097       int stk_offset = wordSize-(1<<scale2);
3098       push(cnt2);
3099 
3100       bind(COPY_SUBSTR);
3101       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3102         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3103         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3104       } else if (ae == StrIntrinsicNode::UU) {
3105         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3106         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3107       }
3108       decrement(cnt2);
3109       jccb(Assembler::notZero, COPY_SUBSTR);
3110 
3111       pop(cnt2);
3112       movptr(str2, rsp);  // New substring address
3113     } // non constant
3114 
3115     bind(CHECK_STR);
3116     cmpl(cnt1, stride);
3117     jccb(Assembler::aboveEqual, BIG_STRINGS);
3118 
3119     // Check cross page boundary.
3120     movl(result, str1); // We need only low 32 bits
3121     andl(result, ((int)os::vm_page_size()-1));
3122     cmpl(result, ((int)os::vm_page_size()-16));
3123     jccb(Assembler::belowEqual, BIG_STRINGS);
3124 
3125     subptr(rsp, 16);
3126     int stk_offset = -(1<<scale1);
3127     if (int_cnt2 < 0) { // not constant
3128       push(cnt2);
3129       stk_offset += wordSize;
3130     }
3131     movl(cnt2, cnt1);
3132 
3133     bind(COPY_STR);
3134     if (ae == StrIntrinsicNode::LL) {
3135       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3136       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3137     } else {
3138       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3139       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3140     }
3141     decrement(cnt2);
3142     jccb(Assembler::notZero, COPY_STR);
3143 
3144     if (int_cnt2 < 0) { // not constant
3145       pop(cnt2);
3146     }
3147     movptr(str1, rsp);  // New string address
3148 
3149     bind(BIG_STRINGS);
3150     // Load substring.
3151     if (int_cnt2 < 0) { // -1
3152       if (ae == StrIntrinsicNode::UL) {
3153         pmovzxbw(vec, Address(str2, 0));
3154       } else {
3155         movdqu(vec, Address(str2, 0));
3156       }
3157       push(cnt2);       // substr count
3158       push(str2);       // substr addr
3159       push(str1);       // string addr
3160     } else {
3161       // Small (< 8 chars) constant substrings are loaded already.
3162       movl(cnt2, int_cnt2);
3163     }
3164     push(tmp);  // original SP
3165 
3166   } // Finished loading
3167 
3168   //========================================================
3169   // Start search
3170   //
3171 
3172   movptr(result, str1); // string addr
3173 
3174   if (int_cnt2  < 0) {  // Only for non constant substring
3175     jmpb(SCAN_TO_SUBSTR);
3176 
3177     // SP saved at sp+0
3178     // String saved at sp+1*wordSize
3179     // Substr saved at sp+2*wordSize
3180     // Substr count saved at sp+3*wordSize
3181 
3182     // Reload substr for rescan; this code
3183     // is executed only for large substrings (> 8 chars).
3184     bind(RELOAD_SUBSTR);
3185     movptr(str2, Address(rsp, 2*wordSize));
3186     movl(cnt2, Address(rsp, 3*wordSize));
3187     if (ae == StrIntrinsicNode::UL) {
3188       pmovzxbw(vec, Address(str2, 0));
3189     } else {
3190       movdqu(vec, Address(str2, 0));
3191     }
3192     // We came here after the beginning of the substring was
3193     // matched but the rest of it was not, so we need to search
3194     // again. Start from the next element after the previous match.
3195     subptr(str1, result); // Restore counter
3196     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3197       shrl(str1, 1);
3198     }
3199     addl(cnt1, str1);
3200     decrementl(cnt1);   // Shift to next element
3201     cmpl(cnt1, cnt2);
3202     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3203 
3204     addptr(result, (1<<scale1));
3205   } // non constant
3206 
3207   // Scan string for start of substr in 16-byte vectors
3208   bind(SCAN_TO_SUBSTR);
3209   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3210   pcmpestri(vec, Address(result, 0), mode);
3211   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3212   subl(cnt1, stride);
3213   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3214   cmpl(cnt1, cnt2);
3215   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3216   addptr(result, 16);
3217 
3218   bind(ADJUST_STR);
3219   cmpl(cnt1, stride); // Do not read beyond string
3220   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3221   // Back-up string to avoid reading beyond string.
3222   lea(result, Address(result, cnt1, scale1, -16));
3223   movl(cnt1, stride);
3224   jmpb(SCAN_TO_SUBSTR);
3225 
3226   // Found a potential substr
3227   bind(FOUND_CANDIDATE);
3228   // After pcmpestri tmp(rcx) contains matched element index
3229 
3230   // Make sure string is still long enough
3231   subl(cnt1, tmp);
3232   cmpl(cnt1, cnt2);
3233   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3234   // Left less than substring.
3235 
3236   bind(RET_NOT_FOUND);
3237   movl(result, -1);
3238   jmp(CLEANUP);
3239 
3240   bind(FOUND_SUBSTR);
3241   // Compute start addr of substr
3242   lea(result, Address(result, tmp, scale1));
3243   if (int_cnt2 > 0) { // Constant substring
3244     // Repeat search for small substring (< 8 chars)
3245     // from new point without reloading substring.
3246     // Have to check that we don't read beyond string.
3247     cmpl(tmp, stride-int_cnt2);
3248     jccb(Assembler::greater, ADJUST_STR);
3249     // Fall through if matched whole substring.
3250   } else { // non constant
3251     assert(int_cnt2 == -1, "should be != 0");
3252 
3253     addl(tmp, cnt2);
3254     // Found result if we matched whole substring.
3255     cmpl(tmp, stride);
3256     jcc(Assembler::lessEqual, RET_FOUND);
3257 
3258     // Repeat search for small substring (<= 8 chars)
3259     // from new point 'str1' without reloading substring.
3260     cmpl(cnt2, stride);
3261     // Have to check that we don't read beyond string.
3262     jccb(Assembler::lessEqual, ADJUST_STR);
3263 
3264     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3265     // Compare the rest of substring (> 8 chars).
3266     movptr(str1, result);
3267 
3268     cmpl(tmp, cnt2);
3269     // First 8 chars are already matched.
3270     jccb(Assembler::equal, CHECK_NEXT);
3271 
3272     bind(SCAN_SUBSTR);
3273     pcmpestri(vec, Address(str1, 0), mode);
3274     // Need to reload string pointers if we did not match the whole vector
3275     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3276 
3277     bind(CHECK_NEXT);
3278     subl(cnt2, stride);
3279     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3280     addptr(str1, 16);
3281     if (ae == StrIntrinsicNode::UL) {
3282       addptr(str2, 8);
3283     } else {
3284       addptr(str2, 16);
3285     }
3286     subl(cnt1, stride);
3287     cmpl(cnt2, stride); // Do not read beyond substring
3288     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3289     // Back-up strings to avoid reading beyond substring.
3290 
3291     if (ae == StrIntrinsicNode::UL) {
3292       lea(str2, Address(str2, cnt2, scale2, -8));
3293       lea(str1, Address(str1, cnt2, scale1, -16));
3294     } else {
3295       lea(str2, Address(str2, cnt2, scale2, -16));
3296       lea(str1, Address(str1, cnt2, scale1, -16));
3297     }
3298     subl(cnt1, cnt2);
3299     movl(cnt2, stride);
3300     addl(cnt1, stride);
3301     bind(CONT_SCAN_SUBSTR);
3302     if (ae == StrIntrinsicNode::UL) {
3303       pmovzxbw(vec, Address(str2, 0));
3304     } else {
3305       movdqu(vec, Address(str2, 0));
3306     }
3307     jmp(SCAN_SUBSTR);
3308 
3309     bind(RET_FOUND_LONG);
3310     movptr(str1, Address(rsp, wordSize));
3311   } // non constant
3312 
3313   bind(RET_FOUND);
3314   // Compute substr offset
3315   subptr(result, str1);
3316   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3317     shrl(result, 1); // index
3318   }
3319   bind(CLEANUP);
3320   pop(rsp); // restore SP
3321 
3322 } // string_indexof
3323 
3324 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3325                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3326   ShortBranchVerifier sbv(this);
3327   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3328 
3329   int stride = 8;
3330 
3331   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3332         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3333         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3334         FOUND_SEQ_CHAR, DONE_LABEL;
3335 
3336   movptr(result, str1);
3337   if (UseAVX >= 2) {
3338     cmpl(cnt1, stride);
3339     jcc(Assembler::less, SCAN_TO_CHAR);
3340     cmpl(cnt1, 2*stride);
3341     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3342     movdl(vec1, ch);
3343     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3344     vpxor(vec2, vec2);
3345     movl(tmp, cnt1);
3346     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3347     andl(cnt1,0x0000000F);  //tail count (in chars)
3348 
3349     bind(SCAN_TO_16_CHAR_LOOP);
3350     vmovdqu(vec3, Address(result, 0));
3351     vpcmpeqw(vec3, vec3, vec1, 1);
3352     vptest(vec2, vec3);
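         // With vec2 == 0, vptest sets CF iff vec3 is all zero (no matching chars);
         // carryClear therefore means at least one char matched.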
3353     jcc(Assembler::carryClear, FOUND_CHAR);
3354     addptr(result, 32);
3355     subl(tmp, 2*stride);
3356     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3357     jmp(SCAN_TO_8_CHAR);
3358     bind(SCAN_TO_8_CHAR_INIT);
3359     movdl(vec1, ch);
3360     pshuflw(vec1, vec1, 0x00);
3361     pshufd(vec1, vec1, 0);
3362     pxor(vec2, vec2);
3363   }
3364   bind(SCAN_TO_8_CHAR);
3365   cmpl(cnt1, stride);
3366   jcc(Assembler::less, SCAN_TO_CHAR);
3367   if (UseAVX < 2) {
3368     movdl(vec1, ch);
3369     pshuflw(vec1, vec1, 0x00);
3370     pshufd(vec1, vec1, 0);
3371     pxor(vec2, vec2);
3372   }
3373   movl(tmp, cnt1);
3374   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3375   andl(cnt1,0x00000007);  //tail count (in chars)
3376 
3377   bind(SCAN_TO_8_CHAR_LOOP);
3378   movdqu(vec3, Address(result, 0));
3379   pcmpeqw(vec3, vec1);
3380   ptest(vec2, vec3);
3381   jcc(Assembler::carryClear, FOUND_CHAR);
3382   addptr(result, 16);
3383   subl(tmp, stride);
3384   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3385   bind(SCAN_TO_CHAR);
3386   testl(cnt1, cnt1);
3387   jcc(Assembler::zero, RET_NOT_FOUND);
3388   bind(SCAN_TO_CHAR_LOOP);
3389   load_unsigned_short(tmp, Address(result, 0));
3390   cmpl(ch, tmp);
3391   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3392   addptr(result, 2);
3393   subl(cnt1, 1);
3394   jccb(Assembler::zero, RET_NOT_FOUND);
3395   jmp(SCAN_TO_CHAR_LOOP);
3396 
3397   bind(RET_NOT_FOUND);
3398   movl(result, -1);
3399   jmpb(DONE_LABEL);
3400 
3401   bind(FOUND_CHAR);
3402   if (UseAVX >= 2) {
3403     vpmovmskb(tmp, vec3);
3404   } else {
3405     pmovmskb(tmp, vec3);
3406   }
3407   bsfl(ch, tmp);
3408   addptr(result, ch);
3409 
3410   bind(FOUND_SEQ_CHAR);
3411   subptr(result, str1);
3412   shrl(result, 1);
3413 
3414   bind(DONE_LABEL);
3415 } // string_indexof_char
3416 
3417 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3418                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3419   ShortBranchVerifier sbv(this);
3420   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3421 
3422   int stride = 16;
3423 
3424   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3425         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3426         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3427         FOUND_SEQ_CHAR, DONE_LABEL;
3428 
3429   movptr(result, str1);
3430   if (UseAVX >= 2) {
3431     cmpl(cnt1, stride);
3432     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3433     cmpl(cnt1, stride*2);
3434     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3435     movdl(vec1, ch);
3436     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3437     vpxor(vec2, vec2);
3438     movl(tmp, cnt1);
3439     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3440     andl(cnt1,0x0000001F);  //tail count (in chars)
3441 
3442     bind(SCAN_TO_32_CHAR_LOOP);
3443     vmovdqu(vec3, Address(result, 0));
3444     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3445     vptest(vec2, vec3);
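         // With vec2 == 0, vptest sets CF iff vec3 is all zero (no matching bytes);
         // carryClear therefore means at least one byte matched.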
3446     jcc(Assembler::carryClear, FOUND_CHAR);
3447     addptr(result, 32);
3448     subl(tmp, stride*2);
3449     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3450     jmp(SCAN_TO_16_CHAR);
3451 
3452     bind(SCAN_TO_16_CHAR_INIT);
3453     movdl(vec1, ch);
3454     pxor(vec2, vec2);
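         // pshufb with the all-zero mask in vec2 broadcasts byte 0 of vec1 to every byte lane.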
3455     pshufb(vec1, vec2);
3456   }
3457 
3458   bind(SCAN_TO_16_CHAR);
3459   cmpl(cnt1, stride);
3460   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3461   if (UseAVX < 2) {
3462     movdl(vec1, ch);
3463     pxor(vec2, vec2);
3464     pshufb(vec1, vec2);
3465   }
3466   movl(tmp, cnt1);
3467   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3468   andl(cnt1,0x0000000F);  //tail count (in bytes)
3469 
3470   bind(SCAN_TO_16_CHAR_LOOP);
3471   movdqu(vec3, Address(result, 0));
3472   pcmpeqb(vec3, vec1);
3473   ptest(vec2, vec3);
3474   jcc(Assembler::carryClear, FOUND_CHAR);
3475   addptr(result, 16);
3476   subl(tmp, stride);
3477   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3478 
3479   bind(SCAN_TO_CHAR_INIT);
3480   testl(cnt1, cnt1);
3481   jcc(Assembler::zero, RET_NOT_FOUND);
3482   bind(SCAN_TO_CHAR_LOOP);
3483   load_unsigned_byte(tmp, Address(result, 0));
3484   cmpl(ch, tmp);
3485   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3486   addptr(result, 1);
3487   subl(cnt1, 1);
3488   jccb(Assembler::zero, RET_NOT_FOUND);
3489   jmp(SCAN_TO_CHAR_LOOP);
3490 
3491   bind(RET_NOT_FOUND);
3492   movl(result, -1);
3493   jmpb(DONE_LABEL);
3494 
3495   bind(FOUND_CHAR);
3496   if (UseAVX >= 2) {
3497     vpmovmskb(tmp, vec3);
3498   } else {
3499     pmovmskb(tmp, vec3);
3500   }
3501   bsfl(ch, tmp);
3502   addptr(result, ch);
3503 
3504   bind(FOUND_SEQ_CHAR);
3505   subptr(result, str1);
3506 
3507   bind(DONE_LABEL);
3508 } // stringL_indexof_char
3509 
3510 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3511   switch (eltype) {
3512   case T_BOOLEAN: return sizeof(jboolean);
3513   case T_BYTE:  return sizeof(jbyte);
3514   case T_SHORT: return sizeof(jshort);
3515   case T_CHAR:  return sizeof(jchar);
3516   case T_INT:   return sizeof(jint);
3517   default:
3518     ShouldNotReachHere();
3519     return -1;
3520   }
3521 }
3522 
3523 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3524   switch (eltype) {
3525   // T_BOOLEAN used as surrogate for unsigned byte
3526   case T_BOOLEAN: movzbl(dst, src);   break;
3527   case T_BYTE:    movsbl(dst, src);   break;
3528   case T_SHORT:   movswl(dst, src);   break;
3529   case T_CHAR:    movzwl(dst, src);   break;
3530   case T_INT:     movl(dst, src);     break;
3531   default:
3532     ShouldNotReachHere();
3533   }
3534 }
3535 
3536 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3537   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3538 }
3539 
3540 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3541   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3542 }
3543 
3544 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3545   const int vlen = Assembler::AVX_256bit;
3546   switch (eltype) {
3547   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3548   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3549   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3550   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3551   case T_INT:
3552     // do nothing
3553     break;
3554   default:
3555     ShouldNotReachHere();
3556   }
3557 }
3558 
3559 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3560                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3561                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3562                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3563                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3564                                         BasicType eltype) {
3565   ShortBranchVerifier sbv(this);
3566   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3567   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3568   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3569 
3570   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3571         SHORT_UNROLLED_LOOP_EXIT,
3572         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3573         UNROLLED_VECTOR_LOOP_BEGIN,
3574         END;
3575   switch (eltype) {
3576   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3577   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3578   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3579   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3580   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3581   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3582   }
3583 
3584   // Register aliases ("renaming") for readability of the code
3585   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3586                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3587                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3588 
3589   const int elsize = arrays_hashcode_elsize(eltype);
3590 
3591   /*
3592     if (cnt1 >= 2) {
3593       if (cnt1 >= 32) {
3594         UNROLLED VECTOR LOOP
3595       }
3596       UNROLLED SCALAR LOOP
3597     }
3598     SINGLE SCALAR
3599    */
3600 
3601   cmpl(cnt1, 32);
3602   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3603 
3604   // cnt1 >= 32 && generate_vectorized_loop
3605   xorl(index, index);
3606 
3607   // vresult = IntVector.zero(I256);
3608   for (int idx = 0; idx < 4; idx++) {
3609     vpxor(vresult[idx], vresult[idx]);
3610   }
3611   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3612   Register bound = tmp2;
3613   Register next = tmp3;
3614   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3615   movl(next, Address(tmp2, 0));
3616   movdl(vnext, next);
3617   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3618 
3619   // index = 0;
3620   // bound = cnt1 & ~(32 - 1);
3621   movl(bound, cnt1);
3622   andl(bound, ~(32 - 1));
3623   // for (; index < bound; index += 32) {
3624   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3625   // result *= next;
3626   imull(result, next);
3627   // Loop fission to front-load the cost of fetching from memory; OOO execution
3628   // can then hopefully do a better job of prefetching.
3629   for (int idx = 0; idx < 4; idx++) {
3630     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3631   }
3632   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3633   for (int idx = 0; idx < 4; idx++) {
3634     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3635     arrays_hashcode_elvcast(vtmp[idx], eltype);
3636     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3637   }
3638   // index += 32;
3639   addl(index, 32);
3640   // index < bound;
3641   cmpl(index, bound);
3642   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3643   // }
3644 
3645   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3646   subl(cnt1, bound);
3647   // release bound
3648 
3649   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3650   for (int idx = 0; idx < 4; idx++) {
3651     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3652     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3653     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3654   }
3655   // result += vresult.reduceLanes(ADD);
3656   for (int idx = 0; idx < 4; idx++) {
3657     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3658   }
3659 
3660   // } else if (cnt1 < 32) {
3661 
3662   bind(SHORT_UNROLLED_BEGIN);
3663   // int i = 1;
3664   movl(index, 1);
3665   cmpl(index, cnt1);
3666   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3667 
3668   // for (; i < cnt1 ; i += 2) {
3669   bind(SHORT_UNROLLED_LOOP_BEGIN);
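       // result = result*31*31 + ary1[i-1]*31 + ary1[i]
       // (961 == 31*31; x*31 is computed as (x << 5) - x)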
3670   movl(tmp3, 961);
3671   imull(result, tmp3);
3672   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3673   movl(tmp3, tmp2);
3674   shll(tmp3, 5);
3675   subl(tmp3, tmp2);
3676   addl(result, tmp3);
3677   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3678   addl(result, tmp3);
3679   addl(index, 2);
3680   cmpl(index, cnt1);
3681   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3682 
3683   // }
3684   // if (i >= cnt1) {
3685   bind(SHORT_UNROLLED_LOOP_EXIT);
3686   jccb(Assembler::greater, END);
3687   movl(tmp2, result);
3688   shll(result, 5);
3689   subl(result, tmp2);
3690   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3691   addl(result, tmp3);
3692   // }
3693   bind(END);
3694 
3695   BLOCK_COMMENT("} // arrays_hashcode");
3696 
3697 } // arrays_hashcode
3698 
3699 // helper function for string_compare
3700 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3701                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3702                                            Address::ScaleFactor scale2, Register index, int ae) {
3703   if (ae == StrIntrinsicNode::LL) {
3704     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3705     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3706   } else if (ae == StrIntrinsicNode::UU) {
3707     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3708     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3709   } else {
3710     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3711     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3712   }
3713 }
3714 
3715 // Compare strings, used for char[] and byte[].
3716 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3717                                        Register cnt1, Register cnt2, Register result,
3718                                        XMMRegister vec1, int ae, KRegister mask) {
3719   ShortBranchVerifier sbv(this);
3720   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3721   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3722   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3723   int stride2x2 = 0x40;
3724   Address::ScaleFactor scale = Address::no_scale;
3725   Address::ScaleFactor scale1 = Address::no_scale;
3726   Address::ScaleFactor scale2 = Address::no_scale;
3727 
3728   if (ae != StrIntrinsicNode::LL) {
3729     stride2x2 = 0x20;
3730   }
3731 
3732   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3733     shrl(cnt2, 1);
3734   }
3735   // Compute the minimum of the string lengths and the
3736   // difference of the string lengths (pushed on the stack).
3737   // Do the conditional move stuff.
3738   movl(result, cnt1);
3739   subl(cnt1, cnt2);
3740   push(cnt1);
3741   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3742 
3743   // Is the minimum length zero?
3744   testl(cnt2, cnt2);
3745   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3746   if (ae == StrIntrinsicNode::LL) {
3747     // Load first bytes
3748     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3749     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3750   } else if (ae == StrIntrinsicNode::UU) {
3751     // Load first characters
3752     load_unsigned_short(result, Address(str1, 0));
3753     load_unsigned_short(cnt1, Address(str2, 0));
3754   } else {
3755     load_unsigned_byte(result, Address(str1, 0));
3756     load_unsigned_short(cnt1, Address(str2, 0));
3757   }
3758   subl(result, cnt1);
3759   jcc(Assembler::notZero,  POP_LABEL);
3760 
3761   if (ae == StrIntrinsicNode::UU) {
3762     // Divide length by 2 to get number of chars
3763     shrl(cnt2, 1);
3764   }
3765   cmpl(cnt2, 1);
3766   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3767 
3768   // Check if the strings start at the same location and setup scale and stride
3769   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3770     cmpptr(str1, str2);
3771     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3772     if (ae == StrIntrinsicNode::LL) {
3773       scale = Address::times_1;
3774       stride = 16;
3775     } else {
3776       scale = Address::times_2;
3777       stride = 8;
3778     }
3779   } else {
3780     scale1 = Address::times_1;
3781     scale2 = Address::times_2;
3782     // scale not used
3783     stride = 8;
3784   }
3785 
3786   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3787     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3788     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3789     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3790     Label COMPARE_TAIL_LONG;
3791     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3792 
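         // pcmpestri mode 0x19: equal-each aggregation (string compare), negative polarity,
         // unsigned shorts; bit 0 is cleared below for LL so unsigned bytes are compared.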
3793     int pcmpmask = 0x19;
3794     if (ae == StrIntrinsicNode::LL) {
3795       pcmpmask &= ~0x01;
3796     }
3797 
3798     // Set up to compare 16-char (32-byte) vectors,
3799     // starting from the first character again because it has an aligned address.
3800     if (ae == StrIntrinsicNode::LL) {
3801       stride2 = 32;
3802     } else {
3803       stride2 = 16;
3804     }
3805     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3806       adr_stride = stride << scale;
3807     } else {
3808       adr_stride1 = 8;  //stride << scale1;
3809       adr_stride2 = 16; //stride << scale2;
3810     }
3811 
3812     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3813     // rax and rdx are used by pcmpestri as elements counters
3814     movl(result, cnt2);
3815     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3816     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3817 
3818     // fast path : compare first 2 8-char vectors.
3819     bind(COMPARE_16_CHARS);
3820     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3821       movdqu(vec1, Address(str1, 0));
3822     } else {
3823       pmovzxbw(vec1, Address(str1, 0));
3824     }
3825     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3826     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3827 
3828     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3829       movdqu(vec1, Address(str1, adr_stride));
3830       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3831     } else {
3832       pmovzxbw(vec1, Address(str1, adr_stride1));
3833       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3834     }
3835     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3836     addl(cnt1, stride);
3837 
3838     // Compare the characters at index in cnt1
3839     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3840     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3841     subl(result, cnt2);
3842     jmp(POP_LABEL);
3843 
3844     // Setup the registers to start vector comparison loop
3845     bind(COMPARE_WIDE_VECTORS);
3846     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3847       lea(str1, Address(str1, result, scale));
3848       lea(str2, Address(str2, result, scale));
3849     } else {
3850       lea(str1, Address(str1, result, scale1));
3851       lea(str2, Address(str2, result, scale2));
3852     }
3853     subl(result, stride2);
3854     subl(cnt2, stride2);
3855     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3856     negptr(result);
3857 
3858     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3859     bind(COMPARE_WIDE_VECTORS_LOOP);
3860 
3861 #ifdef _LP64
3862     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3863       cmpl(cnt2, stride2x2);
3864       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3865       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3866       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3867 
3868       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3869       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3871         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3872       } else {
3873         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3874         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3875       }
3876       kortestql(mask, mask);
3877       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3878       addptr(result, stride2x2);  // update since we already compared at this addr
3879       subl(cnt2, stride2x2);      // and sub the size too
3880       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3881 
3882       vpxor(vec1, vec1);
3883       jmpb(COMPARE_WIDE_TAIL);
3884     }//if (VM_Version::supports_avx512vlbw())
3885 #endif // _LP64
3886 
3887 
3888     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3889     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3890       vmovdqu(vec1, Address(str1, result, scale));
3891       vpxor(vec1, Address(str2, result, scale));
3892     } else {
3893       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3894       vpxor(vec1, Address(str2, result, scale2));
3895     }
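         // vec1 now holds the XOR of the two chunks (str1 zero-extended for mixed encodings);
         // vptest sets ZF iff they are identical.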
3896     vptest(vec1, vec1);
3897     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3898     addptr(result, stride2);
3899     subl(cnt2, stride2);
3900     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3901     // clean upper bits of YMM registers
3902     vpxor(vec1, vec1);
3903 
3904     // compare wide vectors tail
3905     bind(COMPARE_WIDE_TAIL);
3906     testptr(result, result);
3907     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3908 
3909     movl(result, stride2);
3910     movl(cnt2, result);
3911     negptr(result);
3912     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3913 
3914     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3915     bind(VECTOR_NOT_EQUAL);
3916     // clean upper bits of YMM registers
3917     vpxor(vec1, vec1);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       lea(str1, Address(str1, result, scale));
3920       lea(str2, Address(str2, result, scale));
3921     } else {
3922       lea(str1, Address(str1, result, scale1));
3923       lea(str2, Address(str2, result, scale2));
3924     }
3925     jmp(COMPARE_16_CHARS);
3926 
3927     // Compare tail chars, length between 1 and 15 chars
3928     bind(COMPARE_TAIL_LONG);
3929     movl(cnt2, result);
3930     cmpl(cnt2, stride);
3931     jcc(Assembler::less, COMPARE_SMALL_STR);
3932 
3933     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3934       movdqu(vec1, Address(str1, 0));
3935     } else {
3936       pmovzxbw(vec1, Address(str1, 0));
3937     }
3938     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3939     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3940     subptr(cnt2, stride);
3941     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3942     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3943       lea(str1, Address(str1, result, scale));
3944       lea(str2, Address(str2, result, scale));
3945     } else {
3946       lea(str1, Address(str1, result, scale1));
3947       lea(str2, Address(str2, result, scale2));
3948     }
3949     negptr(cnt2);
3950     jmpb(WHILE_HEAD_LABEL);
3951 
3952     bind(COMPARE_SMALL_STR);
3953   } else if (UseSSE42Intrinsics) {
3954     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3955     int pcmpmask = 0x19;
3956     // Set up to compare 8-char (16-byte) vectors,
3957     // starting from the first character again because it has an aligned address.
3958     movl(result, cnt2);
3959     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3960     if (ae == StrIntrinsicNode::LL) {
3961       pcmpmask &= ~0x01;
3962     }
3963     jcc(Assembler::zero, COMPARE_TAIL);
3964     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3965       lea(str1, Address(str1, result, scale));
3966       lea(str2, Address(str2, result, scale));
3967     } else {
3968       lea(str1, Address(str1, result, scale1));
3969       lea(str2, Address(str2, result, scale2));
3970     }
3971     negptr(result);
3972 
3973     // pcmpestri
3974     //   inputs:
3975     //     vec1- substring
3976     //     rax - negative string length (elements count)
3977     //     mem - scanned string
3978     //     rdx - string length (elements count)
3979     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3980     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3981     //   outputs:
3982     //     rcx - first mismatched element index
3983     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3984 
3985     bind(COMPARE_WIDE_VECTORS);
3986     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3987       movdqu(vec1, Address(str1, result, scale));
3988       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3989     } else {
3990       pmovzxbw(vec1, Address(str1, result, scale1));
3991       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3992     }
3993     // After pcmpestri cnt1(rcx) contains mismatched element index
3994 
3995     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3996     addptr(result, stride);
3997     subptr(cnt2, stride);
3998     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3999 
4000     // compare wide vectors tail
4001     testptr(result, result);
4002     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4003 
4004     movl(cnt2, stride);
4005     movl(result, stride);
4006     negptr(result);
4007     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4008       movdqu(vec1, Address(str1, result, scale));
4009       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4010     } else {
4011       pmovzxbw(vec1, Address(str1, result, scale1));
4012       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4013     }
4014     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4015 
4016     // Mismatched characters in the vectors
4017     bind(VECTOR_NOT_EQUAL);
4018     addptr(cnt1, result);
4019     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4020     subl(result, cnt2);
4021     jmpb(POP_LABEL);
4022 
4023     bind(COMPARE_TAIL); // limit is zero
4024     movl(cnt2, result);
4025     // Fallthru to tail compare
4026   }
4027   // Shift str2 and str1 to the end of the arrays, negate min
4028   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4029     lea(str1, Address(str1, cnt2, scale));
4030     lea(str2, Address(str2, cnt2, scale));
4031   } else {
4032     lea(str1, Address(str1, cnt2, scale1));
4033     lea(str2, Address(str2, cnt2, scale2));
4034   }
4035   decrementl(cnt2);  // first character was compared already
4036   negptr(cnt2);
4037 
4038   // Compare the rest of the elements
4039   bind(WHILE_HEAD_LABEL);
4040   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4041   subl(result, cnt1);
4042   jccb(Assembler::notZero, POP_LABEL);
4043   increment(cnt2);
4044   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4045 
4046   // Strings are equal up to min length.  Return the length difference.
4047   bind(LENGTH_DIFF_LABEL);
4048   pop(result);
4049   if (ae == StrIntrinsicNode::UU) {
4050     // Divide diff by 2 to get number of chars
4051     sarl(result, 1);
4052   }
4053   jmpb(DONE_LABEL);
4054 
4055 #ifdef _LP64
4056   if (VM_Version::supports_avx512vlbw()) {
4057 
4058     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4059 
4060     kmovql(cnt1, mask);
4061     notq(cnt1);
4062     bsfq(cnt2, cnt1);
4063     if (ae != StrIntrinsicNode::LL) {
4064       // Divide diff by 2 to get number of chars
4065       sarl(cnt2, 1);
4066     }
4067     addq(result, cnt2);
4068     if (ae == StrIntrinsicNode::LL) {
4069       load_unsigned_byte(cnt1, Address(str2, result));
4070       load_unsigned_byte(result, Address(str1, result));
4071     } else if (ae == StrIntrinsicNode::UU) {
4072       load_unsigned_short(cnt1, Address(str2, result, scale));
4073       load_unsigned_short(result, Address(str1, result, scale));
4074     } else {
4075       load_unsigned_short(cnt1, Address(str2, result, scale2));
4076       load_unsigned_byte(result, Address(str1, result, scale1));
4077     }
4078     subl(result, cnt1);
4079     jmpb(POP_LABEL);
4080   }//if (VM_Version::supports_avx512vlbw())
4081 #endif // _LP64
4082 
4083   // Discard the stored length difference
4084   bind(POP_LABEL);
4085   pop(cnt1);
4086 
4087   // That's it
4088   bind(DONE_LABEL);
4089   if(ae == StrIntrinsicNode::UL) {
4090     negl(result);
4091   }
4092 
4093 }
4094 
4095 // Search for a non-ASCII character (negative byte value) in a byte array,
4096 // returning the index of the first such character, otherwise the length
4097 // of the array segment searched.
4098 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4099 //   @IntrinsicCandidate
4100 //   public static int countPositives(byte[] ba, int off, int len) {
4101 //     for (int i = off; i < off + len; i++) {
4102 //       if (ba[i] < 0) {
4103 //         return i - off;
4104 //       }
4105 //     }
4106 //     return len;
4107 //   }
4108 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4109   Register result, Register tmp1,
4110   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4111   // rsi: byte array
4112   // rcx: len
4113   // rax: result
4114   ShortBranchVerifier sbv(this);
4115   assert_different_registers(ary1, len, result, tmp1);
4116   assert_different_registers(vec1, vec2);
4117   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4118 
4119   movl(result, len); // copy
4120   // len == 0
4121   testl(len, len);
4122   jcc(Assembler::zero, DONE);
4123 
4124   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4125     VM_Version::supports_avx512vlbw() &&
4126     VM_Version::supports_bmi2()) {
4127 
4128     Label test_64_loop, test_tail, BREAK_LOOP;
4129     movl(tmp1, len);
4130     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4131 
4132     andl(tmp1, 0x0000003f); // tail count (in bytes) 0x3F
4133     andl(len,  0xffffffc0); // vector count (in bytes)
4134     jccb(Assembler::zero, test_tail);
4135 
4136     lea(ary1, Address(ary1, len, Address::times_1));
4137     negptr(len);
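    // Note on the loop shape below: ary1 now points just past the vectorized
    // region and len is negative, so Address(ary1, len, times_1) walks forward
    // through the array as len is incremented toward zero.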
4138 
4139     bind(test_64_loop);
4140     // Check whether any of the 64 bytes in this block are negative
4141     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
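    // (With vec2 all zeroes, evpcmpgtb sets a mask bit exactly where 0 > byte,
    // i.e. where the byte value is negative.)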
4142     kortestql(mask1, mask1);
4143     jcc(Assembler::notZero, BREAK_LOOP);
4144 
4145     addptr(len, 64);
4146     jccb(Assembler::notZero, test_64_loop);
4147 
4148     bind(test_tail);
4149     // bail out when there is nothing to be done
4150     testl(tmp1, -1);
4151     jcc(Assembler::zero, DONE);
4152 
4153 
4154     // check the tail for absence of negatives
4155     // ~(~0 << len) applied up to two times (for 32-bit scenario)
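    // For example (64-bit path), with tmp1 == 5: ~(~0 << 5) == 0x1F, a k-mask
    // that selects exactly the 5 tail bytes for the masked compare below.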
4156 #ifdef _LP64
4157     {
4158       Register tmp3_aliased = len;
4159       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4160       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4161       notq(tmp3_aliased);
4162       kmovql(mask2, tmp3_aliased);
4163     }
4164 #else
4165     Label k_init;
4166     jmp(k_init);
4167 
4168     // We cannot read 64 bits from a general purpose register on 32-bit, so we move
4169     // the data required to compose 64 ones into the instruction stream instead.
4170     // We emit a 64-byte-wide series of elements 0..63 which is later used as the
4171     // compare target against the tail count held in the tmp1 register.
4172     // The result is a k register with tmp1 consecutive 1 bits, counting from the
4173     // least significant bit.
4174     address tmp = pc();
4175     emit_int64(0x0706050403020100);
4176     emit_int64(0x0F0E0D0C0B0A0908);
4177     emit_int64(0x1716151413121110);
4178     emit_int64(0x1F1E1D1C1B1A1918);
4179     emit_int64(0x2726252423222120);
4180     emit_int64(0x2F2E2D2C2B2A2928);
4181     emit_int64(0x3736353433323130);
4182     emit_int64(0x3F3E3D3C3B3A3938);
4183 
4184     bind(k_init);
4185     lea(len, InternalAddress(tmp));
4186     // create mask to test for negative byte inside a vector
4187     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4188     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
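    // For example, with tmp1 == 5 the broadcast byte is 5 and the compare
    // 5 > {0, 1, 2, ..., 63} sets exactly the low 5 bits of mask2, mirroring
    // the ~(~0 << len) computation used on the 64-bit path.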
4189 
4190 #endif
4191     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4192     ktestq(mask1, mask2);
4193     jcc(Assembler::zero, DONE);
4194 
4195     // do a full check for negative bytes in the tail
4196     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4197                      // ary1 already pointing to the right place
4198     jmpb(TAIL_START);
4199 
4200     bind(BREAK_LOOP);
4201     // At least one byte in the last 64 byte block was negative.
4202     // Set up to look at the last 64 bytes as if they were a tail
4203     lea(ary1, Address(ary1, len, Address::times_1));
4204     addptr(result, len);
4205     // Ignore the very last byte: if all others are positive,
4206     // it must be negative, so we can skip right to the 2+1 byte
4207     // end comparison at this point
4208     orl(result, 63);
4209     movl(len, 63);
4210     // Fallthru to tail compare
4211   } else {
4212 
4213     if (UseAVX >= 2 && UseSSE >= 2) {
4214       // With AVX2, use 32-byte vector compare
4215       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4216 
4217       // Compare 32-byte vectors
4218       testl(len, 0xffffffe0);   // vector count (in bytes)
4219       jccb(Assembler::zero, TAIL_START);
4220 
4221       andl(len, 0xffffffe0);
4222       lea(ary1, Address(ary1, len, Address::times_1));
4223       negptr(len);
4224 
4225       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4226       movdl(vec2, tmp1);
4227       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
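      // vptest below computes (vec1 & vec2) and sets ZF when that is zero, so
      // the loop keeps going only while no byte in the chunk has its sign bit
      // (0x80) set, i.e. while every byte is non-negative.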
4228 
4229       bind(COMPARE_WIDE_VECTORS);
4230       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4231       vptest(vec1, vec2);
4232       jccb(Assembler::notZero, BREAK_LOOP);
4233       addptr(len, 32);
4234       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4235 
4236       testl(result, 0x0000001f);   // any bytes remaining?
4237       jcc(Assembler::zero, DONE);
4238 
4239       // Quick test using the already prepared vector mask
4240       movl(len, result);
4241       andl(len, 0x0000001f);
4242       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4243       vptest(vec1, vec2);
4244       jcc(Assembler::zero, DONE);
4245       // There are negative bytes, jump to the tail to determine exactly where
4246       jmpb(TAIL_START);
4247 
4248       bind(BREAK_LOOP);
4249       // At least one byte in the last 32-byte vector is negative.
4250       // Set up to look at the last 32 bytes as if they were a tail
4251       lea(ary1, Address(ary1, len, Address::times_1));
4252       addptr(result, len);
4253       // Ignore the very last byte: if all others are positive,
4254       // it must be negative, so we can skip right to the 2+1 byte
4255       // end comparison at this point
4256       orl(result, 31);
4257       movl(len, 31);
4258       // Fallthru to tail compare
4259     } else if (UseSSE42Intrinsics) {
4260       // With SSE4.2, use double quad vector compare
4261       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4262 
4263       // Compare 16-byte vectors
4264       testl(len, 0xfffffff0);   // vector count (in bytes)
4265       jcc(Assembler::zero, TAIL_START);
4266 
4267       andl(len, 0xfffffff0);
4268       lea(ary1, Address(ary1, len, Address::times_1));
4269       negptr(len);
4270 
4271       movl(tmp1, 0x80808080);
4272       movdl(vec2, tmp1);
4273       pshufd(vec2, vec2, 0);
4274 
4275       bind(COMPARE_WIDE_VECTORS);
4276       movdqu(vec1, Address(ary1, len, Address::times_1));
4277       ptest(vec1, vec2);
4278       jccb(Assembler::notZero, BREAK_LOOP);
4279       addptr(len, 16);
4280       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4281 
4282       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4283       jcc(Assembler::zero, DONE);
4284 
4285       // Quick test using the already prepared vector mask
4286       movl(len, result);
4287       andl(len, 0x0000000f);   // tail count (in bytes)
4288       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4289       ptest(vec1, vec2);
4290       jcc(Assembler::zero, DONE);
4291       jmpb(TAIL_START);
4292 
4293       bind(BREAK_LOOP);
4294       // At least one byte in the last 16-byte vector is negative.
4295       // Set up to look at the last 16 bytes as if they were a tail
4296       lea(ary1, Address(ary1, len, Address::times_1));
4297       addptr(result, len);
4298       // Ignore the very last byte: if all others are positive,
4299       // it must be negative, so we can skip right to the 2+1 byte
4300       // end comparison at this point
4301       orl(result, 15);
4302       movl(len, 15);
4303       // Fallthru to tail compare
4304     }
4305   }
4306 
4307   bind(TAIL_START);
4308   // Compare 4-byte vectors
4309   andl(len, 0xfffffffc); // vector count (in bytes)
4310   jccb(Assembler::zero, COMPARE_CHAR);
4311 
4312   lea(ary1, Address(ary1, len, Address::times_1));
4313   negptr(len);
4314 
4315   bind(COMPARE_VECTORS);
4316   movl(tmp1, Address(ary1, len, Address::times_1));
4317   andl(tmp1, 0x80808080);
4318   jccb(Assembler::notZero, TAIL_ADJUST);
4319   addptr(len, 4);
4320   jccb(Assembler::notZero, COMPARE_VECTORS);
4321 
4322   // Compare trailing char (final 2-3 bytes), if any
4323   bind(COMPARE_CHAR);
4324 
4325   testl(result, 0x2);   // tail  char
4326   jccb(Assembler::zero, COMPARE_BYTE);
4327   load_unsigned_short(tmp1, Address(ary1, 0));
4328   andl(tmp1, 0x00008080);
4329   jccb(Assembler::notZero, CHAR_ADJUST);
4330   lea(ary1, Address(ary1, 2));
4331 
4332   bind(COMPARE_BYTE);
4333   testl(result, 0x1);   // tail  byte
4334   jccb(Assembler::zero, DONE);
4335   load_unsigned_byte(tmp1, Address(ary1, 0));
4336   testl(tmp1, 0x00000080);
4337   jccb(Assembler::zero, DONE);
4338   subptr(result, 1);
4339   jmpb(DONE);
4340 
4341   bind(TAIL_ADJUST);
4342   // There is a negative byte in the last 4-byte block.
4343   // Adjust result and check the next three bytes
4344   addptr(result, len);
4345   orl(result, 3);
4346   lea(ary1, Address(ary1, len, Address::times_1));
4347   jmpb(COMPARE_CHAR);
4348 
4349   bind(CHAR_ADJUST);
4350   // We are looking at a char + optional byte tail, and found that one
4351   // of the bytes in the char is negative. Adjust the result, check the
4352   // first byte and readjust if needed.
4353   andl(result, 0xfffffffc);
4354   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4355   jccb(Assembler::notZero, DONE);
4356   addptr(result, 1);
4357 
4358   // That's it
4359   bind(DONE);
4360   if (UseAVX >= 2 && UseSSE >= 2) {
4361     // clean upper bits of YMM registers
4362     vpxor(vec1, vec1);
4363     vpxor(vec2, vec2);
4364   }
4365 }
4366 
4367 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4368 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4369                                       Register limit, Register result, Register chr,
4370                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4371                                       KRegister mask, bool expand_ary2) {
4372   // for expand_ary2, limit is the (smaller) size of the second array.
4373   ShortBranchVerifier sbv(this);
4374   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4375 
4376   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4377          "Expansion only implemented for AVX2");
4378 
4379   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4380   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4381 
4382   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4383   int scaleIncr = expand_ary2 ? 8 : 16;
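  // When expand_ary2 is true, ary2 holds bytes that are zero-extended to 16-bit
  // elements (vpmovzxbw) before being compared against ary1, so ary1 is indexed
  // with a times_2 scale and each loop step consumes half as many ary2 bytes.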
4384 
4385   if (is_array_equ) {
4386     // Check the input args
4387     cmpoop(ary1, ary2);
4388     jcc(Assembler::equal, TRUE_LABEL);
4389 
4390     // Need additional checks for arrays_equals.
4391     testptr(ary1, ary1);
4392     jcc(Assembler::zero, FALSE_LABEL);
4393     testptr(ary2, ary2);
4394     jcc(Assembler::zero, FALSE_LABEL);
4395 
4396     // Check the lengths
4397     movl(limit, Address(ary1, length_offset));
4398     cmpl(limit, Address(ary2, length_offset));
4399     jcc(Assembler::notEqual, FALSE_LABEL);
4400   }
4401 
4402   // count == 0
4403   testl(limit, limit);
4404   jcc(Assembler::zero, TRUE_LABEL);
4405 
4406   if (is_array_equ) {
4407     // Load array address
4408     lea(ary1, Address(ary1, base_offset));
4409     lea(ary2, Address(ary2, base_offset));
4410   }
4411 
4412   if (is_array_equ && is_char) {
4413     // arrays_equals when used for char[].
4414     shll(limit, 1);      // convert char count to byte count (still != 0)
4415   }
4416   movl(result, limit); // copy
4417 
4418   if (UseAVX >= 2) {
4419     // With AVX2, use 32-byte vector compare
4420     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4421 
4422     // Compare 32-byte vectors
4423     if (expand_ary2) {
4424       andl(result, 0x0000000f);  //   tail count (in bytes)
4425       andl(limit, 0xfffffff0);   // vector count (in bytes)
4426       jcc(Assembler::zero, COMPARE_TAIL);
4427     } else {
4428       andl(result, 0x0000001f);  //   tail count (in bytes)
4429       andl(limit, 0xffffffe0);   // vector count (in bytes)
4430       jcc(Assembler::zero, COMPARE_TAIL_16);
4431     }
4432 
4433     lea(ary1, Address(ary1, limit, scaleFactor));
4434     lea(ary2, Address(ary2, limit, Address::times_1));
4435     negptr(limit);
4436 
4437 #ifdef _LP64
4438     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
4439       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4440 
4441       cmpl(limit, -64);
4442       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4443 
4444       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4445 
4446       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4447       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4448       kortestql(mask, mask);
4449       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
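      // kortestql sets CF only when the OR of the masks is all ones, i.e. when
      // every byte pair compared equal; aboveEqual (CF == 0) therefore branches
      // out on the first 64-byte block containing any difference.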
4450       addptr(limit, 64);  // update since we already compared at this addr
4451       cmpl(limit, -64);
4452       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4453 
4454       // At this point we may still need to compare -limit+result bytes.
4455       // We could execute the next two instructions and just continue via the non-wide path:
4456       //  cmpl(limit, 0);
4457       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4458       // But since we stopped at the points ary{1,2}+limit which are
4459       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4460       // (|limit| <= 32 and result < 32),
4461       // we may just compare the last 64 bytes.
4462       //
4463       addptr(result, -64);   // it is safe, because we just came from this area
4464       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4465       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4466       kortestql(mask, mask);
4467       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4468 
4469       jmp(TRUE_LABEL);
4470 
4471       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4472 
4473     }//if (VM_Version::supports_avx512vlbw())
4474 #endif //_LP64
4475     bind(COMPARE_WIDE_VECTORS);
4476     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4477     if (expand_ary2) {
4478       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4479     } else {
4480       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4481     }
4482     vpxor(vec1, vec2);
4483 
4484     vptest(vec1, vec1);
4485     jcc(Assembler::notZero, FALSE_LABEL);
4486     addptr(limit, scaleIncr * 2);
4487     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4488 
4489     testl(result, result);
4490     jcc(Assembler::zero, TRUE_LABEL);
4491 
4492     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4493     if (expand_ary2) {
4494       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4495     } else {
4496       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4497     }
4498     vpxor(vec1, vec2);
4499 
4500     vptest(vec1, vec1);
4501     jcc(Assembler::notZero, FALSE_LABEL);
4502     jmp(TRUE_LABEL);
4503 
4504     bind(COMPARE_TAIL_16); // limit is zero
4505     movl(limit, result);
4506 
4507     // Compare 16-byte chunks
4508     andl(result, 0x0000000f);  //   tail count (in bytes)
4509     andl(limit, 0xfffffff0);   // vector count (in bytes)
4510     jcc(Assembler::zero, COMPARE_TAIL);
4511 
4512     lea(ary1, Address(ary1, limit, scaleFactor));
4513     lea(ary2, Address(ary2, limit, Address::times_1));
4514     negptr(limit);
4515 
4516     bind(COMPARE_WIDE_VECTORS_16);
4517     movdqu(vec1, Address(ary1, limit, scaleFactor));
4518     if (expand_ary2) {
4519       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4520     } else {
4521       movdqu(vec2, Address(ary2, limit, Address::times_1));
4522     }
4523     pxor(vec1, vec2);
4524 
4525     ptest(vec1, vec1);
4526     jcc(Assembler::notZero, FALSE_LABEL);
4527     addptr(limit, scaleIncr);
4528     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4529 
4530     bind(COMPARE_TAIL); // limit is zero
4531     movl(limit, result);
4532     // Fallthru to tail compare
4533   } else if (UseSSE42Intrinsics) {
4534     // With SSE4.2, use double quad vector compare
4535     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4536 
4537     // Compare 16-byte vectors
4538     andl(result, 0x0000000f);  //   tail count (in bytes)
4539     andl(limit, 0xfffffff0);   // vector count (in bytes)
4540     jcc(Assembler::zero, COMPARE_TAIL);
4541 
4542     lea(ary1, Address(ary1, limit, Address::times_1));
4543     lea(ary2, Address(ary2, limit, Address::times_1));
4544     negptr(limit);
4545 
4546     bind(COMPARE_WIDE_VECTORS);
4547     movdqu(vec1, Address(ary1, limit, Address::times_1));
4548     movdqu(vec2, Address(ary2, limit, Address::times_1));
4549     pxor(vec1, vec2);
4550 
4551     ptest(vec1, vec1);
4552     jcc(Assembler::notZero, FALSE_LABEL);
4553     addptr(limit, 16);
4554     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4555 
4556     testl(result, result);
4557     jcc(Assembler::zero, TRUE_LABEL);
4558 
4559     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4560     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4561     pxor(vec1, vec2);
4562 
4563     ptest(vec1, vec1);
4564     jccb(Assembler::notZero, FALSE_LABEL);
4565     jmpb(TRUE_LABEL);
4566 
4567     bind(COMPARE_TAIL); // limit is zero
4568     movl(limit, result);
4569     // Fallthru to tail compare
4570   }
4571 
4572   // Compare 4-byte vectors
4573   if (expand_ary2) {
4574     testl(result, result);
4575     jccb(Assembler::zero, TRUE_LABEL);
4576   } else {
4577     andl(limit, 0xfffffffc); // vector count (in bytes)
4578     jccb(Assembler::zero, COMPARE_CHAR);
4579   }
4580 
4581   lea(ary1, Address(ary1, limit, scaleFactor));
4582   lea(ary2, Address(ary2, limit, Address::times_1));
4583   negptr(limit);
4584 
4585   bind(COMPARE_VECTORS);
4586   if (expand_ary2) {
4587     // There is no vector compare between bytes and shorts, so compare element by element
4588     movzbl(chr, Address(ary2, limit, Address::times_1));
4589     cmpw(Address(ary1, limit, Address::times_2), chr);
4590     jccb(Assembler::notEqual, FALSE_LABEL);
4591     addptr(limit, 1);
4592     jcc(Assembler::notZero, COMPARE_VECTORS);
4593     jmp(TRUE_LABEL);
4594   } else {
4595     movl(chr, Address(ary1, limit, Address::times_1));
4596     cmpl(chr, Address(ary2, limit, Address::times_1));
4597     jccb(Assembler::notEqual, FALSE_LABEL);
4598     addptr(limit, 4);
4599     jcc(Assembler::notZero, COMPARE_VECTORS);
4600   }
4601 
4602   // Compare trailing char (final 2 bytes), if any
4603   bind(COMPARE_CHAR);
4604   testl(result, 0x2);   // tail  char
4605   jccb(Assembler::zero, COMPARE_BYTE);
4606   load_unsigned_short(chr, Address(ary1, 0));
4607   load_unsigned_short(limit, Address(ary2, 0));
4608   cmpl(chr, limit);
4609   jccb(Assembler::notEqual, FALSE_LABEL);
4610 
4611   if (is_array_equ && is_char) {
4612     bind(COMPARE_BYTE);
4613   } else {
4614     lea(ary1, Address(ary1, 2));
4615     lea(ary2, Address(ary2, 2));
4616 
4617     bind(COMPARE_BYTE);
4618     testl(result, 0x1);   // tail  byte
4619     jccb(Assembler::zero, TRUE_LABEL);
4620     load_unsigned_byte(chr, Address(ary1, 0));
4621     load_unsigned_byte(limit, Address(ary2, 0));
4622     cmpl(chr, limit);
4623     jccb(Assembler::notEqual, FALSE_LABEL);
4624   }
4625   bind(TRUE_LABEL);
4626   movl(result, 1);   // return true
4627   jmpb(DONE);
4628 
4629   bind(FALSE_LABEL);
4630   xorl(result, result); // return false
4631 
4632   // That's it
4633   bind(DONE);
4634   if (UseAVX >= 2) {
4635     // clean upper bits of YMM registers
4636     vpxor(vec1, vec1);
4637     vpxor(vec2, vec2);
4638   }
4639 }
4640 
4641 #ifdef _LP64
4642 
4643 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4644 #define __ masm.
4645   Register dst = stub.data<0>();
4646   XMMRegister src = stub.data<1>();
4647   address target = stub.data<2>();
4648   __ bind(stub.entry());
4649   __ subptr(rsp, 8);
4650   __ movdbl(Address(rsp, 0), src);
4651   __ call(RuntimeAddress(target));
4652   __ pop(dst);
4653   __ jmp(stub.continuation());
4654 #undef __
4655 }
4656 
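// Note for the conversion below: the truncating conversions (cvttss2si/cvttsd2si
// and their 64-bit forms) produce the "integer indefinite" value -- 0x80000000 or
// 0x8000000000000000 -- when the source is NaN or does not fit in the destination.
// Comparing the result against that sentinel is therefore sufficient to detect
// every input that needs the slow-path fixup stub, which recomputes the result
// with Java semantics (NaN -> 0, saturation to MIN/MAX); the one in-range value
// that also matches the sentinel (MIN_VALUE) is simply reproduced by the stub.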
4657 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4658   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4659   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4660 
4661   address slowpath_target;
4662   if (dst_bt == T_INT) {
4663     if (src_bt == T_FLOAT) {
4664       cvttss2sil(dst, src);
4665       cmpl(dst, 0x80000000);
4666       slowpath_target = StubRoutines::x86::f2i_fixup();
4667     } else {
4668       cvttsd2sil(dst, src);
4669       cmpl(dst, 0x80000000);
4670       slowpath_target = StubRoutines::x86::d2i_fixup();
4671     }
4672   } else {
4673     if (src_bt == T_FLOAT) {
4674       cvttss2siq(dst, src);
4675       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4676       slowpath_target = StubRoutines::x86::f2l_fixup();
4677     } else {
4678       cvttsd2siq(dst, src);
4679       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4680       slowpath_target = StubRoutines::x86::d2l_fixup();
4681     }
4682   }
4683 
4684   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4685   jcc(Assembler::equal, stub->entry());
4686   bind(stub->continuation());
4687 }
4688 
4689 #endif // _LP64
4690 
4691 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4692                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4693   switch(ideal_opc) {
4694     case Op_LShiftVS:
4695       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4696     case Op_LShiftVI:
4697       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4698     case Op_LShiftVL:
4699       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4700     case Op_RShiftVS:
4701       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4702     case Op_RShiftVI:
4703       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4704     case Op_RShiftVL:
4705       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4706     case Op_URShiftVS:
4707       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4708     case Op_URShiftVI:
4709       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4710     case Op_URShiftVL:
4711       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4712     case Op_RotateRightV:
4713       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4714     case Op_RotateLeftV:
4715       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4716     default:
4717       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4718       break;
4719   }
4720 }
4721 
4722 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4723                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4724   if (is_unsigned) {
4725     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4726   } else {
4727     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4728   }
4729 }
4730 
4731 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4732                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4733   switch (elem_bt) {
4734     case T_BYTE:
4735       if (ideal_opc == Op_SaturatingAddV) {
4736         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4737       } else {
4738         assert(ideal_opc == Op_SaturatingSubV, "");
4739         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4740       }
4741       break;
4742     case T_SHORT:
4743       if (ideal_opc == Op_SaturatingAddV) {
4744         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4745       } else {
4746         assert(ideal_opc == Op_SaturatingSubV, "");
4747         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4748       }
4749       break;
4750     default:
4751       fatal("Unsupported type %s", type2name(elem_bt));
4752       break;
4753   }
4754 }
4755 
4756 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4757                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4758   switch (elem_bt) {
4759     case T_BYTE:
4760       if (ideal_opc == Op_SaturatingAddV) {
4761         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4762       } else {
4763         assert(ideal_opc == Op_SaturatingSubV, "");
4764         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4765       }
4766       break;
4767     case T_SHORT:
4768       if (ideal_opc == Op_SaturatingAddV) {
4769         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4770       } else {
4771         assert(ideal_opc == Op_SaturatingSubV, "");
4772         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4773       }
4774       break;
4775     default:
4776       fatal("Unsupported type %s", type2name(elem_bt));
4777       break;
4778   }
4779 }
4780 
4781 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4782                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4783   if (is_unsigned) {
4784     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4785   } else {
4786     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4787   }
4788 }
4789 
4790 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4791                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4792   switch (elem_bt) {
4793     case T_BYTE:
4794       if (ideal_opc == Op_SaturatingAddV) {
4795         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4796       } else {
4797         assert(ideal_opc == Op_SaturatingSubV, "");
4798         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4799       }
4800       break;
4801     case T_SHORT:
4802       if (ideal_opc == Op_SaturatingAddV) {
4803         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4804       } else {
4805         assert(ideal_opc == Op_SaturatingSubV, "");
4806         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4807       }
4808       break;
4809     default:
4810       fatal("Unsupported type %s", type2name(elem_bt));
4811       break;
4812   }
4813 }
4814 
4815 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4816                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4817   switch (elem_bt) {
4818     case T_BYTE:
4819       if (ideal_opc == Op_SaturatingAddV) {
4820         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4821       } else {
4822         assert(ideal_opc == Op_SaturatingSubV, "");
4823         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4824       }
4825       break;
4826     case T_SHORT:
4827       if (ideal_opc == Op_SaturatingAddV) {
4828         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4829       } else {
4830         assert(ideal_opc == Op_SaturatingSubV, "");
4831         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4832       }
4833       break;
4834     default:
4835       fatal("Unsupported type %s", type2name(elem_bt));
4836       break;
4837   }
4838 }
4839 
4840 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4841                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4842                                     bool is_varshift) {
4843   switch (ideal_opc) {
4844     case Op_AddVB:
4845       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_AddVS:
4847       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_AddVI:
4849       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_AddVL:
4851       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_AddVF:
4853       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_AddVD:
4855       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_SubVB:
4857       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_SubVS:
4859       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_SubVI:
4861       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_SubVL:
4863       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_SubVF:
4865       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_SubVD:
4867       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_MulVS:
4869       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_MulVI:
4871       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_MulVL:
4873       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_MulVF:
4875       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_MulVD:
4877       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_DivVF:
4879       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_DivVD:
4881       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_SqrtVF:
4883       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_SqrtVD:
4885       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_AbsVB:
4887       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4888     case Op_AbsVS:
4889       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4890     case Op_AbsVI:
4891       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4892     case Op_AbsVL:
4893       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4894     case Op_FmaVF:
4895       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4896     case Op_FmaVD:
4897       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4898     case Op_VectorRearrange:
4899       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4900     case Op_LShiftVS:
4901       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4902     case Op_LShiftVI:
4903       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4904     case Op_LShiftVL:
4905       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4906     case Op_RShiftVS:
4907       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4908     case Op_RShiftVI:
4909       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4910     case Op_RShiftVL:
4911       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4912     case Op_URShiftVS:
4913       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4914     case Op_URShiftVI:
4915       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4916     case Op_URShiftVL:
4917       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4918     case Op_RotateLeftV:
4919       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4920     case Op_RotateRightV:
4921       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4922     case Op_MaxV:
4923       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4924     case Op_MinV:
4925       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4926     case Op_UMinV:
4927       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4928     case Op_UMaxV:
4929       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4930     case Op_XorV:
4931       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4932     case Op_OrV:
4933       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4934     case Op_AndV:
4935       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4936     default:
4937       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4938       break;
4939   }
4940 }
4941 
4942 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4943                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4944   switch (ideal_opc) {
4945     case Op_AddVB:
4946       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4947     case Op_AddVS:
4948       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4949     case Op_AddVI:
4950       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4951     case Op_AddVL:
4952       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4953     case Op_AddVF:
4954       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4955     case Op_AddVD:
4956       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4957     case Op_SubVB:
4958       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4959     case Op_SubVS:
4960       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4961     case Op_SubVI:
4962       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4963     case Op_SubVL:
4964       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4965     case Op_SubVF:
4966       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4967     case Op_SubVD:
4968       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4969     case Op_MulVS:
4970       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4971     case Op_MulVI:
4972       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4973     case Op_MulVL:
4974       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4975     case Op_MulVF:
4976       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4977     case Op_MulVD:
4978       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4979     case Op_DivVF:
4980       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4981     case Op_DivVD:
4982       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4983     case Op_FmaVF:
4984       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4985     case Op_FmaVD:
4986       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4987     case Op_MaxV:
4988       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4989     case Op_MinV:
4990       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4991     case Op_UMaxV:
4992       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4993     case Op_UMinV:
4994       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4995     case Op_XorV:
4996       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4997     case Op_OrV:
4998       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4999     case Op_AndV:
5000       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5001     default:
5002       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5003       break;
5004   }
5005 }
5006 
5007 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5008                                   KRegister src1, KRegister src2) {
5009   BasicType etype = T_ILLEGAL;
5010   switch(mask_len) {
5011     case 2:
5012     case 4:
5013     case 8:  etype = T_BYTE; break;
5014     case 16: etype = T_SHORT; break;
5015     case 32: etype = T_INT; break;
5016     case 64: etype = T_LONG; break;
5017     default: fatal("Unsupported type"); break;
5018   }
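  // The mask length only determines the width of the k-register instructions
  // used below; e.g. mask_len == 16 maps to T_SHORT, which selects the 16-bit
  // forms (kandw and friends) in the kand/kor/kxor dispatch helpers.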
5019   assert(etype != T_ILLEGAL, "");
5020   switch(ideal_opc) {
5021     case Op_AndVMask:
5022       kand(etype, dst, src1, src2); break;
5023     case Op_OrVMask:
5024       kor(etype, dst, src1, src2); break;
5025     case Op_XorVMask:
5026       kxor(etype, dst, src1, src2); break;
5027     default:
5028       fatal("Unsupported masked operation"); break;
5029   }
5030 }
5031 
5032 /*
5033  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5034  * If src is NaN, the result is 0.
5035  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5036  * the result is equal to the value of Integer.MIN_VALUE.
5037  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5038  * the result is equal to the value of Integer.MAX_VALUE.
5039  */
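// Illustrative example: vcvttps2dq (performed by the caller) writes 0x80000000,
// the integer indefinite value, for a NaN lane as well as for any lane whose value
// is out of int range, so both Float.NaN and 2.5e9f initially produce 0x80000000.
// The routine below then rewrites those lanes to 0 and Integer.MAX_VALUE
// respectively, matching Java's (int) cast semantics.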
5040 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5041                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5042                                                                    Register rscratch, AddressLiteral float_sign_flip,
5043                                                                    int vec_enc) {
5044   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5045   Label done;
5046   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5047   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5048   vptest(xtmp2, xtmp2, vec_enc);
5049   jccb(Assembler::equal, done);
5050 
5051   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5052   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5053 
5054   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5055   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5056   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5057 
5058   // Recompute the mask for the remaining special values.
5059   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5060   // Extract SRC values corresponding to TRUE mask lanes.
5061   vpand(xtmp4, xtmp2, src, vec_enc);
5062   // Flip mask bits so that the MSB of MASK lanes corresponding to +ve special
5063   // values is set.
5064   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5065 
5066   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5067   bind(done);
5068 }
5069 
5070 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5071                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5072                                                                     Register rscratch, AddressLiteral float_sign_flip,
5073                                                                     int vec_enc) {
5074   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5075   Label done;
5076   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5077   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5078   kortestwl(ktmp1, ktmp1);
5079   jccb(Assembler::equal, done);
5080 
5081   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5082   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5083   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5084 
5085   kxorwl(ktmp1, ktmp1, ktmp2);
5086   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
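  // vpternlog with imm8 0x11 implements ~B & ~C; with both sources equal to the
  // sign-flip vector this leaves xtmp2 holding ~0x80000000 == 0x7FFFFFFF
  // (Integer.MAX_VALUE) in every lane, which is then merged into the +ve lanes.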
5087   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5088   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5089   bind(done);
5090 }
5091 
5092 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5093                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5094                                                                      Register rscratch, AddressLiteral double_sign_flip,
5095                                                                      int vec_enc) {
5096   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5097 
5098   Label done;
5099   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5100   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5101   kortestwl(ktmp1, ktmp1);
5102   jccb(Assembler::equal, done);
5103 
5104   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5105   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5106   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5107 
5108   kxorwl(ktmp1, ktmp1, ktmp2);
5109   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5110   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5111   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5112   bind(done);
5113 }
5114 
5115 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5116                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5117                                                                      Register rscratch, AddressLiteral float_sign_flip,
5118                                                                      int vec_enc) {
5119   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5120   Label done;
5121   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5122   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5123   kortestwl(ktmp1, ktmp1);
5124   jccb(Assembler::equal, done);
5125 
5126   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5127   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5128   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5129 
5130   kxorwl(ktmp1, ktmp1, ktmp2);
5131   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5132   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5133   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5134   bind(done);
5135 }
5136 
5137 /*
5138  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5139  * If src is NaN, the result is 0.
5140  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5141  * the result is equal to the value of Long.MIN_VALUE.
5142  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5143  * the result is equal to the value of Long.MAX_VALUE.
5144  */
5145 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5146                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5147                                                                       Register rscratch, AddressLiteral double_sign_flip,
5148                                                                       int vec_enc) {
5149   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5150 
5151   Label done;
5152   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5153   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5154   kortestwl(ktmp1, ktmp1);
5155   jccb(Assembler::equal, done);
5156 
5157   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5158   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5159   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5160 
5161   kxorwl(ktmp1, ktmp1, ktmp2);
5162   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5163   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5164   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5165   bind(done);
5166 }
5167 
5168 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5169                                                              XMMRegister xtmp, int index, int vec_enc) {
5170    assert(vec_enc < Assembler::AVX_512bit, "");
5171    if (vec_enc == Assembler::AVX_256bit) {
5172      vextractf128_high(xtmp, src);
5173      vshufps(dst, src, xtmp, index, vec_enc);
5174    } else {
5175      vshufps(dst, src, zero, index, vec_enc);
5176    }
5177 }
5178 
5179 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5180                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5181                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5182   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5183 
5184   Label done;
5185   // Compare the destination lanes with float_sign_flip
5186   // value to get mask for all special values.
5187   movdqu(xtmp1, float_sign_flip, rscratch);
5188   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5189   ptest(xtmp2, xtmp2);
5190   jccb(Assembler::equal, done);
5191 
5192   // Flip float_sign_flip to get max integer value.
5193   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5194   pxor(xtmp1, xtmp4);
5195 
5196   // Set destination lanes corresponding to unordered source lanes to zero.
5197   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5198   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5199 
5200   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5201   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5202   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5203 
5204   // Recompute the mask for the remaining special values.
5205   pxor(xtmp2, xtmp3);
5206   // Extract mask corresponding to non-negative source lanes.
5207   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5208 
5209   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5210   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5211   pand(xtmp3, xtmp2);
5212 
5213   // Replace destination lanes holding the special value (0x80000000) with max int
5214   // if the corresponding source lane holds a +ve value.
5215   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5216   bind(done);
5217 }
5218 
5219 
5220 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5221                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5222   switch(to_elem_bt) {
5223     case T_SHORT:
5224       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5225       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5226       vpackusdw(dst, dst, zero, vec_enc);
5227       if (vec_enc == Assembler::AVX_256bit) {
5228         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5229       }
5230       break;
5231     case  T_BYTE:
5232       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5233       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5234       vpackusdw(dst, dst, zero, vec_enc);
5235       if (vec_enc == Assembler::AVX_256bit) {
5236         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5237       }
5238       vpackuswb(dst, dst, zero, vec_enc);
5239       break;
5240     default: assert(false, "%s", type2name(to_elem_bt));
5241   }
5242 }
5243 
5244 /*
5245  * Algorithm for vector D2L and F2I conversions:
5246  * a) Perform the vector D2L/F2I cast.
5247  * b) Take the fast path if none of the result vector lanes contains the value 0x80000000,
5248  *    which signifies that the source value could be one of the special floating point
5249  *    values (NaN, -Inf, Inf, Max, -Min).
5250  * c) Set the destination to zero if the source is NaN.
5251  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5252  */
5253 
5254 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5255                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5256                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5257   int to_elem_sz = type2aelembytes(to_elem_bt);
5258   assert(to_elem_sz <= 4, "");
5259   vcvttps2dq(dst, src, vec_enc);
5260   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5261   if (to_elem_sz < 4) {
5262     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5263     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5264   }
5265 }
5266 
5267 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5268                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5269                                             Register rscratch, int vec_enc) {
5270   int to_elem_sz = type2aelembytes(to_elem_bt);
5271   assert(to_elem_sz <= 4, "");
5272   vcvttps2dq(dst, src, vec_enc);
5273   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5274   switch(to_elem_bt) {
5275     case T_INT:
5276       break;
5277     case T_SHORT:
5278       evpmovdw(dst, dst, vec_enc);
5279       break;
5280     case T_BYTE:
5281       evpmovdb(dst, dst, vec_enc);
5282       break;
5283     default: assert(false, "%s", type2name(to_elem_bt));
5284   }
5285 }
5286 
5287 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5288                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5289                                             Register rscratch, int vec_enc) {
5290   evcvttps2qq(dst, src, vec_enc);
5291   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5292 }
5293 
5294 // Handling for downcasting from double to integer or sub-word types on AVX2.
5295 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5296                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5297                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5298   int to_elem_sz = type2aelembytes(to_elem_bt);
5299   assert(to_elem_sz < 8, "");
5300   vcvttpd2dq(dst, src, vec_enc);
5301   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5302                                               float_sign_flip, vec_enc);
5303   if (to_elem_sz < 4) {
5304     // xtmp4 holds all zero lanes.
5305     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5306   }
5307 }
5308 
5309 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5310                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5311                                             KRegister ktmp2, AddressLiteral sign_flip,
5312                                             Register rscratch, int vec_enc) {
5313   if (VM_Version::supports_avx512dq()) {
5314     evcvttpd2qq(dst, src, vec_enc);
5315     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5316     switch(to_elem_bt) {
5317       case T_LONG:
5318         break;
5319       case T_INT:
5320         evpmovsqd(dst, dst, vec_enc);
5321         break;
5322       case T_SHORT:
5323         evpmovsqd(dst, dst, vec_enc);
5324         evpmovdw(dst, dst, vec_enc);
5325         break;
5326       case T_BYTE:
5327         evpmovsqd(dst, dst, vec_enc);
5328         evpmovdb(dst, dst, vec_enc);
5329         break;
5330       default: assert(false, "%s", type2name(to_elem_bt));
5331     }
5332   } else {
5333     assert(type2aelembytes(to_elem_bt) <= 4, "");
5334     vcvttpd2dq(dst, src, vec_enc);
5335     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5336     switch(to_elem_bt) {
5337       case T_INT:
5338         break;
5339       case T_SHORT:
5340         evpmovdw(dst, dst, vec_enc);
5341         break;
5342       case T_BYTE:
5343         evpmovdb(dst, dst, vec_enc);
5344         break;
5345       default: assert(false, "%s", type2name(to_elem_bt));
5346     }
5347   }
5348 }
5349 
5350 #ifdef _LP64
5351 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5352                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5353                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5354   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5355   // and restore the original MXCSR.RC mode afterwards.
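  // (Java's Math.round(x) is specified as floor(x + 0.5), so adding 0.5 and then
  // converting while the rounding mode is round toward -inf performs exactly that
  // floor as part of the conversion.)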
5356   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5357 
5358   mov64(tmp, julong_cast(0.5L));
5359   evpbroadcastq(xtmp1, tmp, vec_enc);
5360   vaddpd(xtmp1, src , xtmp1, vec_enc);
5361   evcvtpd2qq(dst, xtmp1, vec_enc);
5362   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5363                                                 double_sign_flip, vec_enc);
5364 
5365   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5366 }
5367 
5368 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5369                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5370                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5371   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5372   // and restore the original MXCSR.RC mode afterwards.
5373   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5374 
5375   movl(tmp, jint_cast(0.5));
5376   movq(xtmp1, tmp);
5377   vbroadcastss(xtmp1, xtmp1, vec_enc);
5378   vaddps(xtmp1, src , xtmp1, vec_enc);
5379   vcvtps2dq(dst, xtmp1, vec_enc);
5380   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5381                                               float_sign_flip, vec_enc);
5382 
5383   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5384 }
5385 
5386 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5387                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5388                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5389   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5390   // and restore the original MXCSR.RC mode afterwards.
5391   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5392 
5393   movl(tmp, jint_cast(0.5));
5394   movq(xtmp1, tmp);
5395   vbroadcastss(xtmp1, xtmp1, vec_enc);
5396   vaddps(xtmp1, src , xtmp1, vec_enc);
5397   vcvtps2dq(dst, xtmp1, vec_enc);
5398   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5399 
5400   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5401 }
5402 #endif // _LP64
5403 
5404 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5405                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5406   switch (from_elem_bt) {
5407     case T_BYTE:
5408       switch (to_elem_bt) {
5409         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5410         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5411         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5412         default: ShouldNotReachHere();
5413       }
5414       break;
5415     case T_SHORT:
5416       switch (to_elem_bt) {
5417         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5418         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5419         default: ShouldNotReachHere();
5420       }
5421       break;
5422     case T_INT:
5423       assert(to_elem_bt == T_LONG, "");
5424       vpmovzxdq(dst, src, vlen_enc);
5425       break;
5426     default:
5427       ShouldNotReachHere();
5428   }
5429 }
5430 
5431 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5432                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5433   switch (from_elem_bt) {
5434     case T_BYTE:
5435       switch (to_elem_bt) {
5436         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5437         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5438         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5439         default: ShouldNotReachHere();
5440       }
5441       break;
5442     case T_SHORT:
5443       switch (to_elem_bt) {
5444         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5445         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5446         default: ShouldNotReachHere();
5447       }
5448       break;
5449     case T_INT:
5450       assert(to_elem_bt == T_LONG, "");
5451       vpmovsxdq(dst, src, vlen_enc);
5452       break;
5453     default:
5454       ShouldNotReachHere();
5455   }
5456 }
5457 
5458 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5459                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5460   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5461   assert(vlen_enc != AVX_512bit, "");
5462 
5463   int dst_bt_size = type2aelembytes(dst_bt);
5464   int src_bt_size = type2aelembytes(src_bt);
5465   if (dst_bt_size > src_bt_size) {
5466     switch (dst_bt_size / src_bt_size) {
5467       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5468       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5469       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5470       default: ShouldNotReachHere();
5471     }
5472   } else {
5473     assert(dst_bt_size < src_bt_size, "");
5474     switch (src_bt_size / dst_bt_size) {
5475       case 2: {
5476         if (vlen_enc == AVX_128bit) {
5477           vpacksswb(dst, src, src, vlen_enc);
5478         } else {
5479           vpacksswb(dst, src, src, vlen_enc);
5480           vpermq(dst, dst, 0x08, vlen_enc);
5481         }
5482         break;
5483       }
5484       case 4: {
5485         if (vlen_enc == AVX_128bit) {
5486           vpackssdw(dst, src, src, vlen_enc);
5487           vpacksswb(dst, dst, dst, vlen_enc);
5488         } else {
5489           vpackssdw(dst, src, src, vlen_enc);
5490           vpermq(dst, dst, 0x08, vlen_enc);
5491           vpacksswb(dst, dst, dst, AVX_128bit);
5492         }
5493         break;
5494       }
5495       case 8: {
5496         if (vlen_enc == AVX_128bit) {
5497           vpshufd(dst, src, 0x08, vlen_enc);
5498           vpackssdw(dst, dst, dst, vlen_enc);
5499           vpacksswb(dst, dst, dst, vlen_enc);
5500         } else {
5501           vpshufd(dst, src, 0x08, vlen_enc);
5502           vpermq(dst, dst, 0x08, vlen_enc);
5503           vpackssdw(dst, dst, dst, AVX_128bit);
5504           vpacksswb(dst, dst, dst, AVX_128bit);
5505         }
5506         break;
5507       }
5508       default: ShouldNotReachHere();
5509     }
5510   }
5511 }
5512 
5513 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5514                                    bool merge, BasicType bt, int vlen_enc) {
5515   if (bt == T_INT) {
5516     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5517   } else {
5518     assert(bt == T_LONG, "");
5519     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5520   }
5521 }
5522 
5523 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5524                                    bool merge, BasicType bt, int vlen_enc) {
5525   if (bt == T_INT) {
5526     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5527   } else {
5528     assert(bt == T_LONG, "");
5529     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5530   }
5531 }
5532 
5533 #ifdef _LP64
5534 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5535                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5536                                                int vec_enc) {
5537   int index = 0;
5538   int vindex = 0;
5539   mov64(rtmp1, 0x0101010101010101L);
5540   pdepq(rtmp1, src, rtmp1);
5541   if (mask_len > 8) {
5542     movq(rtmp2, src);
5543     vpxor(xtmp, xtmp, xtmp, vec_enc);
5544     movq(xtmp, rtmp1);
5545   }
5546   movq(dst, rtmp1);
5547 
5548   mask_len -= 8;
5549   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
5551     index++;
5552     if ((index % 2) == 0) {
5553       pxor(xtmp, xtmp);
5554     }
5555     mov64(rtmp1, 0x0101010101010101L);
5556     shrq(rtmp2, 8);
5557     pdepq(rtmp1, rtmp2, rtmp1);
5558     pinsrq(xtmp, rtmp1, index % 2);
5559     vindex = index / 2;
5560     if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5563       if (index % 2) {
5564         vinsertf128(dst, dst, xtmp, vindex);
5565       }
5566     } else {
5567       vmovdqu(dst, xtmp);
5568     }
5569     mask_len -= 8;
5570   }
5571 }
5572 
5573 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5574   switch(opc) {
5575     case Op_VectorMaskTrueCount:
5576       popcntq(dst, tmp);
5577       break;
5578     case Op_VectorMaskLastTrue:
5579       if (VM_Version::supports_lzcnt()) {
5580         lzcntq(tmp, tmp);
5581         movl(dst, 63);
5582         subl(dst, tmp);
5583       } else {
5584         movl(dst, -1);
5585         bsrq(tmp, tmp);
5586         cmov32(Assembler::notZero, dst, tmp);
5587       }
5588       break;
5589     case Op_VectorMaskFirstTrue:
5590       if (VM_Version::supports_bmi1()) {
5591         if (masklen < 32) {
5592           orl(tmp, 1 << masklen);
5593           tzcntl(dst, tmp);
5594         } else if (masklen == 32) {
5595           tzcntl(dst, tmp);
5596         } else {
5597           assert(masklen == 64, "");
5598           tzcntq(dst, tmp);
5599         }
5600       } else {
5601         if (masklen < 32) {
5602           orl(tmp, 1 << masklen);
5603           bsfl(dst, tmp);
5604         } else {
5605           assert(masklen == 32 || masklen == 64, "");
5606           movl(dst, masklen);
5607           if (masklen == 32)  {
5608             bsfl(tmp, tmp);
5609           } else {
5610             bsfq(tmp, tmp);
5611           }
5612           cmov32(Assembler::notZero, dst, tmp);
5613         }
5614       }
5615       break;
5616     case Op_VectorMaskToLong:
5617       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5618       break;
5619     default: assert(false, "Unhandled mask operation");
5620   }
5621 }
5622 
5623 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5624                                               int masklen, int masksize, int vec_enc) {
5625   assert(VM_Version::supports_popcnt(), "");
5626 
  if (VM_Version::supports_avx512bw()) {
5628     kmovql(tmp, mask);
5629   } else {
5630     assert(masklen <= 16, "");
5631     kmovwl(tmp, mask);
5632   }
5633 
  // A mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
5636   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5637     andq(tmp, (1 << masklen) - 1);
5638   }
5639 
5640   vector_mask_operation_helper(opc, dst, tmp, masklen);
5641 }
5642 
5643 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5644                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5645   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5646          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5647   assert(VM_Version::supports_popcnt(), "");
5648 
5649   bool need_clip = false;
5650   switch(bt) {
5651     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1.
5653       vpxor(xtmp, xtmp, xtmp, vec_enc);
5654       vpsubb(xtmp, xtmp, mask, vec_enc);
5655       vpmovmskb(tmp, xtmp, vec_enc);
5656       need_clip = masklen < 16;
5657       break;
5658     case T_BYTE:
5659       vpmovmskb(tmp, mask, vec_enc);
5660       need_clip = masklen < 16;
5661       break;
5662     case T_SHORT:
5663       vpacksswb(xtmp, mask, mask, vec_enc);
5664       if (masklen >= 16) {
5665         vpermpd(xtmp, xtmp, 8, vec_enc);
5666       }
5667       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5668       need_clip = masklen < 16;
5669       break;
5670     case T_INT:
5671     case T_FLOAT:
5672       vmovmskps(tmp, mask, vec_enc);
5673       need_clip = masklen < 4;
5674       break;
5675     case T_LONG:
5676     case T_DOUBLE:
5677       vmovmskpd(tmp, mask, vec_enc);
5678       need_clip = masklen < 2;
5679       break;
5680     default: assert(false, "Unhandled type, %s", type2name(bt));
5681   }
5682 
  // A mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
5685   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5686     // need_clip implies masklen < 32
5687     andq(tmp, (1 << masklen) - 1);
5688   }
5689 
5690   vector_mask_operation_helper(opc, dst, tmp, masklen);
5691 }
5692 
5693 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5694                                              Register rtmp2, int mask_len) {
5695   kmov(rtmp1, src);
5696   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5697   mov64(rtmp2, -1L);
5698   pextq(rtmp2, rtmp2, rtmp1);
5699   kmov(dst, rtmp2);
5700 }
5701 
5702 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5703                                                     XMMRegister mask, Register rtmp, Register rscratch,
5704                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5705                                                     int vec_enc) {
5706   assert(type2aelembytes(bt) >= 4, "");
5707   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5708   address compress_perm_table = nullptr;
5709   address expand_perm_table = nullptr;
5710   if (type2aelembytes(bt) == 8) {
5711     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5712     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5713     vmovmskpd(rtmp, mask, vec_enc);
5714   } else {
5715     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5716     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5717     vmovmskps(rtmp, mask, vec_enc);
5718   }
5719   shlq(rtmp, 5); // for 32 byte permute row.
5720   if (opcode == Op_CompressV) {
5721     lea(rscratch, ExternalAddress(compress_perm_table));
5722   } else {
5723     lea(rscratch, ExternalAddress(expand_perm_table));
5724   }
5725   addptr(rtmp, rscratch);
5726   vmovdqu(permv, Address(rtmp));
5727   vpermps(dst, permv, src, Assembler::AVX_256bit);
5728   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also be used as a blending mask after
  // compressing/expanding the source vector lanes.
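  //
  // Assumed illustration of the table layout (8 dword entries per 32 byte row for a
  // 256-bit vector of 32-bit lanes; the stub-generated table is authoritative):
  // for the mask 0b00100101 a compress row would look like [0, 2, 5, -1, -1, -1, -1, -1].
  // vpermps then gathers lanes 0, 2 and 5 into the low positions, and the -1 entries
  // (sign bit set) cause the blend below to zero the remaining lanes.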
5733   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5734 }
5735 
5736 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5737                                                bool merge, BasicType bt, int vec_enc) {
5738   if (opcode == Op_CompressV) {
5739     switch(bt) {
5740     case T_BYTE:
5741       evpcompressb(dst, mask, src, merge, vec_enc);
5742       break;
5743     case T_CHAR:
5744     case T_SHORT:
5745       evpcompressw(dst, mask, src, merge, vec_enc);
5746       break;
5747     case T_INT:
5748       evpcompressd(dst, mask, src, merge, vec_enc);
5749       break;
5750     case T_FLOAT:
5751       evcompressps(dst, mask, src, merge, vec_enc);
5752       break;
5753     case T_LONG:
5754       evpcompressq(dst, mask, src, merge, vec_enc);
5755       break;
5756     case T_DOUBLE:
5757       evcompresspd(dst, mask, src, merge, vec_enc);
5758       break;
5759     default:
5760       fatal("Unsupported type %s", type2name(bt));
5761       break;
5762     }
5763   } else {
5764     assert(opcode == Op_ExpandV, "");
5765     switch(bt) {
5766     case T_BYTE:
5767       evpexpandb(dst, mask, src, merge, vec_enc);
5768       break;
5769     case T_CHAR:
5770     case T_SHORT:
5771       evpexpandw(dst, mask, src, merge, vec_enc);
5772       break;
5773     case T_INT:
5774       evpexpandd(dst, mask, src, merge, vec_enc);
5775       break;
5776     case T_FLOAT:
5777       evexpandps(dst, mask, src, merge, vec_enc);
5778       break;
5779     case T_LONG:
5780       evpexpandq(dst, mask, src, merge, vec_enc);
5781       break;
5782     case T_DOUBLE:
5783       evexpandpd(dst, mask, src, merge, vec_enc);
5784       break;
5785     default:
5786       fatal("Unsupported type %s", type2name(bt));
5787       break;
5788     }
5789   }
5790 }
5791 #endif
5792 
5793 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5794                                            KRegister ktmp1, int vec_enc) {
5795   if (opcode == Op_SignumVD) {
5796     vsubpd(dst, zero, one, vec_enc);
5797     // if src < 0 ? -1 : 1
5798     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5799     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5800     // if src == NaN, -0.0 or 0.0 return src.
5801     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5802     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5803   } else {
5804     assert(opcode == Op_SignumVF, "");
5805     vsubps(dst, zero, one, vec_enc);
5806     // if src < 0 ? -1 : 1
5807     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5808     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5809     // if src == NaN, -0.0 or 0.0 return src.
5810     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5811     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5812   }
5813 }
5814 
5815 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5816                                           XMMRegister xtmp1, int vec_enc) {
5817   if (opcode == Op_SignumVD) {
5818     vsubpd(dst, zero, one, vec_enc);
5819     // if src < 0 ? -1 : 1
5820     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5821     // if src == NaN, -0.0 or 0.0 return src.
5822     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5823     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5824   } else {
5825     assert(opcode == Op_SignumVF, "");
5826     vsubps(dst, zero, one, vec_enc);
5827     // if src < 0 ? -1 : 1
5828     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5829     // if src == NaN, -0.0 or 0.0 return src.
5830     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5831     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5832   }
5833 }
5834 
5835 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5836   if (VM_Version::supports_avx512bw()) {
5837     if (mask_len > 32) {
5838       kmovql(dst, src);
5839     } else {
5840       kmovdl(dst, src);
5841       if (mask_len != 32) {
5842         kshiftrdl(dst, dst, 32 - mask_len);
5843       }
5844     }
5845   } else {
5846     assert(mask_len <= 16, "");
5847     kmovwl(dst, src);
5848     if (mask_len != 16) {
5849       kshiftrwl(dst, dst, 16 - mask_len);
5850     }
5851   }
5852 }
5853 
5854 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5855   int lane_size = type2aelembytes(bt);
5856   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5857   if ((is_LP64 || lane_size < 8) &&
5858       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5859        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5860     movptr(rtmp, imm32);
5861     switch(lane_size) {
5862       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5863       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5864       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5865       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size);
               break;
5868     }
5869   } else {
5870     movptr(rtmp, imm32);
5871     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5872     switch(lane_size) {
5873       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5874       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5875       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5876       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size);
               break;
5879     }
5880   }
5881 }
5882 
5883 //
// Following is the lookup table based popcount computation algorithm:-
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5901 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5902 //     shuffle indices for lookup table access.
5903 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5905 //     shuffle indices for lookup table access.
5906 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5907 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5908 //     count of all the bytes of a quadword.
5909 //  f. Perform step e. for upper 128bit vector lane.
5910 //  g. Pack the bitset count of quadwords back to double word.
5911 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
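//
// Illustrative worked example (not emitted code), using the 16-entry table above:
//   byte 0xB7 = 1011_0111
//   lower nibble 0111 -> 3, upper nibble 1011 -> 3
//   popcount(0xB7) = 3 + 3 = 6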
5912 
5913 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5914                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5915   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5916   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5917   vpsrlw(dst, src, 4, vec_enc);
5918   vpand(dst, dst, xtmp1, vec_enc);
5919   vpand(xtmp1, src, xtmp1, vec_enc);
5920   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5921   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5922   vpshufb(dst, xtmp2, dst, vec_enc);
5923   vpaddb(dst, dst, xtmp1, vec_enc);
5924 }
5925 
5926 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5927                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5928   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5929   // Following code is as per steps e,f,g and h of above algorithm.
5930   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5931   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5932   vpsadbw(dst, dst, xtmp2, vec_enc);
5933   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5934   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5935   vpackuswb(dst, xtmp1, dst, vec_enc);
5936 }
5937 
5938 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5939                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5940   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5941   // Add the popcount of upper and lower bytes of word.
5942   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5943   vpsrlw(dst, xtmp1, 8, vec_enc);
5944   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5945   vpaddw(dst, dst, xtmp1, vec_enc);
5946 }
5947 
5948 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5949                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5950   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5951   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5952   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5953 }
5954 
5955 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5956                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5957   switch(bt) {
5958     case T_LONG:
5959       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5960       break;
5961     case T_INT:
5962       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5963       break;
5964     case T_CHAR:
5965     case T_SHORT:
5966       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5967       break;
5968     case T_BYTE:
5969     case T_BOOLEAN:
5970       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5971       break;
5972     default:
5973       fatal("Unsupported type %s", type2name(bt));
5974       break;
5975   }
5976 }
5977 
5978 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5979                                                       KRegister mask, bool merge, int vec_enc) {
5980   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5981   switch(bt) {
5982     case T_LONG:
5983       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5984       evpopcntq(dst, mask, src, merge, vec_enc);
5985       break;
5986     case T_INT:
5987       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5988       evpopcntd(dst, mask, src, merge, vec_enc);
5989       break;
5990     case T_CHAR:
5991     case T_SHORT:
5992       assert(VM_Version::supports_avx512_bitalg(), "");
5993       evpopcntw(dst, mask, src, merge, vec_enc);
5994       break;
5995     case T_BYTE:
5996     case T_BOOLEAN:
5997       assert(VM_Version::supports_avx512_bitalg(), "");
5998       evpopcntb(dst, mask, src, merge, vec_enc);
5999       break;
6000     default:
6001       fatal("Unsupported type %s", type2name(bt));
6002       break;
6003   }
6004 }
6005 
6006 #ifndef _LP64
6007 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
6008   assert(VM_Version::supports_avx512bw(), "");
6009   kmovdl(tmp, src);
6010   kunpckdql(dst, tmp, tmp);
6011 }
6012 #endif
6013 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of the upper and lower
// nibbles of the byte.
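//
// Illustrative worked example (not emitted code): for the byte 1011_0001,
//   reverse(lower nibble 0001) = 1000 and reverse(upper nibble 1011) = 1101;
//   placing the reversed lower nibble in the upper half and the reversed upper
//   nibble in the lower half yields 1000_1101, the bit-reversed byte.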
6020 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6021                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6022   if (VM_Version::supports_avx512vlbw()) {
6023 
6024     // Get the reverse bit sequence of lower nibble of each byte.
6025     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6026     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6027     evpandq(dst, xtmp2, src, vec_enc);
6028     vpshufb(dst, xtmp1, dst, vec_enc);
6029     vpsllq(dst, dst, 4, vec_enc);
6030 
6031     // Get the reverse bit sequence of upper nibble of each byte.
6032     vpandn(xtmp2, xtmp2, src, vec_enc);
6033     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6034     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6035 
    // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and the
    // right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6038     evporq(xtmp2, dst, xtmp2, vec_enc);
6039     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6040 
  } else if (vec_enc == Assembler::AVX_512bit) {
6042     // Shift based bit reversal.
6043     assert(bt == T_LONG || bt == T_INT, "");
6044 
6045     // Swap lower and upper nibble of each byte.
6046     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6047 
6048     // Swap two least and most significant bits of each nibble.
6049     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6050 
6051     // Swap adjacent pair of bits.
6052     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6053     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6054 
6055     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6056     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6057   } else {
6058     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6059     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6060 
6061     // Get the reverse bit sequence of lower nibble of each byte.
6062     vpand(dst, xtmp2, src, vec_enc);
6063     vpshufb(dst, xtmp1, dst, vec_enc);
6064     vpsllq(dst, dst, 4, vec_enc);
6065 
6066     // Get the reverse bit sequence of upper nibble of each byte.
6067     vpandn(xtmp2, xtmp2, src, vec_enc);
6068     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6069     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6070 
    // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and the
    // right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6073     vpor(xtmp2, dst, xtmp2, vec_enc);
6074     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6075   }
6076 }
6077 
6078 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6079                                                 XMMRegister xtmp, Register rscratch) {
6080   assert(VM_Version::supports_gfni(), "");
6081   assert(rscratch != noreg || always_reachable(mask), "missing");
6082 
6083   // Galois field instruction based bit reversal based on following algorithm.
6084   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6085   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6086   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6087   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6088 }
6089 
6090 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6091                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6092   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6093   evpandq(dst, xtmp1, src, vec_enc);
6094   vpsllq(dst, dst, nbits, vec_enc);
6095   vpandn(xtmp1, xtmp1, src, vec_enc);
6096   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6097   evporq(dst, dst, xtmp1, vec_enc);
6098 }
6099 
6100 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6101                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6102   // Shift based bit reversal.
6103   assert(VM_Version::supports_evex(), "");
6104   switch(bt) {
6105     case T_LONG:
6106       // Swap upper and lower double word of each quad word.
6107       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6108       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6109       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6110       break;
6111     case T_INT:
6112       // Swap upper and lower word of each double word.
6113       evprord(xtmp1, k0, src, 16, true, vec_enc);
6114       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6115       break;
6116     case T_CHAR:
6117     case T_SHORT:
6118       // Swap upper and lower byte of each word.
6119       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6120       break;
6121     case T_BYTE:
6122       evmovdquq(dst, k0, src, true, vec_enc);
6123       break;
6124     default:
6125       fatal("Unsupported type %s", type2name(bt));
6126       break;
6127   }
6128 }
6129 
6130 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6131   if (bt == T_BYTE) {
6132     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6133       evmovdquq(dst, k0, src, true, vec_enc);
6134     } else {
6135       vmovdqu(dst, src);
6136     }
6137     return;
6138   }
6139   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6140   // pre-computed shuffle indices.
6141   switch(bt) {
6142     case T_LONG:
6143       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6144       break;
6145     case T_INT:
6146       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6147       break;
6148     case T_CHAR:
6149     case T_SHORT:
6150       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6151       break;
6152     default:
6153       fatal("Unsupported type %s", type2name(bt));
6154       break;
6155   }
6156   vpshufb(dst, src, dst, vec_enc);
6157 }
6158 
6159 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6160                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6161                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6162   assert(is_integral_type(bt), "");
6163   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6164   assert(VM_Version::supports_avx512cd(), "");
6165   switch(bt) {
6166     case T_LONG:
6167       evplzcntq(dst, ktmp, src, merge, vec_enc);
6168       break;
6169     case T_INT:
6170       evplzcntd(dst, ktmp, src, merge, vec_enc);
6171       break;
6172     case T_SHORT:
6173       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6174       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6175       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6176       vpunpckhwd(dst, xtmp1, src, vec_enc);
6177       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6178       vpackusdw(dst, xtmp2, dst, vec_enc);
6179       break;
6180     case T_BYTE:
6181       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6182       // accessing the lookup table.
6183       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6184       // accessing the lookup table.
6185       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6186       assert(VM_Version::supports_avx512bw(), "");
6187       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6188       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6189       vpand(xtmp2, dst, src, vec_enc);
6190       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6191       vpsrlw(xtmp3, src, 4, vec_enc);
6192       vpand(xtmp3, dst, xtmp3, vec_enc);
6193       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6194       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6195       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6196       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6197       break;
6198     default:
6199       fatal("Unsupported type %s", type2name(bt));
6200       break;
6201   }
6202 }
6203 
6204 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6205                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6206   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6207   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6208   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6209   // accessing the lookup table.
6210   vpand(dst, xtmp2, src, vec_enc);
6211   vpshufb(dst, xtmp1, dst, vec_enc);
6212   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6213   // accessing the lookup table.
6214   vpsrlw(xtmp3, src, 4, vec_enc);
6215   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6216   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6217   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6218   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6219   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6220   vpaddb(dst, dst, xtmp2, vec_enc);
6221   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6222 }
6223 
6224 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6225                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6226   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6227   // Add zero counts of lower byte and upper byte of a word if
6228   // upper byte holds a zero value.
6229   vpsrlw(xtmp3, src, 8, vec_enc);
6230   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6231   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6232   vpsllw(xtmp2, dst, 8, vec_enc);
6233   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6234   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6235   vpsrlw(dst, dst, 8, vec_enc);
6236 }
6237 
6238 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6239                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.0 format,
  // the biased exponent can be used to compute the leading zero count as per
  // following formula:-
  // LZCNT = 32 - (biased_exp - 127 + 1)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
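  //
  // Worked example (not emitted code): src = 8
  //   (float)8 = 1.0 * 2^3 has biased exponent 130, so exponent = 130 - 127 + 1 = 4
  //   LZCNT = 32 - 4 = 28, which matches the leading zero count of the 32-bit value 8.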
6245 
6246   // Broadcast 0xFF
6247   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6248   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6249 
6250   // Extract biased exponent.
6251   vcvtdq2ps(dst, src, vec_enc);
6252   vpsrld(dst, dst, 23, vec_enc);
6253   vpand(dst, dst, xtmp1, vec_enc);
6254 
6255   // Broadcast 127.
6256   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6257   // Exponent = biased_exp - 127
6258   vpsubd(dst, dst, xtmp1, vec_enc);
6259 
6260   // Exponent = Exponent  + 1
6261   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6262   vpaddd(dst, dst, xtmp3, vec_enc);
6263 
6264   // Replace -ve exponent with zero, exponent is -ve when src
6265   // lane contains a zero value.
6266   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6267   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6268 
6269   // Rematerialize broadcast 32.
6270   vpslld(xtmp1, xtmp3, 5, vec_enc);
6271   // Exponent is 32 if corresponding source lane contains max_int value.
6272   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6273   // LZCNT = 32 - exponent
6274   vpsubd(dst, xtmp1, dst, vec_enc);
6275 
6276   // Replace LZCNT with a value 1 if corresponding source lane
6277   // contains max_int value.
6278   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6279 
6280   // Replace biased_exp with 0 if source lane value is less than zero.
6281   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6282   vblendvps(dst, dst, xtmp2, src, vec_enc);
6283 }
6284 
6285 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6286                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6287   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6288   // Add zero counts of lower word and upper word of a double word if
6289   // upper word holds a zero value.
6290   vpsrld(xtmp3, src, 16, vec_enc);
6291   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6292   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6293   vpslld(xtmp2, dst, 16, vec_enc);
6294   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6295   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6296   vpsrld(dst, dst, 16, vec_enc);
6297   // Add zero counts of lower doubleword and upper doubleword of a
6298   // quadword if upper doubleword holds a zero value.
6299   vpsrlq(xtmp3, src, 32, vec_enc);
6300   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6301   vpsllq(xtmp2, dst, 32, vec_enc);
6302   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6303   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6304   vpsrlq(dst, dst, 32, vec_enc);
6305 }
6306 
6307 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6308                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6309                                                        Register rtmp, int vec_enc) {
6310   assert(is_integral_type(bt), "unexpected type");
6311   assert(vec_enc < Assembler::AVX_512bit, "");
6312   switch(bt) {
6313     case T_LONG:
6314       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6315       break;
6316     case T_INT:
6317       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6318       break;
6319     case T_SHORT:
6320       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6321       break;
6322     case T_BYTE:
6323       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6324       break;
6325     default:
6326       fatal("Unsupported type %s", type2name(bt));
6327       break;
6328   }
6329 }
6330 
6331 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6332   switch(bt) {
6333     case T_BYTE:
6334       vpsubb(dst, src1, src2, vec_enc);
6335       break;
6336     case T_SHORT:
6337       vpsubw(dst, src1, src2, vec_enc);
6338       break;
6339     case T_INT:
6340       vpsubd(dst, src1, src2, vec_enc);
6341       break;
6342     case T_LONG:
6343       vpsubq(dst, src1, src2, vec_enc);
6344       break;
6345     default:
6346       fatal("Unsupported type %s", type2name(bt));
6347       break;
6348   }
6349 }
6350 
// Trailing zero count computation is based on leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6355 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6356                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6357                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6358   assert(is_integral_type(bt), "");
6359   // xtmp = -1
6360   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6361   // xtmp = xtmp + src
6362   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6363   // xtmp = xtmp & ~src
6364   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6365   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6366   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6367   vpsub(bt, dst, xtmp4, dst, vec_enc);
6368 }
6369 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6372 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6373                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6374   assert(is_integral_type(bt), "");
6375   // xtmp = 0
6376   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6377   // xtmp = 0 - src
6378   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6379   // xtmp = xtmp | src
6380   vpor(xtmp3, xtmp3, src, vec_enc);
6381   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6382   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6383   vpsub(bt, dst, xtmp1, dst, vec_enc);
6384 }
6385 
6386 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6387   Label done;
6388   Label neg_divisor_fastpath;
6389   cmpl(divisor, 0);
6390   jccb(Assembler::less, neg_divisor_fastpath);
6391   xorl(rdx, rdx);
6392   divl(divisor);
6393   jmpb(done);
6394   bind(neg_divisor_fastpath);
6395   // Fastpath for divisor < 0:
6396   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6397   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
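  //
  // Worked example (not emitted code): when the divisor has its sign bit set the
  // unsigned quotient can only be 0 or 1. For dividend = 0xF0000000 and
  // divisor = 0x90000000:
  //   dividend - divisor = 0x60000000, ~(dividend - divisor) = 0x9FFFFFFF
  //   dividend & 0x9FFFFFFF = 0x90000000, and 0x90000000 >>> 31 = 1
  // which matches the unsigned quotient 0xF0000000 / 0x90000000 = 1.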
6398   movl(rdx, rax);
6399   subl(rdx, divisor);
6400   if (VM_Version::supports_bmi1()) {
6401     andnl(rax, rdx, rax);
6402   } else {
6403     notl(rdx);
6404     andl(rax, rdx);
6405   }
6406   shrl(rax, 31);
6407   bind(done);
6408 }
6409 
6410 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6411   Label done;
6412   Label neg_divisor_fastpath;
6413   cmpl(divisor, 0);
6414   jccb(Assembler::less, neg_divisor_fastpath);
6415   xorl(rdx, rdx);
6416   divl(divisor);
6417   jmpb(done);
6418   bind(neg_divisor_fastpath);
6419   // Fastpath when divisor < 0:
6420   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6421   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
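  //
  // Worked example (not emitted code): dividend = 0xF0000000, divisor = 0x90000000:
  //   dividend & ~(dividend - divisor) = 0x90000000, arithmetic >> 31 gives all ones,
  //   all ones & divisor = divisor, so remainder = 0xF0000000 - 0x90000000 = 0x60000000
  // which matches the unsigned remainder of 0xF0000000 % 0x90000000.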
6422   movl(rdx, rax);
6423   subl(rax, divisor);
6424   if (VM_Version::supports_bmi1()) {
6425     andnl(rax, rax, rdx);
6426   } else {
6427     notl(rax);
6428     andl(rax, rdx);
6429   }
6430   sarl(rax, 31);
6431   andl(rax, divisor);
6432   subl(rdx, rax);
6433   bind(done);
6434 }
6435 
6436 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6437   Label done;
6438   Label neg_divisor_fastpath;
6439 
6440   cmpl(divisor, 0);
6441   jccb(Assembler::less, neg_divisor_fastpath);
6442   xorl(rdx, rdx);
6443   divl(divisor);
6444   jmpb(done);
6445   bind(neg_divisor_fastpath);
6446   // Fastpath for divisor < 0:
6447   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6448   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6449   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6450   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6451   movl(rdx, rax);
6452   subl(rax, divisor);
6453   if (VM_Version::supports_bmi1()) {
6454     andnl(rax, rax, rdx);
6455   } else {
6456     notl(rax);
6457     andl(rax, rdx);
6458   }
6459   movl(tmp, rax);
6460   shrl(rax, 31); // quotient
6461   sarl(tmp, 31);
6462   andl(tmp, divisor);
6463   subl(rdx, tmp); // remainder
6464   bind(done);
6465 }
6466 
6467 #ifdef _LP64
6468 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6469                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6471     // Galois field instruction based bit reversal based on following algorithm.
6472     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6473     mov64(rtmp, 0x8040201008040201L);
6474     movq(xtmp1, src);
6475     movq(xtmp2, rtmp);
6476     gf2p8affineqb(xtmp1, xtmp2, 0);
6477     movq(dst, xtmp1);
6478   } else {
6479     // Swap even and odd numbered bits.
6480     movl(rtmp, src);
6481     andl(rtmp, 0x55555555);
6482     shll(rtmp, 1);
6483     movl(dst, src);
6484     andl(dst, 0xAAAAAAAA);
6485     shrl(dst, 1);
6486     orl(dst, rtmp);
6487 
6488     // Swap LSB and MSB 2 bits of each nibble.
6489     movl(rtmp, dst);
6490     andl(rtmp, 0x33333333);
6491     shll(rtmp, 2);
6492     andl(dst, 0xCCCCCCCC);
6493     shrl(dst, 2);
6494     orl(dst, rtmp);
6495 
6496     // Swap LSB and MSB 4 bits of each byte.
6497     movl(rtmp, dst);
6498     andl(rtmp, 0x0F0F0F0F);
6499     shll(rtmp, 4);
6500     andl(dst, 0xF0F0F0F0);
6501     shrl(dst, 4);
6502     orl(dst, rtmp);
6503   }
6504   bswapl(dst);
6505 }
6506 
6507 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6508                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6510     // Galois field instruction based bit reversal based on following algorithm.
6511     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6512     mov64(rtmp1, 0x8040201008040201L);
6513     movq(xtmp1, src);
6514     movq(xtmp2, rtmp1);
6515     gf2p8affineqb(xtmp1, xtmp2, 0);
6516     movq(dst, xtmp1);
6517   } else {
6518     // Swap even and odd numbered bits.
6519     movq(rtmp1, src);
6520     mov64(rtmp2, 0x5555555555555555L);
6521     andq(rtmp1, rtmp2);
6522     shlq(rtmp1, 1);
6523     movq(dst, src);
6524     notq(rtmp2);
6525     andq(dst, rtmp2);
6526     shrq(dst, 1);
6527     orq(dst, rtmp1);
6528 
6529     // Swap LSB and MSB 2 bits of each nibble.
6530     movq(rtmp1, dst);
6531     mov64(rtmp2, 0x3333333333333333L);
6532     andq(rtmp1, rtmp2);
6533     shlq(rtmp1, 2);
6534     notq(rtmp2);
6535     andq(dst, rtmp2);
6536     shrq(dst, 2);
6537     orq(dst, rtmp1);
6538 
6539     // Swap LSB and MSB 4 bits of each byte.
6540     movq(rtmp1, dst);
6541     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6542     andq(rtmp1, rtmp2);
6543     shlq(rtmp1, 4);
6544     notq(rtmp2);
6545     andq(dst, rtmp2);
6546     shrq(dst, 4);
6547     orq(dst, rtmp1);
6548   }
6549   bswapq(dst);
6550 }
6551 
6552 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6553   Label done;
6554   Label neg_divisor_fastpath;
6555   cmpq(divisor, 0);
6556   jccb(Assembler::less, neg_divisor_fastpath);
6557   xorl(rdx, rdx);
6558   divq(divisor);
6559   jmpb(done);
6560   bind(neg_divisor_fastpath);
6561   // Fastpath for divisor < 0:
6562   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6563   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6564   movq(rdx, rax);
6565   subq(rdx, divisor);
6566   if (VM_Version::supports_bmi1()) {
6567     andnq(rax, rdx, rax);
6568   } else {
6569     notq(rdx);
6570     andq(rax, rdx);
6571   }
6572   shrq(rax, 63);
6573   bind(done);
6574 }
6575 
6576 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6577   Label done;
6578   Label neg_divisor_fastpath;
6579   cmpq(divisor, 0);
6580   jccb(Assembler::less, neg_divisor_fastpath);
6581   xorq(rdx, rdx);
6582   divq(divisor);
6583   jmp(done);
6584   bind(neg_divisor_fastpath);
6585   // Fastpath when divisor < 0:
6586   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6587   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6588   movq(rdx, rax);
6589   subq(rax, divisor);
6590   if (VM_Version::supports_bmi1()) {
6591     andnq(rax, rax, rdx);
6592   } else {
6593     notq(rax);
6594     andq(rax, rdx);
6595   }
6596   sarq(rax, 63);
6597   andq(rax, divisor);
6598   subq(rdx, rax);
6599   bind(done);
6600 }
6601 
6602 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6603   Label done;
6604   Label neg_divisor_fastpath;
6605   cmpq(divisor, 0);
6606   jccb(Assembler::less, neg_divisor_fastpath);
6607   xorq(rdx, rdx);
6608   divq(divisor);
6609   jmp(done);
6610   bind(neg_divisor_fastpath);
6611   // Fastpath for divisor < 0:
6612   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6613   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6614   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6615   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6616   movq(rdx, rax);
6617   subq(rax, divisor);
6618   if (VM_Version::supports_bmi1()) {
6619     andnq(rax, rax, rdx);
6620   } else {
6621     notq(rax);
6622     andq(rax, rdx);
6623   }
6624   movq(tmp, rax);
6625   shrq(rax, 63); // quotient
6626   sarq(tmp, 63);
6627   andq(tmp, divisor);
6628   subq(rdx, tmp); // remainder
6629   bind(done);
6630 }
6631 #endif
6632 
6633 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6634                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6635                                         int vlen_enc) {
6636   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which
  // differ by a multiple of 16 are placed at the same relative position in a
  // 128 bit lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
  // all select the first byte of their respective 128 bit lanes.
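  //
  // Worked example (not emitted code): shuffle index 37 refers to byte 37 of the
  // 64 byte source, i.e. byte 5 of the third 128 bit lane. Its lower 4 bits are 5,
  // so once the third source lane has been broadcast, vpshufb picks byte 5 of that
  // lane, and the 32 <= INDEX < 48 mask computed below keeps that result only for
  // destination bytes whose shuffle index falls in that range.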
6643   movl(rtmp, 16);
6644   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6645 
6646   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6647   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6648   // original shuffle indices and move the shuffled lanes corresponding to true
6649   // mask to destination vector.
6650   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6651   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6652   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6653 
6654   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6655   // and broadcasting second 128 bit lane.
6656   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6657   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6658   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6659   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6660   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6661 
6662   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6663   // and broadcasting third 128 bit lane.
6664   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6665   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6666   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6667   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6668   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6669 
6670   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6672   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6673   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6674   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6675   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6676   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6677 }
6678 
6679 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6680                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6681   if (vlen_enc == AVX_128bit) {
6682     vpermilps(dst, src, shuffle, vlen_enc);
6683   } else if (bt == T_INT) {
6684     vpermd(dst, shuffle, src, vlen_enc);
6685   } else {
6686     assert(bt == T_FLOAT, "");
6687     vpermps(dst, shuffle, src, vlen_enc);
6688   }
6689 }
6690 
6691 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6692   switch(elem_bt) {
6693     case T_BYTE:
6694       if (ideal_opc == Op_SaturatingAddV) {
6695         vpaddsb(dst, src1, src2, vlen_enc);
6696       } else {
6697         assert(ideal_opc == Op_SaturatingSubV, "");
6698         vpsubsb(dst, src1, src2, vlen_enc);
6699       }
6700       break;
6701     case T_SHORT:
6702       if (ideal_opc == Op_SaturatingAddV) {
6703         vpaddsw(dst, src1, src2, vlen_enc);
6704       } else {
6705         assert(ideal_opc == Op_SaturatingSubV, "");
6706         vpsubsw(dst, src1, src2, vlen_enc);
6707       }
6708       break;
6709     default:
6710       fatal("Unsupported type %s", type2name(elem_bt));
6711       break;
6712   }
6713 }
6714 
6715 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6716   switch(elem_bt) {
6717     case T_BYTE:
6718       if (ideal_opc == Op_SaturatingAddV) {
6719         vpaddusb(dst, src1, src2, vlen_enc);
6720       } else {
6721         assert(ideal_opc == Op_SaturatingSubV, "");
6722         vpsubusb(dst, src1, src2, vlen_enc);
6723       }
6724       break;
6725     case T_SHORT:
6726       if (ideal_opc == Op_SaturatingAddV) {
6727         vpaddusw(dst, src1, src2, vlen_enc);
6728       } else {
6729         assert(ideal_opc == Op_SaturatingSubV, "");
6730         vpsubusw(dst, src1, src2, vlen_enc);
6731       }
6732       break;
6733     default:
6734       fatal("Unsupported type %s", type2name(elem_bt));
6735       break;
6736   }
6737 }
6738 
6739 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6740                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6742   // overflow_mask = Inp1 <u Inp2
6743   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6744   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6745   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6746 }
6747 
6748 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6749                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6750   // Emulate unsigned comparison using signed comparison
6751   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
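  //
  // Worked example (not emitted code), for 32-bit lanes: 0x00000001 <u 0xFFFFFFFF.
  // Adding MIN_VALUE (0x80000000) to both sides gives 0x80000001 <s 0x7FFFFFFF,
  // which holds as a signed comparison, so the unsigned relation is preserved.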
6752   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6753   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6754   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6755 
6756   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6757 
6758   // Res = INP1 - INP2 (non-commutative and non-associative)
6759   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6760   // Res = Mask ? Zero : Res
6761   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6762   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6763 }
6764 
6765 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6766                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only +ve numbers, thus only an upper bound saturation exists.
6768   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6769   // Res = Signed Add INP1, INP2
6770   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6771   // T1 = SRC1 | SRC2
6772   vpor(xtmp1, src1, src2, vlen_enc);
6773   // Max_Unsigned = -1
6774   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6775   // Unsigned compare:  Mask = Res <u T1
6776   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6777   // res  = Mask ? Max_Unsigned : Res
6778   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6779 }
6780 
6781 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for
// the saturating unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask =  (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6790 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6791 //
6792 
6793 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6794                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6795   // Res = Signed Add INP1, INP2
6796   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6797   // Compute T1 = INP1 | INP2
6798   vpor(xtmp3, src1, src2, vlen_enc);
  // Generate MIN_VALUE in every lane of xtmp2 (this also leaves xtmp1 holding -1, i.e. Max_Unsigned, in all lanes).
6800   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Bias T1 into the signed comparison domain: T1<s> = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Bias Res into the signed comparison domain: Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask: Mask = Res<s> <s T1<s>, i.e. Res <u T1
6806   if (elem_bt == T_INT) {
6807     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6808   } else {
6809     assert(elem_bt == T_LONG, "");
6810     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6811   }
6812   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6813 }
6814 
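// Sets ktmp[i] to the sign bit of the i-th 64-bit lane of src (VPMOVQ2M). When AVX512DQ is
// unavailable, emulate it by arithmetically shifting each lane down to 0/-1 and comparing for
// equality with an all-ones vector in xtmp2 (recomputed unless xtmp2_hold_M1 is set).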
6815 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6816                                       int vlen_enc, bool xtmp2_hold_M1) {
6817   if (VM_Version::supports_avx512dq()) {
6818     evpmovq2m(ktmp, src, vlen_enc);
6819   } else {
6820     assert(VM_Version::supports_evex(), "");
6821     if (!xtmp2_hold_M1) {
6822       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6823     }
6824     evpsraq(xtmp1, src, 63, vlen_enc);
6825     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6826   }
6827 }
6828 
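// 32-bit variant of the above: sets ktmp[i] to the sign bit of the i-th dword lane of src
// (VPMOVD2M), with the same emulation path when AVX512DQ is unavailable.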
6829 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6830                                       int vlen_enc, bool xtmp2_hold_M1) {
6831   if (VM_Version::supports_avx512dq()) {
6832     evpmovd2m(ktmp, src, vlen_enc);
6833   } else {
6834     assert(VM_Version::supports_evex(), "");
6835     if (!xtmp2_hold_M1) {
6836       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6837     }
6838     vpsrad(xtmp1, src, 31, vlen_enc);
6839     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6840   }
6841 }
6842 
6843 
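// Sign-extends each 32/64-bit lane into a full-lane mask: dst lane = (src lane < 0) ? -1 : 0.
// Without EVEX there is no 64-bit arithmetic right shift, so the T_LONG path shifts the dwords
// and replicates each qword's upper-dword sign into both halves via vpshufd.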
6844 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6845   if (elem_bt == T_LONG) {
6846     if (VM_Version::supports_evex()) {
6847       evpsraq(dst, src, 63, vlen_enc);
6848     } else {
6849       vpsrad(dst, src, 31, vlen_enc);
6850       vpshufd(dst, dst, 0xF5, vlen_enc);
6851     }
6852   } else {
6853     assert(elem_bt == T_INT, "");
6854     vpsrad(dst, src, 31, vlen_enc);
6855   }
6856 }
6857 
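// Broadcasts the maximum signed value (0x7FFF...) into dst by logically shifting an all-ones
// vector right by one bit; the all-ones vector is (re)materialized in 'allones' when requested.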
6858 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6859   if (compute_allones) {
6860     if (vlen_enc == Assembler::AVX_512bit) {
6861       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6862     } else {
6863       vpcmpeqq(allones, allones, allones, vlen_enc);
6864     }
6865   }
6866   if (elem_bt == T_LONG) {
6867     vpsrlq(dst, allones, 1, vlen_enc);
6868   } else {
6869     assert(elem_bt == T_INT, "");
6870     vpsrld(dst, allones, 1, vlen_enc);
6871   }
6872 }
6873 
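// Broadcasts the minimum signed value (0x8000...) into dst by shifting an all-ones vector
// left by 63 (T_LONG) or 31 (T_INT) bits.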
6874 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6875   if (compute_allones) {
6876     if (vlen_enc == Assembler::AVX_512bit) {
6877       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6878     } else {
6879       vpcmpeqq(allones, allones, allones, vlen_enc);
6880     }
6881   }
6882   if (elem_bt == T_LONG) {
6883     vpsllq(dst, allones, 63, vlen_enc);
6884   } else {
6885     assert(elem_bt == T_INT, "");
6886     vpslld(dst, allones, 31, vlen_enc);
6887   }
6888 }
6889 
6890 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6891                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6892   switch(elem_bt) {
6893     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6894     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6895     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6896     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6897     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6898   }
6899 }
6900 
6901 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6902   switch(elem_bt) {
6903     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6904     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6905     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6906     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6907     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6908   }
6909 }
6910 
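// Converts the per-lane sign bits of src into a mask register, dispatching to the
// element-size specific emulation helpers above.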
6911 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6912                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6913   if (elem_bt == T_LONG) {
6914     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6915   } else {
6916     assert(elem_bt == T_INT, "");
6917     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6918   }
6919 }
6920 
6921 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6922                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6923                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6924   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13 (an illustrative scalar sketch follows this function).
6927   if (ideal_opc == Op_SaturatingAddV) {
6928     // res = src1 + src2
6929     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6931     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6932     vpxor(xtmp1, dst, src1, vlen_enc);
6933     vpxor(xtmp2, dst, src2, vlen_enc);
6934     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6935   } else {
6936     assert(ideal_opc == Op_SaturatingSubV, "");
6937     // res = src1 - src2
6938     vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the
    // result's sign differs from the first input's sign.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6942     vpxor(xtmp1, src1, src2, vlen_enc);
6943     vpxor(xtmp2, dst, src1, vlen_enc);
6944     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6945   }
6946 
6947   // Compute overflow detection mask.
6948   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6950 
6951   // Compute mask based on first input polarity.
6952   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6953 
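  // Materialize the saturating bounds: xtmp2 = MAX_VALUE, xtmp1 = MIN_VALUE (in every lane).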
6954   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6955   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6956 
  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first-input polarity mask hold the MIN value.
6959   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6960   // Blend destination lanes with saturated values using overflow detection mask.
6961   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6962 }
6963 
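// Illustrative scalar sketch (editorial note, not emitted or compiled code) of the signed
// saturating add/sub logic used above and below, assuming 32-bit lanes and two's complement
// wrap-around; the helper name sat_signed_add32 is hypothetical.
//
//   static int32_t sat_signed_add32(int32_t a, int32_t b) {
//     int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);             // wrap-around add
//     bool overflow = ((uint32_t)((res ^ a) & (res ^ b)) >> 31) != 0; // same-sign inputs, different-sign result
//     int32_t sat = (a < 0) ? INT32_MIN : INT32_MAX;                  // saturation bound follows src1's sign
//     return overflow ? sat : res;
//   }
//
// The subtraction path differs only in the predicate: with res = a - b,
// overflow = ((uint32_t)((a ^ b) & (res ^ a)) >> 31) != 0.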
6964 
6965 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6966                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6967                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6968   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13 (see the scalar sketch above this function).
6971   if (ideal_opc == Op_SaturatingAddV) {
6972     // res = src1 + src2
6973     vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6975     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6976     vpxor(xtmp1, dst, src1, vlen_enc);
6977     vpxor(xtmp2, dst, src2, vlen_enc);
6978     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6979   } else {
6980     assert(ideal_opc == Op_SaturatingSubV, "");
6981     // res = src1 - src2
6982     vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the
    // result's sign differs from the first input's sign.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6986     vpxor(xtmp1, src1, src2, vlen_enc);
6987     vpxor(xtmp2, dst, src1, vlen_enc);
6988     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6989   }
6990 
6991   // Sign-extend to compute overflow detection mask.
6992   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6993 
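  // xtmp1 = -1 (all ones) in every lane; reused below to derive the MAX/MIN saturation bounds.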
6994   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6995   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6996   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6997 
6998   // Compose saturating min/max vector using first input polarity mask.
6999   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7000   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7001 
7002   // Blend result with saturating vector using overflow detection mask.
7003   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7004 }
7005 
7006 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7007   switch(elem_bt) {
7008     case T_BYTE:
7009       if (ideal_opc == Op_SaturatingAddV) {
7010         vpaddsb(dst, src1, src2, vlen_enc);
7011       } else {
7012         assert(ideal_opc == Op_SaturatingSubV, "");
7013         vpsubsb(dst, src1, src2, vlen_enc);
7014       }
7015       break;
7016     case T_SHORT:
7017       if (ideal_opc == Op_SaturatingAddV) {
7018         vpaddsw(dst, src1, src2, vlen_enc);
7019       } else {
7020         assert(ideal_opc == Op_SaturatingSubV, "");
7021         vpsubsw(dst, src1, src2, vlen_enc);
7022       }
7023       break;
7024     default:
7025       fatal("Unsupported type %s", type2name(elem_bt));
7026       break;
7027   }
7028 }
7029 
7030 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7031   switch(elem_bt) {
7032     case T_BYTE:
7033       if (ideal_opc == Op_SaturatingAddV) {
7034         vpaddusb(dst, src1, src2, vlen_enc);
7035       } else {
7036         assert(ideal_opc == Op_SaturatingSubV, "");
7037         vpsubusb(dst, src1, src2, vlen_enc);
7038       }
7039       break;
7040     case T_SHORT:
7041       if (ideal_opc == Op_SaturatingAddV) {
7042         vpaddusw(dst, src1, src2, vlen_enc);
7043       } else {
7044         assert(ideal_opc == Op_SaturatingSubV, "");
7045         vpsubusw(dst, src1, src2, vlen_enc);
7046       }
7047       break;
7048     default:
7049       fatal("Unsupported type %s", type2name(elem_bt));
7050       break;
7051   }
7052 }
7053 
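// Two-table permute (VPERMI2*): dst initially holds the selection indices and is overwritten
// with elements gathered from src1/src2 according to those indices.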
7054 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7055                                                      XMMRegister src2, int vlen_enc) {
7056   switch(elem_bt) {
7057     case T_BYTE:
7058       evpermi2b(dst, src1, src2, vlen_enc);
7059       break;
7060     case T_SHORT:
7061       evpermi2w(dst, src1, src2, vlen_enc);
7062       break;
7063     case T_INT:
7064       evpermi2d(dst, src1, src2, vlen_enc);
7065       break;
7066     case T_LONG:
7067       evpermi2q(dst, src1, src2, vlen_enc);
7068       break;
7069     case T_FLOAT:
7070       evpermi2ps(dst, src1, src2, vlen_enc);
7071       break;
7072     case T_DOUBLE:
7073       evpermi2pd(dst, src1, src2, vlen_enc);
7074       break;
7075     default:
7076       fatal("Unsupported type %s", type2name(elem_bt));
7077       break;
7078   }
7079 }
7080 
7081 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7082   if (is_unsigned) {
7083     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7084   } else {
7085     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7086   }
7087 }
7088 
7089 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7090   if (is_unsigned) {
7091     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7092   } else {
7093     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7094   }
7095 }