1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  56   // code safely. The push to verify stack depth is ok at 5 bytes,
  57   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  58   // stack bang then we must use the 6 byte frame allocation even if
  59   // we have no frame. :-(
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  67   // Calls to C2R adapters often do not accept exceptional returns.
  68   // We require that their callers bang for them.  But be careful, because
  69   // some VM calls (such as call site linkage) can use several kilobytes of
  70   // stack.  The stack safety zone should account for that.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
  75     // We always push rbp so that, on return to the interpreter, rbp will be
  76     // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
  82     // Remove word for ebp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
 104 
 105   if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 108   }
 109 
 110 #ifndef _LP64
 111   // If method sets FPU control word do it now
 112   if (fp_mode_24b) {
 113     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 114   }
 115   if (UseSSE >= 2 && VerifyFPU) {
 116     verify_FPU(0, "FPU stack must be clean on entry");
 117   }
 118 #endif
 119 
 120 #ifdef ASSERT
 121   if (VerifyStackAtCalls) {
 122     Label L;
 123     push(rax);
 124     mov(rax, rsp);
 125     andptr(rax, StackAlignmentInBytes-1);
 126     cmpptr(rax, StackAlignmentInBytes-wordSize);
 127     pop(rax);
 128     jcc(Assembler::equal, L);
 129     STOP("Stack is not properly aligned!");
 130     bind(L);
 131   }
 132 #endif
 133 
 134   if (!is_stub) {
 135     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 136 #ifdef _LP64
 137     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 138       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 139       Label dummy_slow_path;
 140       Label dummy_continuation;
 141       Label* slow_path = &dummy_slow_path;
 142       Label* continuation = &dummy_continuation;
 143       if (!Compile::current()->output()->in_scratch_emit_size()) {
 144         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 145         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 146         Compile::current()->output()->add_stub(stub);
 147         slow_path = &stub->entry();
 148         continuation = &stub->continuation();
 149       }
 150       bs->nmethod_entry_barrier(this, slow_path, continuation);
 151     }
 152 #else
 153     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 154     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 155 #endif
 156   }
 157 }
 158 
 159 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 160   switch (vlen_in_bytes) {
 161     case  4: // fall-through
 162     case  8: // fall-through
 163     case 16: return Assembler::AVX_128bit;
 164     case 32: return Assembler::AVX_256bit;
 165     case 64: return Assembler::AVX_512bit;
 166 
 167     default: {
 168       ShouldNotReachHere();
 169       return Assembler::AVX_NoVec;
 170     }
 171   }
 172 }
 173 
 174 // fast_lock and fast_unlock used by C2
 175 
 176 // Because the transitions from emitted code to the runtime
 177 // monitorenter/exit helper stubs are so slow it's critical that
 178 // we inline both the stack-locking fast path and the inflated fast path.
 179 //
 180 // See also: cmpFastLock and cmpFastUnlock.
 181 //
 182 // What follows is a specialized inline transliteration of the code
 183 // in enter() and exit(). If we're concerned about I$ bloat another
 184 // option would be to emit TrySlowEnter and TrySlowExit methods
 185 // at startup-time.  These methods would accept arguments as
 186 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 187 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 188 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 189 // In practice, however, the # of lock sites is bounded and is usually small.
 190 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 191 // if the processor uses simple bimodal branch predictors keyed by EIP,
 192 // since the helper routines would be called from multiple synchronization
 193 // sites.
 194 //
 195 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 196 // in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
 197 // to those specialized methods.  That'd give us a mostly platform-independent
 198 // implementation that the JITs could optimize and inline at their pleasure.
 199 // Done correctly, the only time we'd need to cross to native code would be
 200 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 201 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 202 // (b) provide explicit barriers or fence operations.
 203 //
 204 // TODO:
 205 //
 206 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 207 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 208 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 209 //    the lock operators would typically be faster than reifying Self.
 210 //
 211 // *  Ideally I'd define the primitives as:
 212 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 213 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 214 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 215 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 216 //    Furthermore the register assignments are overconstrained, possibly resulting in
 217 //    sub-optimal code near the synchronization site.
 218 //
 219 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 220 //    Alternately, use a better sp-proximity test.
 221 //
 222 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 223 //    Either one is sufficient to uniquely identify a thread.
 224 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 225 //
 226 // *  Intrinsify notify() and notifyAll() for the common cases where the
 227 //    object is locked by the calling thread but the waitlist is empty,
 228 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 229 //
 230 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 231 //    But beware of excessive branch density on AMD Opterons.
 232 //
 233 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 234 //    or failure of the fast path.  If the fast path fails then we pass
 235 //    control to the slow path, typically in C.  In fast_lock and
 236 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 237 //    will emit a conditional branch immediately after the node.
 238 //    So we have branches to branches and lots of ICC.ZF games.
 239 //    Instead, it might be better to have C2 pass a "FailureLabel"
 240 //    into fast_lock and fast_unlock.  In the case of success, control
 241 //    will drop through the node.  ICC.ZF is undefined at exit.
 242 //    In the case of failure, the node will branch directly to the
 243 //    FailureLabel.
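     //
     // Illustration (added for clarity; the label name below is made up, the real
     // slow-path label comes from C2's stub machinery): given the ZF protocol
     // above, a cmpFastLock or cmpFastUnlock node is typically followed by a
     // conditional branch that C2 emits, roughly:
     //
     //   fast_lock(obj, box, ...)   // sets ZF: 1 = locked, 0 = must call runtime
     //   jne   slow_path_stub       // ZF == 0 -> take the slow path
     //   ...                        // ZF == 1 -> fall through with the lock held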
 244 
 245 
 246 // obj: object to lock
 247 // box: on-stack box address (displaced header location) - KILLED
 248 // rax: tmp -- KILLED
 249 // scr: tmp -- KILLED
 250 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 251                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 252                                  Metadata* method_data) {
 253   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 254   // Ensure the register assignments are disjoint
 255   assert(tmpReg == rax, "");
 256   assert(cx1Reg == noreg, "");
 257   assert(cx2Reg == noreg, "");
 258   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 259 
 260   // Possible cases that we'll encounter in fast_lock
 261   // ------------------------------------------------
 262   // * Inflated
 263   //    -- unlocked
 264   //    -- Locked
 265   //       = by self
 266   //       = by other
 267   // * neutral
 268   // * stack-locked
 269   //    -- by self
 270   //       = sp-proximity test hits
 271   //       = sp-proximity test generates false-negative
 272   //    -- by other
 273   //
 274 
 275   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 276 
 277   if (DiagnoseSyncOnValueBasedClasses != 0) {
 278     load_klass(tmpReg, objReg, scrReg);
 279     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 280     jcc(Assembler::notZero, DONE_LABEL);
 281   }
 282 
 283   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 284   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 285   jcc(Assembler::notZero, IsInflated);
 286 
 287   if (LockingMode == LM_MONITOR) {
 288     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 289     testptr(objReg, objReg);
 290   } else {
 291     assert(LockingMode == LM_LEGACY, "must be");
 292     // Attempt stack-locking ...
 293     orptr (tmpReg, markWord::unlocked_value);
 294     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 295     lock();
 296     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 297     jcc(Assembler::equal, COUNT);           // Success
 298 
 299     // Recursive locking.
 300     // The object is stack-locked: markword contains stack pointer to BasicLock.
 301     // Locked by current thread if difference with current SP is less than one page.
 302     subptr(tmpReg, rsp);
 303     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 304     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
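         // Worked example (illustrative; the exact page size is platform dependent):
         // with a 4 KiB page on LP64 the mask is 7 - 4096 = -4089 = 0xFFFFFFFFFFFFF007,
         // so the AND leaves ZF == 1 only if (mark - rsp) is a multiple of 8 in
         // [0, page_size), i.e. the mark still points to a BasicLock within roughly
         // one page above rsp - a recursive stack-lock held by this thread.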
 305     movptr(Address(boxReg, 0), tmpReg);
 306   }
 307   jmp(DONE_LABEL);
 308 
 309   bind(IsInflated);
 310   // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 311 
 312 #ifndef _LP64
 313   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 314   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 315 #else
 316   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 317   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 318   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 319 
 320   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 321   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 322   movq(scrReg, tmpReg);
 323   xorq(tmpReg, tmpReg);
 324   lock();
 325   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 326 
 327   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 328   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 329 
 330   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 331   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 332   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 333   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 334 #endif // _LP64
 335   bind(DONE_LABEL);
 336 
 337   // ZFlag == 1 count in fast path
 338   // ZFlag == 0 count in slow path
 339   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 340 
 341   bind(COUNT);
 342   if (LockingMode == LM_LEGACY) {
 343 #ifdef _LP64
 344     // Count monitors in fast path
 345     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 346 #endif
 347   }
 348   xorl(tmpReg, tmpReg); // Set ZF == 1
 349 
 350   bind(NO_COUNT);
 351 
 352   // At NO_COUNT the ICC ZFlag is set as follows ...
 353   // fast_unlock uses the same protocol.
 354   // ZFlag == 1 -> Success
 355   // ZFlag == 0 -> Failure - force control through the slow path
 356 }
 357 
 358 // obj: object to unlock
 359 // box: box address (displaced header location), killed.  Must be EAX.
 360 // tmp: killed, cannot be obj nor box.
 361 //
 362 // Some commentary on balanced locking:
 363 //
 364 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 365 // Methods that don't have provably balanced locking are forced to run in the
 366 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 367 // The interpreter provides two properties:
 368 // I1:  At return-time the interpreter automatically and quietly unlocks any
 369 //      objects acquired by the current activation (frame).  Recall that the
 370 //      interpreter maintains an on-stack list of locks currently held by
 371 //      a frame.
 372 // I2:  If a method attempts to unlock an object that is not held by
 373 //      the frame, the interpreter throws IMSX.
 374 //
 375 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 376 // B() doesn't have provably balanced locking so it runs in the interpreter.
 377 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 378 // is still locked by A().
 379 //
 380 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 381 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 382 // should not be unlocked by "normal" Java-level locking and vice-versa.  The specification
 383 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 384 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 385 // could reasonably *avoid* checking owner in fast_unlock().
 386 // In the interest of performance we elide the m->Owner==Self check in unlock.
 387 // A perfectly viable alternative is to elide the owner check except when
 388 // Xcheck:jni is enabled.
 389 
 390 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 391   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 392   assert(boxReg == rax, "");
 393   assert_different_registers(objReg, boxReg, tmpReg);
 394 
 395   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 396 
 397   if (LockingMode == LM_LEGACY) {
 398     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 399     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 400   }
 401   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 402   if (LockingMode != LM_MONITOR) {
 403     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 404     jcc(Assembler::zero, Stacked);
 405   }
 406 
 407   // It's inflated.
 408 
 409 #ifndef _LP64
 410   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 411   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 412   jmpb(DONE_LABEL);
 413 #else
 414   // Despite our balanced locking property we still check that m->_owner == Self
 415   // as Java routines or native JNI code called by this thread might
 416   // have released the lock.
 417   // Refer to the comments in synchronizer.cpp for how we might encode extra
 418   // state in _succ so we can avoid fetching EntryList|cxq.
 419   //
 420   // If there's no contention try a 1-0 exit.  That is, exit without
 421   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 422   // we detect and recover from the race that the 1-0 exit admits.
 423   //
 424   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 425   // before it STs null into _owner, releasing the lock.  Updates
 426   // to data protected by the critical section must be visible before
 427   // we drop the lock (and thus before any other thread could acquire
 428   // the lock and observe the fields protected by the lock).
 429   // IA32's memory-model is SPO, so STs are ordered with respect to
 430   // each other and there's no need for an explicit barrier (fence).
 431   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
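       //
       // In outline, the sequence emitted below is (pseudo code added for clarity):
       //   if (m->_recursions != 0) { m->_recursions--; return success; }
       //   m->_owner = nullptr;                       // release the lock
       //   StoreLoad fence;                           // avoid stranding (see above)
       //   if ((m->EntryList | m->cxq) == 0) return success;
       //   if (m->_succ != nullptr) return success;   // a successor will take over
       //   thread->_unlocked_inflated_monitor = m;    // let the runtime retry the exit
       //   return failure;                            // slow path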
 432   Label LSuccess, LNotRecursive;
 433 
 434   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 435   jccb(Assembler::equal, LNotRecursive);
 436 
 437   // Recursive inflated unlock
 438   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 439   jmpb(LSuccess);
 440 
 441   bind(LNotRecursive);
 442 
 443   // Set owner to null.
 444   // Release to satisfy the JMM
 445   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 446   // We need a full fence after clearing owner to avoid stranding.
 447   // StoreLoad achieves this.
 448   membar(StoreLoad);
 449 
 450   // Check if the entry lists are empty (EntryList first - by convention).
 451   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 452   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 453   jccb(Assembler::zero, LSuccess);    // If so we are done.
 454 
 455   // Check if there is a successor.
 456   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 457   jccb(Assembler::notZero, LSuccess); // If so we are done.
 458 
 459   // Save the monitor pointer in the current thread, so we can try to
 460   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 461   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 462   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 463 
 464   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 465   jmpb  (DONE_LABEL);
 466 
 467   bind  (LSuccess);
 468   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 469   jmpb  (DONE_LABEL);
 470 #endif  // _LP64
 471 
 472   if (LockingMode == LM_LEGACY) {
 473     bind  (Stacked);
 474     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 475     lock();
 476     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 477     // Intentional fall-thru into DONE_LABEL
 478   }
 479 
 480   bind(DONE_LABEL);
 481 
 482   // ZFlag == 1 count in fast path
 483   // ZFlag == 0 count in slow path
 484   jccb(Assembler::notZero, NO_COUNT);
 485 
 486   bind(COUNT);
 487 
 488   if (LockingMode == LM_LEGACY) {
 489     // Count monitors in fast path
 490 #ifdef _LP64
 491     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 492 #endif
 493   }
 494 
 495   xorl(tmpReg, tmpReg); // Set ZF == 1
 496 
 497   bind(NO_COUNT);
 498 }
 499 
 500 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 501                                               Register t, Register thread) {
 502   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 503   assert(rax_reg == rax, "Used for CAS");
 504   assert_different_registers(obj, box, rax_reg, t, thread);
 505 
 506   // Handle inflated monitor.
 507   Label inflated;
 508   // Finish fast lock successfully. ZF value is irrelevant.
 509   Label locked;
 510   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 511   Label slow_path;
 512 
 513   if (UseObjectMonitorTable) {
 514     // Clear cache in case fast locking succeeds.
 515     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 516   }
 517 
 518   if (DiagnoseSyncOnValueBasedClasses != 0) {
 519     load_klass(rax_reg, obj, t);
 520     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 521     jcc(Assembler::notZero, slow_path);
 522   }
 523 
 524   const Register mark = t;
 525 
 526   { // Lightweight Lock
 527 
 528     Label push;
 529 
 530     const Register top = UseObjectMonitorTable ? rax_reg : box;
 531 
 532     // Load the mark.
 533     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 534 
 535     // Prefetch top.
 536     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 537 
 538     // Check for monitor (0b10).
 539     testptr(mark, markWord::monitor_value);
 540     jcc(Assembler::notZero, inflated);
 541 
 542     // Check if lock-stack is full.
 543     cmpl(top, LockStack::end_offset() - 1);
 544     jcc(Assembler::greater, slow_path);
 545 
 546     // Check if recursive.
 547     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 548     jccb(Assembler::equal, push);
 549 
 550     // Try to lock. Transition lock bits 0b01 => 0b00
 551     movptr(rax_reg, mark);
 552     orptr(rax_reg, markWord::unlocked_value);
 553     andptr(mark, ~(int32_t)markWord::unlocked_value);
 554     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 555     jcc(Assembler::notEqual, slow_path);
 556 
 557     if (UseObjectMonitorTable) {
 558       // Need to reload top, clobbered by CAS.
 559       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 560     }
 561     bind(push);
 562     // After successful lock, push object on lock-stack.
 563     movptr(Address(thread, top), obj);
 564     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 565     jmpb(locked);
 566   }
 567 
 568   { // Handle inflated monitor.
 569     bind(inflated);
 570 
 571 #ifndef _LP64
 572     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 573     orl(box, 1);  // set ICC.ZF=0 to indicate failure
 574     jmpb(slow_path);
 575 #else
 576     const Register monitor = t;
 577 
 578     if (!UseObjectMonitorTable) {
 579       assert(mark == monitor, "should be the same here");
 580     } else {
 581       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 582       // Fetch ObjectMonitor* from the cache or take the slow-path.
 583       Label monitor_found;
 584 
 585       // Load cache address
 586       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 587 
 588       const int num_unrolled = 2;
 589       for (int i = 0; i < num_unrolled; i++) {
 590         cmpptr(obj, Address(t));
 591         jccb(Assembler::equal, monitor_found);
 592         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 593       }
 594 
 595       Label loop;
 596 
 597       // Search for obj in cache.
 598       bind(loop);
 599 
 600       // Check for match.
 601       cmpptr(obj, Address(t));
 602       jccb(Assembler::equal, monitor_found);
 603 
 604       // Search until null encountered, guaranteed _null_sentinel at end.
 605       cmpptr(Address(t), 1);
 606       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 607       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 608       jmpb(loop);
 609 
 610       // Cache hit.
 611       bind(monitor_found);
 612       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 613     }
 614     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 615     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 616     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 617 
 618     Label monitor_locked;
 619     // Lock the monitor.
 620 
 621     if (UseObjectMonitorTable) {
 622       // Cache the monitor for unlock before trashing box. On failure to acquire
 623       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 624       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 625     }
 626 
 627     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 628     xorptr(rax_reg, rax_reg);
 629     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 630     lock(); cmpxchgptr(box, owner_address);
 631     jccb(Assembler::equal, monitor_locked);
 632 
 633     // Check if recursive.
 634     cmpptr(box, rax_reg);
 635     jccb(Assembler::notEqual, slow_path);
 636 
 637     // Recursive.
 638     increment(recursions_address);
 639 
 640     bind(monitor_locked);
 641 #endif  // _LP64
 642   }
 643 
 644   bind(locked);
 645   // Set ZF = 1
 646   xorl(rax_reg, rax_reg);
 647 
 648 #ifdef ASSERT
 649   // Check that locked label is reached with ZF set.
 650   Label zf_correct;
 651   Label zf_bad_zero;
 652   jcc(Assembler::zero, zf_correct);
 653   jmp(zf_bad_zero);
 654 #endif
 655 
 656   bind(slow_path);
 657 #ifdef ASSERT
 658   // Check that slow_path label is reached with ZF not set.
 659   jcc(Assembler::notZero, zf_correct);
 660   stop("Fast Lock ZF != 0");
 661   bind(zf_bad_zero);
 662   stop("Fast Lock ZF != 1");
 663   bind(zf_correct);
 664 #endif
 665   // C2 uses the value of ZF to determine the continuation.
 666 }
 667 
 668 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 669   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 670   assert(reg_rax == rax, "Used for CAS");
 671   assert_different_registers(obj, reg_rax, t);
 672 
 673   // Handle inflated monitor.
 674   Label inflated, inflated_check_lock_stack;
 675   // Finish fast unlock successfully.  MUST jump with ZF == 1
 676   Label unlocked, slow_path;
 677 
 678   const Register mark = t;
 679   const Register monitor = t;
 680   const Register top = UseObjectMonitorTable ? t : reg_rax;
 681   const Register box = reg_rax;
 682 
 683   Label dummy;
 684   C2FastUnlockLightweightStub* stub = nullptr;
 685 
 686   if (!Compile::current()->output()->in_scratch_emit_size()) {
 687     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 688     Compile::current()->output()->add_stub(stub);
 689   }
 690 
 691   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 692 
 693   { // Lightweight Unlock
 694 
 695     // Load top.
 696     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 697 
 698     if (!UseObjectMonitorTable) {
 699       // Prefetch mark.
 700       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 701     }
 702 
 703     // Check if obj is top of lock-stack.
 704     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 705     // Top of lock stack was not obj. Must be monitor.
 706     jcc(Assembler::notEqual, inflated_check_lock_stack);
 707 
 708     // Pop lock-stack.
 709     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 710     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 711 
 712     // Check if recursive.
 713     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 714     jcc(Assembler::equal, unlocked);
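         // (If obj was pushed twice in a row - a recursive lightweight lock -
         //  the entry just below the one we popped still holds obj, so there is
         //  nothing more to do.)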
 715 
 716     // We elide the monitor check, let the CAS fail instead.
 717 
 718     if (UseObjectMonitorTable) {
 719       // Load mark.
 720       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 721     }
 722 
 723     // Try to unlock. Transition lock bits 0b00 => 0b01
 724     movptr(reg_rax, mark);
 725     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 726     orptr(mark, markWord::unlocked_value);
 727     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 728     jcc(Assembler::notEqual, push_and_slow_path);
 729     jmp(unlocked);
 730   }
 731 
 732 
 733   { // Handle inflated monitor.
 734     bind(inflated_check_lock_stack);
 735 #ifdef ASSERT
 736     Label check_done;
 737     subl(top, oopSize);
 738     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 739     jcc(Assembler::below, check_done);
 740     cmpptr(obj, Address(thread, top));
 741     jccb(Assembler::notEqual, inflated_check_lock_stack);
 742     stop("Fast Unlock lock on stack");
 743     bind(check_done);
 744     if (UseObjectMonitorTable) {
 745       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 746     }
 747     testptr(mark, markWord::monitor_value);
 748     jccb(Assembler::notZero, inflated);
 749     stop("Fast Unlock not monitor");
 750 #endif
 751 
 752     bind(inflated);
 753 
 754 #ifndef _LP64
 755     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 756     orl(t, 1);  // set ICC.ZF=0 to indicate failure
 757     jmpb(slow_path);
 758 #else
 759     if (!UseObjectMonitorTable) {
 760       assert(mark == monitor, "should be the same here");
 761     } else {
 762       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 763       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 764       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 765       cmpptr(monitor, alignof(ObjectMonitor*));
 766       jcc(Assembler::below, slow_path);
 767     }
 768     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 769     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 770     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 771     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 772     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 773     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 774 
 775     Label recursive;
 776 
 777     // Check if recursive.
 778     cmpptr(recursions_address, 0);
 779     jccb(Assembler::notZero, recursive);
 780 
 781     // Set owner to null.
 782     // Release to satisfy the JMM
 783     movptr(owner_address, NULL_WORD);
 784     // We need a full fence after clearing owner to avoid stranding.
 785     // StoreLoad achieves this.
 786     membar(StoreLoad);
 787 
 788     // Check if the entry lists are empty (EntryList first - by convention).
 789     movptr(reg_rax, EntryList_address);
 790     orptr(reg_rax, cxq_address);
 791     jccb(Assembler::zero, unlocked);    // If so we are done.
 792 
 793     // Check if there is a successor.
 794     cmpptr(succ_address, NULL_WORD);
 795     jccb(Assembler::notZero, unlocked); // If so we are done.
 796 
 797     // Save the monitor pointer in the current thread, so we can try to
 798     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 799     if (!UseObjectMonitorTable) {
 800       andptr(monitor, ~(int32_t)markWord::monitor_value);
 801     }
 802     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 803 
 804     orl(t, 1); // Fast Unlock ZF = 0
 805     jmpb(slow_path);
 806 
 807     // Recursive unlock.
 808     bind(recursive);
 809     decrement(recursions_address);
 810 #endif  // _LP64
 811   }
 812 
 813   bind(unlocked);
 814   xorl(t, t); // Fast Unlock ZF = 1
 815 
 816 #ifdef ASSERT
 817   // Check that unlocked label is reached with ZF set.
 818   Label zf_correct;
 819   Label zf_bad_zero;
 820   jcc(Assembler::zero, zf_correct);
 821   jmp(zf_bad_zero);
 822 #endif
 823 
 824   bind(slow_path);
 825   if (stub != nullptr) {
 826     bind(stub->slow_path_continuation());
 827   }
 828 #ifdef ASSERT
 829   // Check that the slow_path / slow_path_continuation() label is reached with ZF not set.
 830   jcc(Assembler::notZero, zf_correct);
 831   stop("Fast Unlock ZF != 0");
 832   bind(zf_bad_zero);
 833   stop("Fast Unlock ZF != 1");
 834   bind(zf_correct);
 835 #endif
 836   // C2 uses the value of ZF to determine the continuation.
 837 }
 838 
 839 //-------------------------------------------------------------------------------------------
 840 // Generic instruction support for C2 code generation in .ad files
 841 
 842 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 843   if (dst != src) {
 844     movdqu(dst, src);
 845   }
 846   if (opcode == Op_AbsVD) {
 847     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 848   } else {
 849     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 850     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 851   }
 852 }
 853 
 854 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 855   if (opcode == Op_AbsVD) {
 856     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 857   } else {
 858     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 859     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 860   }
 861 }
 862 
 863 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 864   if (dst != src) {
 865     movdqu(dst, src);
 866   }
 867   if (opcode == Op_AbsVF) {
 868     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 869   } else {
 870     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 871     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 872   }
 873 }
 874 
 875 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 876   if (opcode == Op_AbsVF) {
 877     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 878   } else {
 879     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 880     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 881   }
 882 }
 883 
 884 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 885   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 886   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 887 
 888   if (opcode == Op_MinV) {
 889     if (elem_bt == T_BYTE) {
 890       pminsb(dst, src);
 891     } else if (elem_bt == T_SHORT) {
 892       pminsw(dst, src);
 893     } else if (elem_bt == T_INT) {
 894       pminsd(dst, src);
 895     } else {
 896       assert(elem_bt == T_LONG, "required");
 897       assert(tmp == xmm0, "required");
 898       assert_different_registers(dst, src, tmp);
 899       movdqu(xmm0, dst);
 900       pcmpgtq(xmm0, src);
 901       blendvpd(dst, src);  // xmm0 as mask
 902     }
 903   } else { // opcode == Op_MaxV
 904     if (elem_bt == T_BYTE) {
 905       pmaxsb(dst, src);
 906     } else if (elem_bt == T_SHORT) {
 907       pmaxsw(dst, src);
 908     } else if (elem_bt == T_INT) {
 909       pmaxsd(dst, src);
 910     } else {
 911       assert(elem_bt == T_LONG, "required");
 912       assert(tmp == xmm0, "required");
 913       assert_different_registers(dst, src, tmp);
 914       movdqu(xmm0, src);
 915       pcmpgtq(xmm0, dst);
 916       blendvpd(dst, src);  // xmm0 as mask
 917     }
 918   }
 919 }
 920 
 921 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 922                                   XMMRegister src1, Address src2, int vlen_enc) {
 923   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 924   if (opcode == Op_UMinV) {
 925     switch(elem_bt) {
 926       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 927       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 928       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 929       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 930       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 931     }
 932   } else {
 933     assert(opcode == Op_UMaxV, "required");
 934     switch(elem_bt) {
 935       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 936       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 937       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 938       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 939       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 940     }
 941   }
 942 }
 943 
 944 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 945   // For optimality, leverage a full vector width of 512 bits
 946   // for operations over smaller vector sizes on AVX512 targets.
 947   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 948     if (opcode == Op_UMaxV) {
 949       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 950     } else {
 951       assert(opcode == Op_UMinV, "required");
 952       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 953     }
 954   } else {
 955     // T1 = -1
 956     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 957     // T1 = -1 << 63
 958     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 959     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 960     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 961     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 962     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 963     // Mask = T2 > T1
 964     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
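         // Worked example of the sign-bias trick (illustrative): adding 1 << 63
         // flips the sign bit, so an unsigned compare becomes a signed one.
         // E.g. comparing 1 and 0xFFFFFFFFFFFFFFFF as unsigned values:
         //   biased(1)                  = 0x8000000000000001  (very negative as signed)
         //   biased(0xFFFFFFFFFFFFFFFF) = 0x7FFFFFFFFFFFFFFF  (largest signed value)
         // so the signed "greater than" above picks the same winner as the
         // unsigned comparison would.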
 965     if (opcode == Op_UMaxV) {
 966       // Res = Mask ? Src2 : Src1
 967       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 968     } else {
 969       // Res = Mask ? Src1 : Src2
 970       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 971     }
 972   }
 973 }
 974 
 975 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 976                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 977   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 978   if (opcode == Op_UMinV) {
 979     switch(elem_bt) {
 980       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 981       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 982       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 983       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 984       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 985     }
 986   } else {
 987     assert(opcode == Op_UMaxV, "required");
 988     switch(elem_bt) {
 989       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 990       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 991       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 992       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 993       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 994     }
 995   }
 996 }
 997 
 998 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 999                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1000                                  int vlen_enc) {
1001   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1002 
1003   if (opcode == Op_MinV) {
1004     if (elem_bt == T_BYTE) {
1005       vpminsb(dst, src1, src2, vlen_enc);
1006     } else if (elem_bt == T_SHORT) {
1007       vpminsw(dst, src1, src2, vlen_enc);
1008     } else if (elem_bt == T_INT) {
1009       vpminsd(dst, src1, src2, vlen_enc);
1010     } else {
1011       assert(elem_bt == T_LONG, "required");
1012       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1013         vpminsq(dst, src1, src2, vlen_enc);
1014       } else {
1015         assert_different_registers(dst, src1, src2);
1016         vpcmpgtq(dst, src1, src2, vlen_enc);
1017         vblendvpd(dst, src1, src2, dst, vlen_enc);
1018       }
1019     }
1020   } else { // opcode == Op_MaxV
1021     if (elem_bt == T_BYTE) {
1022       vpmaxsb(dst, src1, src2, vlen_enc);
1023     } else if (elem_bt == T_SHORT) {
1024       vpmaxsw(dst, src1, src2, vlen_enc);
1025     } else if (elem_bt == T_INT) {
1026       vpmaxsd(dst, src1, src2, vlen_enc);
1027     } else {
1028       assert(elem_bt == T_LONG, "required");
1029       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1030         vpmaxsq(dst, src1, src2, vlen_enc);
1031       } else {
1032         assert_different_registers(dst, src1, src2);
1033         vpcmpgtq(dst, src1, src2, vlen_enc);
1034         vblendvpd(dst, src2, src1, dst, vlen_enc);
1035       }
1036     }
1037   }
1038 }
1039 
1040 // Float/Double min max
1041 
1042 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1043                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1044                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1045                                    int vlen_enc) {
1046   assert(UseAVX > 0, "required");
1047   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1048          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1049   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1050   assert_different_registers(a, tmp, atmp, btmp);
1051   assert_different_registers(b, tmp, atmp, btmp);
1052 
1053   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1054   bool is_double_word = is_double_word_type(elem_bt);
1055 
1056   /* Note on 'non-obvious' assembly sequence:
1057    *
1058    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1059    * and Java on how they handle floats:
1060    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1061    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1062    *
1063    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1064    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1065    *                (only useful when signs differ, noop otherwise)
1066    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1067    *
1068    *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1069    *   btmp = (b < +0.0) ? a : b
1070    *   atmp = (b < +0.0) ? b : a
1071    *   Tmp  = Max_Float(atmp , btmp)
1072    *   Res  = (atmp == NaN) ? atmp : Tmp
1073    */
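
       /* Added sketch for completeness (mirrors the blend/compare code below):
        * the min[FD] variant uses the sign of 'a' as the blend mask:
        *   btmp = (a < +0.0) ? a : b
        *   atmp = (a < +0.0) ? b : a
        *   Tmp  = Min_Float(atmp , btmp)
        *   Res  = (atmp == NaN) ? atmp : Tmp
        */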
1074 
1075   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1076   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1077   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1078   XMMRegister mask;
1079 
1080   if (!is_double_word && is_min) {
1081     mask = a;
1082     vblend = &MacroAssembler::vblendvps;
1083     vmaxmin = &MacroAssembler::vminps;
1084     vcmp = &MacroAssembler::vcmpps;
1085   } else if (!is_double_word && !is_min) {
1086     mask = b;
1087     vblend = &MacroAssembler::vblendvps;
1088     vmaxmin = &MacroAssembler::vmaxps;
1089     vcmp = &MacroAssembler::vcmpps;
1090   } else if (is_double_word && is_min) {
1091     mask = a;
1092     vblend = &MacroAssembler::vblendvpd;
1093     vmaxmin = &MacroAssembler::vminpd;
1094     vcmp = &MacroAssembler::vcmppd;
1095   } else {
1096     assert(is_double_word && !is_min, "sanity");
1097     mask = b;
1098     vblend = &MacroAssembler::vblendvpd;
1099     vmaxmin = &MacroAssembler::vmaxpd;
1100     vcmp = &MacroAssembler::vcmppd;
1101   }
1102 
1103   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1104   XMMRegister maxmin, scratch;
1105   if (dst == btmp) {
1106     maxmin = btmp;
1107     scratch = tmp;
1108   } else {
1109     maxmin = tmp;
1110     scratch = btmp;
1111   }
1112 
1113   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1114   if (precompute_mask && !is_double_word) {
1115     vpsrad(tmp, mask, 32, vlen_enc);
1116     mask = tmp;
1117   } else if (precompute_mask && is_double_word) {
1118     vpxor(tmp, tmp, tmp, vlen_enc);
1119     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1120     mask = tmp;
1121   }
1122 
1123   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1124   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1125   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1126   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1127   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1128 }
1129 
1130 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1131                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1132                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1133                                     int vlen_enc) {
1134   assert(UseAVX > 2, "required");
1135   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1136          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1137   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1138   assert_different_registers(dst, a, atmp, btmp);
1139   assert_different_registers(dst, b, atmp, btmp);
1140 
1141   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1142   bool is_double_word = is_double_word_type(elem_bt);
1143   bool merge = true;
1144 
1145   if (!is_double_word && is_min) {
1146     evpmovd2m(ktmp, a, vlen_enc);
1147     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1148     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1149     vminps(dst, atmp, btmp, vlen_enc);
1150     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1151     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1152   } else if (!is_double_word && !is_min) {
1153     evpmovd2m(ktmp, b, vlen_enc);
1154     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1155     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1156     vmaxps(dst, atmp, btmp, vlen_enc);
1157     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1158     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1159   } else if (is_double_word && is_min) {
1160     evpmovq2m(ktmp, a, vlen_enc);
1161     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1162     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1163     vminpd(dst, atmp, btmp, vlen_enc);
1164     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1165     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1166   } else {
1167     assert(is_double_word && !is_min, "sanity");
1168     evpmovq2m(ktmp, b, vlen_enc);
1169     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1170     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1171     vmaxpd(dst, atmp, btmp, vlen_enc);
1172     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1173     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1174   }
1175 }
1176 
1177 // Float/Double signum
1178 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1179   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1180 
1181   Label DONE_LABEL;
1182 
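       // Summary of the cases handled below (dst holds the argument on entry;
       // 'zero' and 'one' hold 0.0 and 1.0 respectively):
       //   arg > 0.0            -> 1.0
       //   arg < 0.0            -> -1.0 (1.0 with its sign bit flipped)
       //   arg is +/-0.0 or NaN -> arg is returned unchanged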
1183   if (opcode == Op_SignumF) {
1184     assert(UseSSE > 0, "required");
1185     ucomiss(dst, zero);
1186     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1187     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1188     movflt(dst, one);
1189     jcc(Assembler::above, DONE_LABEL);
1190     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1191   } else if (opcode == Op_SignumD) {
1192     assert(UseSSE > 1, "required");
1193     ucomisd(dst, zero);
1194     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1195     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1196     movdbl(dst, one);
1197     jcc(Assembler::above, DONE_LABEL);
1198     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1199   }
1200 
1201   bind(DONE_LABEL);
1202 }
1203 
1204 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1205   if (sign) {
1206     pmovsxbw(dst, src);
1207   } else {
1208     pmovzxbw(dst, src);
1209   }
1210 }
1211 
1212 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1213   if (sign) {
1214     vpmovsxbw(dst, src, vector_len);
1215   } else {
1216     vpmovzxbw(dst, src, vector_len);
1217   }
1218 }
1219 
1220 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1221   if (sign) {
1222     vpmovsxbd(dst, src, vector_len);
1223   } else {
1224     vpmovzxbd(dst, src, vector_len);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1229   if (sign) {
1230     vpmovsxwd(dst, src, vector_len);
1231   } else {
1232     vpmovzxwd(dst, src, vector_len);
1233   }
1234 }
1235 
1236 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1237                                      int shift, int vector_len) {
1238   if (opcode == Op_RotateLeftV) {
1239     if (etype == T_INT) {
1240       evprold(dst, src, shift, vector_len);
1241     } else {
1242       assert(etype == T_LONG, "expected type T_LONG");
1243       evprolq(dst, src, shift, vector_len);
1244     }
1245   } else {
1246     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1247     if (etype == T_INT) {
1248       evprord(dst, src, shift, vector_len);
1249     } else {
1250       assert(etype == T_LONG, "expected type T_LONG");
1251       evprorq(dst, src, shift, vector_len);
1252     }
1253   }
1254 }
1255 
1256 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1257                                      XMMRegister shift, int vector_len) {
1258   if (opcode == Op_RotateLeftV) {
1259     if (etype == T_INT) {
1260       evprolvd(dst, src, shift, vector_len);
1261     } else {
1262       assert(etype == T_LONG, "expected type T_LONG");
1263       evprolvq(dst, src, shift, vector_len);
1264     }
1265   } else {
1266     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1267     if (etype == T_INT) {
1268       evprorvd(dst, src, shift, vector_len);
1269     } else {
1270       assert(etype == T_LONG, "expected type T_LONG");
1271       evprorvq(dst, src, shift, vector_len);
1272     }
1273   }
1274 }
1275 
1276 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1277   if (opcode == Op_RShiftVI) {
1278     psrad(dst, shift);
1279   } else if (opcode == Op_LShiftVI) {
1280     pslld(dst, shift);
1281   } else {
1282     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1283     psrld(dst, shift);
1284   }
1285 }
1286 
1287 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1288   switch (opcode) {
1289     case Op_RShiftVI:  psrad(dst, shift); break;
1290     case Op_LShiftVI:  pslld(dst, shift); break;
1291     case Op_URShiftVI: psrld(dst, shift); break;
1292 
1293     default: assert(false, "%s", NodeClassNames[opcode]);
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1298   if (opcode == Op_RShiftVI) {
1299     vpsrad(dst, nds, shift, vector_len);
1300   } else if (opcode == Op_LShiftVI) {
1301     vpslld(dst, nds, shift, vector_len);
1302   } else {
1303     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1304     vpsrld(dst, nds, shift, vector_len);
1305   }
1306 }
1307 
1308 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1309   switch (opcode) {
1310     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1311     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1312     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1313 
1314     default: assert(false, "%s", NodeClassNames[opcode]);
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1319   switch (opcode) {
1320     case Op_RShiftVB:  // fall-through
1321     case Op_RShiftVS:  psraw(dst, shift); break;
1322 
1323     case Op_LShiftVB:  // fall-through
1324     case Op_LShiftVS:  psllw(dst, shift);   break;
1325 
1326     case Op_URShiftVS: // fall-through
1327     case Op_URShiftVB: psrlw(dst, shift);  break;
1328 
1329     default: assert(false, "%s", NodeClassNames[opcode]);
1330   }
1331 }
1332 
1333 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1334   switch (opcode) {
1335     case Op_RShiftVB:  // fall-through
1336     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1337 
1338     case Op_LShiftVB:  // fall-through
1339     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1340 
1341     case Op_URShiftVS: // fall-through
1342     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1343 
1344     default: assert(false, "%s", NodeClassNames[opcode]);
1345   }
1346 }
1347 
1348 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1349   switch (opcode) {
1350     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1351     case Op_LShiftVL:  psllq(dst, shift); break;
1352     case Op_URShiftVL: psrlq(dst, shift); break;
1353 
1354     default: assert(false, "%s", NodeClassNames[opcode]);
1355   }
1356 }
1357 
1358 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1359   if (opcode == Op_RShiftVL) {
1360     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1361   } else if (opcode == Op_LShiftVL) {
1362     psllq(dst, shift);
1363   } else {
1364     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1365     psrlq(dst, shift);
1366   }
1367 }
1368 
1369 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1370   switch (opcode) {
1371     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1372     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1373     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1374 
1375     default: assert(false, "%s", NodeClassNames[opcode]);
1376   }
1377 }
1378 
1379 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1380   if (opcode == Op_RShiftVL) {
1381     evpsraq(dst, nds, shift, vector_len);
1382   } else if (opcode == Op_LShiftVL) {
1383     vpsllq(dst, nds, shift, vector_len);
1384   } else {
1385     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1386     vpsrlq(dst, nds, shift, vector_len);
1387   }
1388 }
1389 
1390 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1391   switch (opcode) {
1392     case Op_RShiftVB:  // fall-through
1393     case Op_RShiftVS:  // fall-through
1394     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1395 
1396     case Op_LShiftVB:  // fall-through
1397     case Op_LShiftVS:  // fall-through
1398     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1399 
1400     case Op_URShiftVB: // fall-through
1401     case Op_URShiftVS: // fall-through
1402     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1403 
1404     default: assert(false, "%s", NodeClassNames[opcode]);
1405   }
1406 }
1407 
1408 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1409   switch (opcode) {
1410     case Op_RShiftVB:  // fall-through
1411     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1412 
1413     case Op_LShiftVB:  // fall-through
1414     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1415 
1416     case Op_URShiftVB: // fall-through
1417     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1418 
1419     default: assert(false, "%s", NodeClassNames[opcode]);
1420   }
1421 }
1422 
1423 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1424   assert(UseAVX >= 2, "required");
1425   switch (opcode) {
1426     case Op_RShiftVL: {
1427       if (UseAVX > 2) {
1428         assert(tmp == xnoreg, "not used");
1429         if (!VM_Version::supports_avx512vl()) {
1430           vlen_enc = Assembler::AVX_512bit;
1431         }
1432         evpsravq(dst, src, shift, vlen_enc);
1433       } else {
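             // Pre-AVX-512 there is no variable arithmetic right shift for 64-bit lanes,
             // so emulate it with logical shifts using the identity
             //   sra(x, s) == (srl(x, s) ^ m) - m, where m == srl(0x8000000000000000, s),
             // with the per-lane sign-bit constant loaded from vector_long_sign_mask().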
1434         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1435         vpsrlvq(dst, src, shift, vlen_enc);
1436         vpsrlvq(tmp, tmp, shift, vlen_enc);
1437         vpxor(dst, dst, tmp, vlen_enc);
1438         vpsubq(dst, dst, tmp, vlen_enc);
1439       }
1440       break;
1441     }
1442     case Op_LShiftVL: {
1443       assert(tmp == xnoreg, "not used");
1444       vpsllvq(dst, src, shift, vlen_enc);
1445       break;
1446     }
1447     case Op_URShiftVL: {
1448       assert(tmp == xnoreg, "not used");
1449       vpsrlvq(dst, src, shift, vlen_enc);
1450       break;
1451     }
1452     default: assert(false, "%s", NodeClassNames[opcode]);
1453   }
1454 }
1455 
1456 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1457 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1458   assert(opcode == Op_LShiftVB ||
1459          opcode == Op_RShiftVB ||
1460          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1461   bool sign = (opcode != Op_URShiftVB);
1462   assert(vector_len == 0, "required");
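       // Widen each byte to a dword, perform a 32-bit variable shift, mask the result
       // back to byte range, then narrow the dwords to words via vpackusdw.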
1463   vextendbd(sign, dst, src, 1);
1464   vpmovzxbd(vtmp, shift, 1);
1465   varshiftd(opcode, dst, dst, vtmp, 1);
1466   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1467   vextracti128_high(vtmp, dst);
1468   vpackusdw(dst, dst, vtmp, 0);
1469 }
1470 
1471 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1472 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1473   assert(opcode == Op_LShiftVB ||
1474          opcode == Op_RShiftVB ||
1475          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1476   bool sign = (opcode != Op_URShiftVB);
1477   int ext_vector_len = vector_len + 1;
1478   vextendbw(sign, dst, src, ext_vector_len);
1479   vpmovzxbw(vtmp, shift, ext_vector_len);
1480   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1481   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1482   if (vector_len == 0) {
1483     vextracti128_high(vtmp, dst);
1484     vpackuswb(dst, dst, vtmp, vector_len);
1485   } else {
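         // The vpand above already confines each word to 0..255, so the unsigned
         // saturating pack cannot clamp. vpackuswb packs within each 128-bit lane,
         // so vpermq(0xD8) reorders the quadwords to restore element order.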
1486     vextracti64x4_high(vtmp, dst);
1487     vpackuswb(dst, dst, vtmp, vector_len);
1488     vpermq(dst, dst, 0xD8, vector_len);
1489   }
1490 }
1491 
1492 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1493   switch(typ) {
1494     case T_BYTE:
1495       pinsrb(dst, val, idx);
1496       break;
1497     case T_SHORT:
1498       pinsrw(dst, val, idx);
1499       break;
1500     case T_INT:
1501       pinsrd(dst, val, idx);
1502       break;
1503     case T_LONG:
1504       pinsrq(dst, val, idx);
1505       break;
1506     default:
1507       assert(false,"Should not reach here.");
1508       break;
1509   }
1510 }
1511 
1512 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1513   switch(typ) {
1514     case T_BYTE:
1515       vpinsrb(dst, src, val, idx);
1516       break;
1517     case T_SHORT:
1518       vpinsrw(dst, src, val, idx);
1519       break;
1520     case T_INT:
1521       vpinsrd(dst, src, val, idx);
1522       break;
1523     case T_LONG:
1524       vpinsrq(dst, src, val, idx);
1525       break;
1526     default:
1527       assert(false,"Should not reach here.");
1528       break;
1529   }
1530 }
1531 
1532 #ifdef _LP64
1533 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1534                                                 XMMRegister dst, Register base,
1535                                                 Register idx_base,
1536                                                 Register offset, Register mask,
1537                                                 Register mask_idx, Register rtmp,
1538                                                 int vlen_enc) {
1539   vpxor(dst, dst, dst, vlen_enc);
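       // btq copies bit 'mask_idx' of 'mask' into CF; a clear bit skips the load
       // below, leaving the corresponding lane at the zero set up above.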
1540   if (elem_bt == T_SHORT) {
1541     for (int i = 0; i < 4; i++) {
1542       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1543       Label skip_load;
1544       btq(mask, mask_idx);
1545       jccb(Assembler::carryClear, skip_load);
1546       movl(rtmp, Address(idx_base, i * 4));
1547       if (offset != noreg) {
1548         addl(rtmp, offset);
1549       }
1550       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1551       bind(skip_load);
1552       incq(mask_idx);
1553     }
1554   } else {
1555     assert(elem_bt == T_BYTE, "");
1556     for (int i = 0; i < 8; i++) {
1557       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1558       Label skip_load;
1559       btq(mask, mask_idx);
1560       jccb(Assembler::carryClear, skip_load);
1561       movl(rtmp, Address(idx_base, i * 4));
1562       if (offset != noreg) {
1563         addl(rtmp, offset);
1564       }
1565       pinsrb(dst, Address(base, rtmp), i);
1566       bind(skip_load);
1567       incq(mask_idx);
1568     }
1569   }
1570 }
1571 #endif // _LP64
1572 
1573 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1574                                          Register base, Register idx_base,
1575                                          Register offset, Register rtmp,
1576                                          int vlen_enc) {
1577   vpxor(dst, dst, dst, vlen_enc);
1578   if (elem_bt == T_SHORT) {
1579     for (int i = 0; i < 4; i++) {
1580       // dst[i] = src[offset + idx_base[i]]
1581       movl(rtmp, Address(idx_base, i * 4));
1582       if (offset != noreg) {
1583         addl(rtmp, offset);
1584       }
1585       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1586     }
1587   } else {
1588     assert(elem_bt == T_BYTE, "");
1589     for (int i = 0; i < 8; i++) {
1590       // dst[i] = src[offset + idx_base[i]]
1591       movl(rtmp, Address(idx_base, i * 4));
1592       if (offset != noreg) {
1593         addl(rtmp, offset);
1594       }
1595       pinsrb(dst, Address(base, rtmp), i);
1596     }
1597   }
1598 }
1599 
1600 /*
1601  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1602  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1603  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1604  * permutation to place the slice into the appropriate vector lane
1605  * locations in the destination vector. The following pseudo code describes the
1606  * algorithm in detail:
1607  *
1608  * DST_VEC = ZERO_VEC
1609  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1610  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1611  * FOREACH_ITER:
1612  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1613  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1614  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1615  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1616  *
1617  * With each iteration, the doubleword permute indices (0, 1) corresponding
1618  * to the gathered quadword get right shifted by two lane positions.
1619  *
1620  */
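     /*
      * Illustrative trace (assuming T_BYTE elements and a 32-byte destination):
      * each iteration gathers 8 bytes into the low quadword (dwords 0..1) of
      * TMP_VEC_64. With PERM_INDEX = {0, 1, 2, ...} the permute leaves that slice
      * in dword lanes 0..1; after PERM_INDEX -= 2 the (wrapped) indices place the
      * next slice in lanes 2..3, then 4..5, and so on, until 'length' reaches 0.
      */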
1621 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1622                                         Register base, Register idx_base,
1623                                         Register offset, Register mask,
1624                                         XMMRegister xtmp1, XMMRegister xtmp2,
1625                                         XMMRegister temp_dst, Register rtmp,
1626                                         Register mask_idx, Register length,
1627                                         int vector_len, int vlen_enc) {
1628   Label GATHER8_LOOP;
1629   assert(is_subword_type(elem_ty), "");
1630   movl(length, vector_len);
1631   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1632   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1633   vallones(xtmp2, vlen_enc);
1634   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1635   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1636   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1637 
1638   bind(GATHER8_LOOP);
1639     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1640     if (mask == noreg) {
1641       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1642     } else {
1643       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1644     }
1645     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1646     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1647     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1648     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1649     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1650     vpor(dst, dst, temp_dst, vlen_enc);
1651     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1652     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1653     jcc(Assembler::notEqual, GATHER8_LOOP);
1654 }
1655 
1656 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1657   switch(typ) {
1658     case T_INT:
1659       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1660       break;
1661     case T_FLOAT:
1662       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1663       break;
1664     case T_LONG:
1665       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1666       break;
1667     case T_DOUBLE:
1668       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1669       break;
1670     default:
1671       assert(false,"Should not reach here.");
1672       break;
1673   }
1674 }
1675 
1676 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1677   switch(typ) {
1678     case T_INT:
1679       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1680       break;
1681     case T_FLOAT:
1682       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1683       break;
1684     case T_LONG:
1685       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1686       break;
1687     case T_DOUBLE:
1688       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1689       break;
1690     default:
1691       assert(false,"Should not reach here.");
1692       break;
1693   }
1694 }
1695 
1696 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1697   switch(typ) {
1698     case T_INT:
1699       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1700       break;
1701     case T_FLOAT:
1702       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1703       break;
1704     case T_LONG:
1705       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1706       break;
1707     case T_DOUBLE:
1708       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1709       break;
1710     default:
1711       assert(false,"Should not reach here.");
1712       break;
1713   }
1714 }
1715 
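     // Expand a vector of boolean bytes (one byte per element, holding 0 or 1)
     // into a full-width element mask (0 / all-ones per element): negate the
     // bytes to get 0 / 0xFF, then sign-extend to the element size.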
1716 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1717   if (vlen_in_bytes <= 16) {
1718     pxor (dst, dst);
1719     psubb(dst, src);
1720     switch (elem_bt) {
1721       case T_BYTE:   /* nothing to do */ break;
1722       case T_SHORT:  pmovsxbw(dst, dst); break;
1723       case T_INT:    pmovsxbd(dst, dst); break;
1724       case T_FLOAT:  pmovsxbd(dst, dst); break;
1725       case T_LONG:   pmovsxbq(dst, dst); break;
1726       case T_DOUBLE: pmovsxbq(dst, dst); break;
1727 
1728       default: assert(false, "%s", type2name(elem_bt));
1729     }
1730   } else {
1731     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1732     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1733 
1734     vpxor (dst, dst, dst, vlen_enc);
1735     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1736 
1737     switch (elem_bt) {
1738       case T_BYTE:   /* nothing to do */            break;
1739       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1740       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1741       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1742       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1743       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1744 
1745       default: assert(false, "%s", type2name(elem_bt));
1746     }
1747   }
1748 }
1749 
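     // Produce an opmask register from a vector of boolean bytes. Without
     // AVX512BW+VL for this length (novlbwdq), evpmovb2m is unavailable, so widen
     // the bytes to dwords and compare them against the mask-bit constant instead;
     // otherwise negate to 0 / 0xFF bytes and extract the sign bits with evpmovb2m.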
1750 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1751   if (novlbwdq) {
1752     vpmovsxbd(xtmp, src, vlen_enc);
1753     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1754             Assembler::eq, true, vlen_enc, noreg);
1755   } else {
1756     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1757     vpsubb(xtmp, xtmp, src, vlen_enc);
1758     evpmovb2m(dst, xtmp, vlen_enc);
1759   }
1760 }
1761 
1762 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1763   if (is_integral_type(bt)) {
1764     switch (vlen_in_bytes) {
1765       case 4:  movdl(dst, src);   break;
1766       case 8:  movq(dst, src);    break;
1767       case 16: movdqu(dst, src);  break;
1768       case 32: vmovdqu(dst, src); break;
1769       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1770       default: ShouldNotReachHere();
1771     }
1772   } else {
1773     switch (vlen_in_bytes) {
1774       case 4:  movflt(dst, src); break;
1775       case 8:  movdbl(dst, src); break;
1776       case 16: movups(dst, src); break;
1777       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1778       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1779       default: ShouldNotReachHere();
1780     }
1781   }
1782 }
1783 
1784 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1785   assert(rscratch != noreg || always_reachable(src), "missing");
1786 
1787   if (reachable(src)) {
1788     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1789   } else {
1790     lea(rscratch, src);
1791     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1792   }
1793 }
1794 
1795 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1796   int vlen_enc = vector_length_encoding(vlen);
1797   if (VM_Version::supports_avx()) {
1798     if (bt == T_LONG) {
1799       if (VM_Version::supports_avx2()) {
1800         vpbroadcastq(dst, src, vlen_enc);
1801       } else {
1802         vmovddup(dst, src, vlen_enc);
1803       }
1804     } else if (bt == T_DOUBLE) {
1805       if (vlen_enc != Assembler::AVX_128bit) {
1806         vbroadcastsd(dst, src, vlen_enc, noreg);
1807       } else {
1808         vmovddup(dst, src, vlen_enc);
1809       }
1810     } else {
1811       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1812         vpbroadcastd(dst, src, vlen_enc);
1813       } else {
1814         vbroadcastss(dst, src, vlen_enc);
1815       }
1816     }
1817   } else if (VM_Version::supports_sse3()) {
1818     movddup(dst, src);
1819   } else {
1820     load_vector(bt, dst, src, vlen);
1821   }
1822 }
1823 
1824 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1825   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
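       // Resulting byte offsets from the table base: B=0, S=64, I=128, L=192, F=256, D=320.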
1826   int offset = exact_log2(type2aelembytes(bt)) << 6;
1827   if (is_floating_point_type(bt)) {
1828     offset += 128;
1829   }
1830   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1831   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1832 }
1833 
1834 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1835 
1836 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1837   int vector_len = Assembler::AVX_128bit;
1838 
1839   switch (opcode) {
1840     case Op_AndReductionV:  pand(dst, src); break;
1841     case Op_OrReductionV:   por (dst, src); break;
1842     case Op_XorReductionV:  pxor(dst, src); break;
1843     case Op_MinReductionV:
1844       switch (typ) {
1845         case T_BYTE:        pminsb(dst, src); break;
1846         case T_SHORT:       pminsw(dst, src); break;
1847         case T_INT:         pminsd(dst, src); break;
1848         case T_LONG:        assert(UseAVX > 2, "required");
1849                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1850         default:            assert(false, "wrong type");
1851       }
1852       break;
1853     case Op_MaxReductionV:
1854       switch (typ) {
1855         case T_BYTE:        pmaxsb(dst, src); break;
1856         case T_SHORT:       pmaxsw(dst, src); break;
1857         case T_INT:         pmaxsd(dst, src); break;
1858         case T_LONG:        assert(UseAVX > 2, "required");
1859                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1860         default:            assert(false, "wrong type");
1861       }
1862       break;
1863     case Op_AddReductionVF: addss(dst, src); break;
1864     case Op_AddReductionVD: addsd(dst, src); break;
1865     case Op_AddReductionVI:
1866       switch (typ) {
1867         case T_BYTE:        paddb(dst, src); break;
1868         case T_SHORT:       paddw(dst, src); break;
1869         case T_INT:         paddd(dst, src); break;
1870         default:            assert(false, "wrong type");
1871       }
1872       break;
1873     case Op_AddReductionVL: paddq(dst, src); break;
1874     case Op_MulReductionVF: mulss(dst, src); break;
1875     case Op_MulReductionVD: mulsd(dst, src); break;
1876     case Op_MulReductionVI:
1877       switch (typ) {
1878         case T_SHORT:       pmullw(dst, src); break;
1879         case T_INT:         pmulld(dst, src); break;
1880         default:            assert(false, "wrong type");
1881       }
1882       break;
1883     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1884                             evpmullq(dst, dst, src, vector_len); break;
1885     default:                assert(false, "wrong opcode");
1886   }
1887 }
1888 
1889 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1890   switch (opcode) {
1891     case Op_AddReductionVF: addps(dst, src); break;
1892     case Op_AddReductionVD: addpd(dst, src); break;
1893     case Op_MulReductionVF: mulps(dst, src); break;
1894     case Op_MulReductionVD: mulpd(dst, src); break;
1895     default:                assert(false, "%s", NodeClassNames[opcode]);
1896   }
1897 }
1898 
1899 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1900   int vector_len = Assembler::AVX_256bit;
1901 
1902   switch (opcode) {
1903     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1904     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1905     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1906     case Op_MinReductionV:
1907       switch (typ) {
1908         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1909         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1910         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1911         case T_LONG:        assert(UseAVX > 2, "required");
1912                             vpminsq(dst, src1, src2, vector_len); break;
1913         default:            assert(false, "wrong type");
1914       }
1915       break;
1916     case Op_MaxReductionV:
1917       switch (typ) {
1918         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1919         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1920         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1921         case T_LONG:        assert(UseAVX > 2, "required");
1922                             vpmaxsq(dst, src1, src2, vector_len); break;
1923         default:            assert(false, "wrong type");
1924       }
1925       break;
1926     case Op_AddReductionVI:
1927       switch (typ) {
1928         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1929         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1930         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1931         default:            assert(false, "wrong type");
1932       }
1933       break;
1934     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1935     case Op_MulReductionVI:
1936       switch (typ) {
1937         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1938         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1939         default:            assert(false, "wrong type");
1940       }
1941       break;
1942     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1943     default:                assert(false, "wrong opcode");
1944   }
1945 }
1946 
1947 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1948   int vector_len = Assembler::AVX_256bit;
1949 
1950   switch (opcode) {
1951     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1952     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1953     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1954     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1955     default:                assert(false, "%s", NodeClassNames[opcode]);
1956   }
1957 }
1958 
1959 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1960                                   XMMRegister dst, XMMRegister src,
1961                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1962   switch (opcode) {
1963     case Op_AddReductionVF:
1964     case Op_MulReductionVF:
1965       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1966       break;
1967 
1968     case Op_AddReductionVD:
1969     case Op_MulReductionVD:
1970       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1971       break;
1972 
1973     default: assert(false, "wrong opcode");
1974   }
1975 }
1976 
1977 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1978                                             XMMRegister dst, XMMRegister src,
1979                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1980   switch (opcode) {
1981     case Op_AddReductionVF:
1982     case Op_MulReductionVF:
1983       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1984       break;
1985 
1986     case Op_AddReductionVD:
1987     case Op_MulReductionVD:
1988       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1989       break;
1990 
1991     default: assert(false, "%s", NodeClassNames[opcode]);
1992   }
1993 }
1994 
1995 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1996                              Register dst, Register src1, XMMRegister src2,
1997                              XMMRegister vtmp1, XMMRegister vtmp2) {
1998   switch (vlen) {
1999     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2000     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003 
2004     default: assert(false, "wrong vector length");
2005   }
2006 }
2007 
2008 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2009                              Register dst, Register src1, XMMRegister src2,
2010                              XMMRegister vtmp1, XMMRegister vtmp2) {
2011   switch (vlen) {
2012     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2013     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2014     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2016 
2017     default: assert(false, "wrong vector length");
2018   }
2019 }
2020 
2021 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2022                              Register dst, Register src1, XMMRegister src2,
2023                              XMMRegister vtmp1, XMMRegister vtmp2) {
2024   switch (vlen) {
2025     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2026     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2027     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2028     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2029 
2030     default: assert(false, "wrong vector length");
2031   }
2032 }
2033 
2034 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2035                              Register dst, Register src1, XMMRegister src2,
2036                              XMMRegister vtmp1, XMMRegister vtmp2) {
2037   switch (vlen) {
2038     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2039     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2040     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2041     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2042 
2043     default: assert(false, "wrong vector length");
2044   }
2045 }
2046 
2047 #ifdef _LP64
2048 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2049                              Register dst, Register src1, XMMRegister src2,
2050                              XMMRegister vtmp1, XMMRegister vtmp2) {
2051   switch (vlen) {
2052     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2053     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2054     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2055 
2056     default: assert(false, "wrong vector length");
2057   }
2058 }
2059 #endif // _LP64
2060 
2061 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2062   switch (vlen) {
2063     case 2:
2064       assert(vtmp2 == xnoreg, "");
2065       reduce2F(opcode, dst, src, vtmp1);
2066       break;
2067     case 4:
2068       assert(vtmp2 == xnoreg, "");
2069       reduce4F(opcode, dst, src, vtmp1);
2070       break;
2071     case 8:
2072       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2073       break;
2074     case 16:
2075       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2076       break;
2077     default: assert(false, "wrong vector length");
2078   }
2079 }
2080 
2081 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2082   switch (vlen) {
2083     case 2:
2084       assert(vtmp2 == xnoreg, "");
2085       reduce2D(opcode, dst, src, vtmp1);
2086       break;
2087     case 4:
2088       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2089       break;
2090     case 8:
2091       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2092       break;
2093     default: assert(false, "wrong vector length");
2094   }
2095 }
2096 
2097 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2098   switch (vlen) {
2099     case 2:
2100       assert(vtmp1 == xnoreg, "");
2101       assert(vtmp2 == xnoreg, "");
2102       unorderedReduce2F(opcode, dst, src);
2103       break;
2104     case 4:
2105       assert(vtmp2 == xnoreg, "");
2106       unorderedReduce4F(opcode, dst, src, vtmp1);
2107       break;
2108     case 8:
2109       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2110       break;
2111     case 16:
2112       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2113       break;
2114     default: assert(false, "wrong vector length");
2115   }
2116 }
2117 
2118 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2119   switch (vlen) {
2120     case 2:
2121       assert(vtmp1 == xnoreg, "");
2122       assert(vtmp2 == xnoreg, "");
2123       unorderedReduce2D(opcode, dst, src);
2124       break;
2125     case 4:
2126       assert(vtmp2 == xnoreg, "");
2127       unorderedReduce4D(opcode, dst, src, vtmp1);
2128       break;
2129     case 8:
2130       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2131       break;
2132     default: assert(false, "wrong vector length");
2133   }
2134 }
2135 
2136 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2137   if (opcode == Op_AddReductionVI) {
2138     if (vtmp1 != src2) {
2139       movdqu(vtmp1, src2);
2140     }
2141     phaddd(vtmp1, vtmp1);
2142   } else {
2143     pshufd(vtmp1, src2, 0x1);
2144     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2145   }
2146   movdl(vtmp2, src1);
2147   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2148   movdl(dst, vtmp1);
2149 }
2150 
2151 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2152   if (opcode == Op_AddReductionVI) {
2153     if (vtmp1 != src2) {
2154       movdqu(vtmp1, src2);
2155     }
2156     phaddd(vtmp1, src2);
2157     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2158   } else {
2159     pshufd(vtmp2, src2, 0xE);
2160     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2161     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2162   }
2163 }
2164 
2165 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   if (opcode == Op_AddReductionVI) {
2167     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2168     vextracti128_high(vtmp2, vtmp1);
2169     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2170     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2171   } else {
2172     vextracti128_high(vtmp1, src2);
2173     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2174     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175   }
2176 }
2177 
2178 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2179   vextracti64x4_high(vtmp2, src2);
2180   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2181   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2182 }
2183 
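     // Reduce 8 bytes: fold bytes 4..7 onto 0..3, then 2..3 onto 0..1, then byte 1
     // onto byte 0; finally combine with the scalar seed in src1 as an int and
     // sign-extend the resulting byte into dst.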
2184 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2185   pshufd(vtmp2, src2, 0x1);
2186   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2187   movdqu(vtmp1, vtmp2);
2188   psrldq(vtmp1, 2);
2189   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2190   movdqu(vtmp2, vtmp1);
2191   psrldq(vtmp2, 1);
2192   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2193   movdl(vtmp2, src1);
2194   pmovsxbd(vtmp1, vtmp1);
2195   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2196   pextrb(dst, vtmp1, 0x0);
2197   movsbl(dst, dst);
2198 }
2199 
2200 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2201   pshufd(vtmp1, src2, 0xE);
2202   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2203   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2204 }
2205 
2206 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2207   vextracti128_high(vtmp2, src2);
2208   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2209   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2210 }
2211 
2212 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2213   vextracti64x4_high(vtmp1, src2);
2214   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2215   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2216 }
2217 
2218 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2219   pmovsxbw(vtmp2, src2);
2220   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2221 }
2222 
2223 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2224   if (UseAVX > 1) {
2225     int vector_len = Assembler::AVX_256bit;
2226     vpmovsxbw(vtmp1, src2, vector_len);
2227     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2228   } else {
2229     pmovsxbw(vtmp2, src2);
2230     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
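         // Bring the upper 8 bytes (dwords 2..3) down and reduce them as well.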
2231     pshufd(vtmp2, src2, 0xE);
2232     pmovsxbw(vtmp2, vtmp2);
2233     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2234   }
2235 }
2236 
2237 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2238   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2239     int vector_len = Assembler::AVX_512bit;
2240     vpmovsxbw(vtmp1, src2, vector_len);
2241     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2242   } else {
2243     assert(UseAVX >= 2,"Should not reach here.");
2244     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2245     vextracti128_high(vtmp2, src2);
2246     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2247   }
2248 }
2249 
2250 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2251   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2252   vextracti64x4_high(vtmp2, src2);
2253   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2254 }
2255 
2256 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2257   if (opcode == Op_AddReductionVI) {
2258     if (vtmp1 != src2) {
2259       movdqu(vtmp1, src2);
2260     }
2261     phaddw(vtmp1, vtmp1);
2262     phaddw(vtmp1, vtmp1);
2263   } else {
2264     pshufd(vtmp2, src2, 0x1);
2265     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2266     movdqu(vtmp1, vtmp2);
2267     psrldq(vtmp1, 2);
2268     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2269   }
2270   movdl(vtmp2, src1);
2271   pmovsxwd(vtmp1, vtmp1);
2272   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2273   pextrw(dst, vtmp1, 0x0);
2274   movswl(dst, dst);
2275 }
2276 
2277 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2278   if (opcode == Op_AddReductionVI) {
2279     if (vtmp1 != src2) {
2280       movdqu(vtmp1, src2);
2281     }
2282     phaddw(vtmp1, src2);
2283   } else {
2284     pshufd(vtmp1, src2, 0xE);
2285     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2286   }
2287   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2288 }
2289 
2290 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2291   if (opcode == Op_AddReductionVI) {
2292     int vector_len = Assembler::AVX_256bit;
2293     vphaddw(vtmp2, src2, src2, vector_len);
2294     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2295   } else {
2296     vextracti128_high(vtmp2, src2);
2297     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2298   }
2299   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2300 }
2301 
2302 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2303   int vector_len = Assembler::AVX_256bit;
2304   vextracti64x4_high(vtmp1, src2);
2305   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2306   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2307 }
2308 
2309 #ifdef _LP64
2310 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2311   pshufd(vtmp2, src2, 0xE);
2312   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2313   movdq(vtmp1, src1);
2314   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2315   movdq(dst, vtmp1);
2316 }
2317 
2318 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2319   vextracti128_high(vtmp1, src2);
2320   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2321   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2322 }
2323 
2324 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2325   vextracti64x4_high(vtmp2, src2);
2326   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2327   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2328 }
2329 
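     // Materialize a predicate with the low 'len' bits set: bzhiq clears every
     // bit of -1 at index >= len, i.e. temp = (1 << len) - 1 for len < 64.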
2330 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2331   mov64(temp, -1L);
2332   bzhiq(temp, temp, len);
2333   kmovql(dst, temp);
2334 }
2335 #endif // _LP64
2336 
2337 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2338   reduce_operation_128(T_FLOAT, opcode, dst, src);
2339   pshufd(vtmp, src, 0x1);
2340   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2341 }
2342 
2343 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2344   reduce2F(opcode, dst, src, vtmp);
2345   pshufd(vtmp, src, 0x2);
2346   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2347   pshufd(vtmp, src, 0x3);
2348   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2349 }
2350 
2351 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2352   reduce4F(opcode, dst, src, vtmp2);
2353   vextractf128_high(vtmp2, src);
2354   reduce4F(opcode, dst, vtmp2, vtmp1);
2355 }
2356 
2357 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2358   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2359   vextracti64x4_high(vtmp1, src);
2360   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2361 }
2362 
2363 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2364   pshufd(dst, src, 0x1);
2365   reduce_operation_128(T_FLOAT, opcode, dst, src);
2366 }
2367 
2368 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2369   pshufd(vtmp, src, 0xE);
2370   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2371   unorderedReduce2F(opcode, dst, vtmp);
2372 }
2373 
2374 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2375   vextractf128_high(vtmp1, src);
2376   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2377   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2378 }
2379 
2380 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2381   vextractf64x4_high(vtmp2, src);
2382   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2383   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2384 }
2385 
2386 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2387   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2388   pshufd(vtmp, src, 0xE);
2389   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2390 }
2391 
2392 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2393   reduce2D(opcode, dst, src, vtmp2);
2394   vextractf128_high(vtmp2, src);
2395   reduce2D(opcode, dst, vtmp2, vtmp1);
2396 }
2397 
2398 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2399   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2400   vextracti64x4_high(vtmp1, src);
2401   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2402 }
2403 
2404 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2405   pshufd(dst, src, 0xE);
2406   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2407 }
2408 
2409 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2410   vextractf128_high(vtmp, src);
2411   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2412   unorderedReduce2D(opcode, dst, vtmp);
2413 }
2414 
2415 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2416   vextractf64x4_high(vtmp2, src);
2417   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2418   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2419 }
2420 
2421 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2422   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2423 }
2424 
2425 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2426   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2427 }
2428 
2429 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2430   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2431 }
2432 
2433 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2434                                  int vec_enc) {
2435   switch(elem_bt) {
2436     case T_INT:
2437     case T_FLOAT:
2438       vmaskmovps(dst, src, mask, vec_enc);
2439       break;
2440     case T_LONG:
2441     case T_DOUBLE:
2442       vmaskmovpd(dst, src, mask, vec_enc);
2443       break;
2444     default:
2445       fatal("Unsupported type %s", type2name(elem_bt));
2446       break;
2447   }
2448 }
2449 
2450 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2451                                  int vec_enc) {
2452   switch(elem_bt) {
2453     case T_INT:
2454     case T_FLOAT:
2455       vmaskmovps(dst, src, mask, vec_enc);
2456       break;
2457     case T_LONG:
2458     case T_DOUBLE:
2459       vmaskmovpd(dst, src, mask, vec_enc);
2460       break;
2461     default:
2462       fatal("Unsupported type %s", type2name(elem_bt));
2463       break;
2464   }
2465 }
2466 
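     // Min/Max reduction over float lanes: each iteration folds the upper half of
     // the live elements onto the lower half (vextract for the 512/256-bit halves,
     // vpermilps for the last two steps) via vminmax_fp, halving the element count
     // until one lane remains; when is_dst_valid, the incoming value in dst is
     // folded in at the end.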
2467 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2468                                           XMMRegister dst, XMMRegister src,
2469                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2470                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2471   const int permconst[] = {1, 14};
2472   XMMRegister wsrc = src;
2473   XMMRegister wdst = xmm_0;
2474   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2475 
2476   int vlen_enc = Assembler::AVX_128bit;
2477   if (vlen == 16) {
2478     vlen_enc = Assembler::AVX_256bit;
2479   }
2480 
2481   for (int i = log2(vlen) - 1; i >=0; i--) {
2482     if (i == 0 && !is_dst_valid) {
2483       wdst = dst;
2484     }
2485     if (i == 3) {
2486       vextracti64x4_high(wtmp, wsrc);
2487     } else if (i == 2) {
2488       vextracti128_high(wtmp, wsrc);
2489     } else { // i = [0,1]
2490       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2491     }
2492     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2493     wsrc = wdst;
2494     vlen_enc = Assembler::AVX_128bit;
2495   }
2496   if (is_dst_valid) {
2497     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2498   }
2499 }
2500 
2501 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2502                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2503                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2504   XMMRegister wsrc = src;
2505   XMMRegister wdst = xmm_0;
2506   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2507   int vlen_enc = Assembler::AVX_128bit;
2508   if (vlen == 8) {
2509     vlen_enc = Assembler::AVX_256bit;
2510   }
2511   for (int i = log2(vlen) - 1; i >=0; i--) {
2512     if (i == 0 && !is_dst_valid) {
2513       wdst = dst;
2514     }
2515     if (i == 1) {
2516       vextracti128_high(wtmp, wsrc);
2517     } else if (i == 2) {
2518       vextracti64x4_high(wtmp, wsrc);
2519     } else {
2520       assert(i == 0, "%d", i);
2521       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2522     }
2523     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2524     wsrc = wdst;
2525     vlen_enc = Assembler::AVX_128bit;
2526   }
2527   if (is_dst_valid) {
2528     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2529   }
2530 }
2531 
2532 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2533   switch (bt) {
2534     case T_BYTE:  pextrb(dst, src, idx); break;
2535     case T_SHORT: pextrw(dst, src, idx); break;
2536     case T_INT:   pextrd(dst, src, idx); break;
2537     case T_LONG:  pextrq(dst, src, idx); break;
2538 
2539     default:
2540       assert(false,"Should not reach here.");
2541       break;
2542   }
2543 }
2544 
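     // Return a register whose low 128 bits contain the lane holding 'elemindex':
     // the source itself for lane 0, otherwise the lane is extracted into dst.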
2545 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2546   int esize =  type2aelembytes(typ);
2547   int elem_per_lane = 16/esize;
2548   int lane = elemindex / elem_per_lane;
2549   int eindex = elemindex % elem_per_lane;
2550 
2551   if (lane >= 2) {
2552     assert(UseAVX > 2, "required");
2553     vextractf32x4(dst, src, lane & 3);
2554     return dst;
2555   } else if (lane > 0) {
2556     assert(UseAVX > 0, "required");
2557     vextractf128(dst, src, lane);
2558     return dst;
2559   } else {
2560     return src;
2561   }
2562 }
2563 
2564 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2565   if (typ == T_BYTE) {
2566     movsbl(dst, dst);
2567   } else if (typ == T_SHORT) {
2568     movswl(dst, dst);
2569   }
2570 }
2571 
2572 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2573   int esize =  type2aelembytes(typ);
2574   int elem_per_lane = 16/esize;
2575   int eindex = elemindex % elem_per_lane;
2576   assert(is_integral_type(typ),"required");
2577 
2578   if (eindex == 0) {
2579     if (typ == T_LONG) {
2580       movq(dst, src);
2581     } else {
2582       movdl(dst, src);
2583       movsxl(typ, dst);
2584     }
2585   } else {
2586     extract(typ, dst, src, eindex);
2587     movsxl(typ, dst);
2588   }
2589 }
2590 
2591 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2592   int esize =  type2aelembytes(typ);
2593   int elem_per_lane = 16/esize;
2594   int eindex = elemindex % elem_per_lane;
2595   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2596 
2597   if (eindex == 0) {
2598     movq(dst, src);
2599   } else {
2600     if (typ == T_FLOAT) {
2601       if (UseAVX == 0) {
2602         movdqu(dst, src);
2603         shufps(dst, dst, eindex);
2604       } else {
2605         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2606       }
2607     } else {
2608       if (UseAVX == 0) {
2609         movdqu(dst, src);
2610         psrldq(dst, eindex*esize);
2611       } else {
2612         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2613       }
2614       movq(dst, dst);
2615     }
2616   }
2617   // Zero upper bits
2618   if (typ == T_FLOAT) {
2619     if (UseAVX == 0) {
2620       assert(vtmp != xnoreg, "required.");
2621       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2622       pand(dst, vtmp);
2623     } else {
2624       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2625     }
2626   }
2627 }
2628 
2629 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2630   switch(typ) {
2631     case T_BYTE:
2632     case T_BOOLEAN:
2633       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2634       break;
2635     case T_SHORT:
2636     case T_CHAR:
2637       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2638       break;
2639     case T_INT:
2640     case T_FLOAT:
2641       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2642       break;
2643     case T_LONG:
2644     case T_DOUBLE:
2645       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2646       break;
2647     default:
2648       assert(false,"Should not reach here.");
2649       break;
2650   }
2651 }
2652 
2653 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2654   assert(rscratch != noreg || always_reachable(src2), "missing");
2655 
2656   switch(typ) {
2657     case T_BOOLEAN:
2658     case T_BYTE:
2659       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2660       break;
2661     case T_CHAR:
2662     case T_SHORT:
2663       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2664       break;
2665     case T_INT:
2666     case T_FLOAT:
2667       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2668       break;
2669     case T_LONG:
2670     case T_DOUBLE:
2671       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2672       break;
2673     default:
2674       assert(false,"Should not reach here.");
2675       break;
2676   }
2677 }
2678 
2679 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2680   switch(typ) {
2681     case T_BYTE:
2682       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2683       break;
2684     case T_SHORT:
2685       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2686       break;
2687     case T_INT:
2688     case T_FLOAT:
2689       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2690       break;
2691     case T_LONG:
2692     case T_DOUBLE:
2693       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2694       break;
2695     default:
2696       assert(false,"Should not reach here.");
2697       break;
2698   }
2699 }
2700 
2701 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2702   assert(vlen_in_bytes <= 32, "");
2703   int esize = type2aelembytes(bt);
2704   if (vlen_in_bytes == 32) {
2705     assert(vtmp == xnoreg, "required.");
2706     if (esize >= 4) {
2707       vtestps(src1, src2, AVX_256bit);
2708     } else {
2709       vptest(src1, src2, AVX_256bit);
2710     }
2711     return;
2712   }
2713   if (vlen_in_bytes < 16) {
2714     // Duplicate the lower part to fill the whole register;
2715     // there is no need to do so for src2.
2716     assert(vtmp != xnoreg, "required");
2717     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
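         // 0x00 broadcasts dword 0 (4-byte vector); 0x04 keeps dwords {0, 1} and
         // fills the upper dwords with dword 0 (8-byte vector).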
2718     pshufd(vtmp, src1, shuffle_imm);
2719   } else {
2720     assert(vtmp == xnoreg, "required");
2721     vtmp = src1;
2722   }
2723   if (esize >= 4 && VM_Version::supports_avx()) {
2724     vtestps(vtmp, src2, AVX_128bit);
2725   } else {
2726     ptest(vtmp, src2);
2727   }
2728 }
2729 
2730 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2731 #ifdef ASSERT
2732   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2733   bool is_bw_supported = VM_Version::supports_avx512bw();
2734   if (is_bw && !is_bw_supported) {
2735     assert(vlen_enc != Assembler::AVX_512bit, "required");
2736     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2737            "XMM register should be 0-15");
2738   }
2739 #endif // ASSERT
2740   switch (elem_bt) {
2741     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2742     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2743     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2744     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2745     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2746     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2747     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2748   }
2749 }
2750 
2751 #ifdef _LP64
2752 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2753   assert(UseAVX >= 2, "required");
2754   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2755   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2756   if ((UseAVX > 2) &&
2757       (!is_bw || VM_Version::supports_avx512bw()) &&
2758       (!is_vl || VM_Version::supports_avx512vl())) {
2759     switch (elem_bt) {
2760       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2761       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2762       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2763       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2764       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2765     }
2766   } else {
2767     assert(vlen_enc != Assembler::AVX_512bit, "required");
2768     assert((dst->encoding() < 16),"XMM register should be 0-15");
2769     switch (elem_bt) {
2770       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2771       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2772       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2773       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2774       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2775       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2776       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2777     }
2778   }
2779 }
2780 #endif
2781 
2782 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2783   switch (to_elem_bt) {
2784     case T_SHORT:
2785       vpmovsxbw(dst, src, vlen_enc);
2786       break;
2787     case T_INT:
2788       vpmovsxbd(dst, src, vlen_enc);
2789       break;
2790     case T_FLOAT:
2791       vpmovsxbd(dst, src, vlen_enc);
2792       vcvtdq2ps(dst, dst, vlen_enc);
2793       break;
2794     case T_LONG:
2795       vpmovsxbq(dst, src, vlen_enc);
2796       break;
2797     case T_DOUBLE: {
2798       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2799       vpmovsxbd(dst, src, mid_vlen_enc);
2800       vcvtdq2pd(dst, dst, vlen_enc);
2801       break;
2802     }
2803     default:
2804       fatal("Unsupported type %s", type2name(to_elem_bt));
2805       break;
2806   }
2807 }
2808 
2809 //-------------------------------------------------------------------------------------------
2810 
2811 // IndexOf for constant substrings with size >= 8 chars
2812 // which don't need to be loaded through the stack.
2813 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2814                                          Register cnt1, Register cnt2,
2815                                          int int_cnt2,  Register result,
2816                                          XMMRegister vec, Register tmp,
2817                                          int ae) {
2818   ShortBranchVerifier sbv(this);
2819   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2820   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2821 
2822   // This method uses the pcmpestri instruction with bound registers
2823   //   inputs:
2824   //     xmm - substring
2825   //     rax - substring length (elements count)
2826   //     mem - scanned string
2827   //     rdx - string length (elements count)
2828   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2829   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2830   //   outputs:
2831   //     rcx - matched index in string
2832   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2833   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2834   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2835   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2836   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2837 
2838   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2839         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2840         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2841 
2842   // Note, inline_string_indexOf() generates checks:
2843   // if (substr.count > string.count) return -1;
2844   // if (substr.count == 0) return 0;
2845   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
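
       // Rough sketch of the search below (illustrative only; the labels and flag
       // tests are what actually drive the control flow):
       //   p = str1;
       //   while (enough string elements are left) {
       //     scan 16-byte windows at p with pcmpestri for a candidate head (CF == 1);
       //     if the whole vector matched (OF == 1) and, for int_cnt2 > stride, the
       //       remaining substring chunks also match: result = p - str1 (in elements);
       //     otherwise resume scanning one element past the candidate;
       //   }
       //   result = -1;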
2846 
2847   // Load substring.
2848   if (ae == StrIntrinsicNode::UL) {
2849     pmovzxbw(vec, Address(str2, 0));
2850   } else {
2851     movdqu(vec, Address(str2, 0));
2852   }
2853   movl(cnt2, int_cnt2);
2854   movptr(result, str1); // string addr
2855 
2856   if (int_cnt2 > stride) {
2857     jmpb(SCAN_TO_SUBSTR);
2858 
2859     // Reload substr for rescan; this code
2860     // is executed only for large substrings (> 8 chars)
2861     bind(RELOAD_SUBSTR);
2862     if (ae == StrIntrinsicNode::UL) {
2863       pmovzxbw(vec, Address(str2, 0));
2864     } else {
2865       movdqu(vec, Address(str2, 0));
2866     }
2867     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2868 
2869     bind(RELOAD_STR);
2870     // We came here after the beginning of the substring was
2871     // matched but the rest of it was not, so we need to search
2872     // again. Start from the next element after the previous match.
2873 
2874     // cnt2 is the number of remaining substring elements and
2875     // cnt1 is the number of remaining string elements when the compare failed.
2876     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2877     subl(cnt1, cnt2);
2878     addl(cnt1, int_cnt2);
2879     movl(cnt2, int_cnt2); // Now restore cnt2
2880 
2881     decrementl(cnt1);     // Shift to next element
2882     cmpl(cnt1, cnt2);
2883     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2884 
2885     addptr(result, (1<<scale1));
2886 
2887   } // (int_cnt2 > 8)
2888 
2889   // Scan string for start of substr in 16-byte vectors
2890   bind(SCAN_TO_SUBSTR);
2891   pcmpestri(vec, Address(result, 0), mode);
2892   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2893   subl(cnt1, stride);
2894   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2895   cmpl(cnt1, cnt2);
2896   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2897   addptr(result, 16);
2898   jmpb(SCAN_TO_SUBSTR);
2899 
2900   // Found a potential substr
2901   bind(FOUND_CANDIDATE);
2902   // Matched whole vector if first element matched (tmp(rcx) == 0).
2903   if (int_cnt2 == stride) {
2904     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2905   } else { // int_cnt2 > 8
2906     jccb(Assembler::overflow, FOUND_SUBSTR);
2907   }
2908   // After pcmpestri tmp(rcx) contains matched element index
2909   // Compute start addr of substr
2910   lea(result, Address(result, tmp, scale1));
2911 
2912   // Make sure string is still long enough
2913   subl(cnt1, tmp);
2914   cmpl(cnt1, cnt2);
2915   if (int_cnt2 == stride) {
2916     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2917   } else { // int_cnt2 > 8
2918     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2919   }
2920   // Left less than substring.
2921 
2922   bind(RET_NOT_FOUND);
2923   movl(result, -1);
2924   jmp(EXIT);
2925 
2926   if (int_cnt2 > stride) {
2927     // This code is optimized for the case when whole substring
2928     // is matched if its head is matched.
2929     bind(MATCH_SUBSTR_HEAD);
2930     pcmpestri(vec, Address(result, 0), mode);
2931     // Reload only the string if it does not match
2932     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2933 
2934     Label CONT_SCAN_SUBSTR;
2935     // Compare the rest of substring (> 8 chars).
2936     bind(FOUND_SUBSTR);
2937     // First 8 chars are already matched.
2938     negptr(cnt2);
2939     addptr(cnt2, stride);
2940 
2941     bind(SCAN_SUBSTR);
2942     subl(cnt1, stride);
2943     cmpl(cnt2, -stride); // Do not read beyond substring
2944     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2945     // Back-up strings to avoid reading beyond substring:
2946     // cnt1 = cnt1 - cnt2 + 8
2947     addl(cnt1, cnt2); // cnt2 is negative
2948     addl(cnt1, stride);
2949     movl(cnt2, stride); negptr(cnt2);
2950     bind(CONT_SCAN_SUBSTR);
2951     if (int_cnt2 < (int)G) {
2952       int tail_off1 = int_cnt2<<scale1;
2953       int tail_off2 = int_cnt2<<scale2;
2954       if (ae == StrIntrinsicNode::UL) {
2955         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2956       } else {
2957         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2958       }
2959       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2960     } else {
2961       // calculate index in register to avoid integer overflow (int_cnt2*2)
2962       movl(tmp, int_cnt2);
2963       addptr(tmp, cnt2);
2964       if (ae == StrIntrinsicNode::UL) {
2965         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2966       } else {
2967         movdqu(vec, Address(str2, tmp, scale2, 0));
2968       }
2969       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2970     }
2971     // Need to reload string pointers if we did not match the whole vector
2972     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2973     addptr(cnt2, stride);
2974     jcc(Assembler::negative, SCAN_SUBSTR);
2975     // Fall through if found full substring
2976 
2977   } // (int_cnt2 > 8)
2978 
2979   bind(RET_FOUND);
2980   // Found result if we matched full small substring.
2981   // Compute substr offset
2982   subptr(result, str1);
2983   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2984     shrl(result, 1); // index
2985   }
2986   bind(EXIT);
2987 
2988 } // string_indexofC8
2989 
2990 // Small strings are loaded through the stack if they cross a page boundary.
2991 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2992                                        Register cnt1, Register cnt2,
2993                                        int int_cnt2,  Register result,
2994                                        XMMRegister vec, Register tmp,
2995                                        int ae) {
2996   ShortBranchVerifier sbv(this);
2997   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2998   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2999 
3000   //
3001   // int_cnt2 is length of small (< 8 chars) constant substring
3002   // or (-1) for non constant substring in which case its length
3003   // is in cnt2 register.
3004   //
3005   // Note, inline_string_indexOf() generates checks:
3006   // if (substr.count > string.count) return -1;
3007   // if (substr.count == 0) return 0;
3008   //
3009   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3010   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3011   // This method uses the pcmpestri instruction with bound registers
3012   //   inputs:
3013   //     xmm - substring
3014   //     rax - substring length (elements count)
3015   //     mem - scanned string
3016   //     rdx - string length (elements count)
3017   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3018   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3019   //   outputs:
3020   //     rcx - matched index in string
3021   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3022   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3023   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3024   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3025 
3026   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3027         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3028         FOUND_CANDIDATE;
3029 
3030   { //========================================================
3031     // We don't know where these strings are located
3032     // and we can't read beyond them. Load them through stack.
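         // Note: pcmpestri and movdqu always read a full 16 bytes, so a short string
         // ending close to an unmapped page could fault; such strings are first
         // copied into a 16-byte scratch area on the stack, where a full 16-byte
         // load is always safe.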
3033     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3034 
3035     movptr(tmp, rsp); // save old SP
3036 
3037     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3038       if (int_cnt2 == (1>>scale2)) { // One byte
3039         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3040         load_unsigned_byte(result, Address(str2, 0));
3041         movdl(vec, result); // move 32 bits
3042       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3043         // Not enough header space in 32-bit VM: 12+3 = 15.
3044         movl(result, Address(str2, -1));
3045         shrl(result, 8);
3046         movdl(vec, result); // move 32 bits
3047       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3048         load_unsigned_short(result, Address(str2, 0));
3049         movdl(vec, result); // move 32 bits
3050       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3051         movdl(vec, Address(str2, 0)); // move 32 bits
3052       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3053         movq(vec, Address(str2, 0));  // move 64 bits
3054       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3055         // Array header size is 12 bytes in 32-bit VM
3056         // + 6 bytes for 3 chars == 18 bytes,
3057         // enough space to load vec and shift.
3058         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3059         if (ae == StrIntrinsicNode::UL) {
3060           int tail_off = int_cnt2-8;
3061           pmovzxbw(vec, Address(str2, tail_off));
3062           psrldq(vec, -2*tail_off);
3063         }
3064         else {
3065           int tail_off = int_cnt2*(1<<scale2);
3066           movdqu(vec, Address(str2, tail_off-16));
3067           psrldq(vec, 16-tail_off);
3068         }
3069       }
3070     } else { // not constant substring
3071       cmpl(cnt2, stride);
3072       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3073 
3074       // We can read beyond the string if str2+16 does not cross a page boundary,
3075       // since heaps are aligned and mapped by pages.
3076       assert(os::vm_page_size() < (int)G, "default page should be small");
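           // i.e. reading 16 bytes at str2 is safe as long as
           // (str2 & (page_size-1)) <= page_size-16, which is what the check below verifies.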
3077       movl(result, str2); // We need only low 32 bits
3078       andl(result, ((int)os::vm_page_size()-1));
3079       cmpl(result, ((int)os::vm_page_size()-16));
3080       jccb(Assembler::belowEqual, CHECK_STR);
3081 
3082       // Move small strings to the stack to allow loading 16 bytes into vec.
3083       subptr(rsp, 16);
3084       int stk_offset = wordSize-(1<<scale2);
3085       push(cnt2);
3086 
3087       bind(COPY_SUBSTR);
3088       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3089         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3090         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3091       } else if (ae == StrIntrinsicNode::UU) {
3092         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3093         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3094       }
3095       decrement(cnt2);
3096       jccb(Assembler::notZero, COPY_SUBSTR);
3097 
3098       pop(cnt2);
3099       movptr(str2, rsp);  // New substring address
3100     } // non constant
3101 
3102     bind(CHECK_STR);
3103     cmpl(cnt1, stride);
3104     jccb(Assembler::aboveEqual, BIG_STRINGS);
3105 
3106     // Check for crossing a page boundary.
3107     movl(result, str1); // We need only low 32 bits
3108     andl(result, ((int)os::vm_page_size()-1));
3109     cmpl(result, ((int)os::vm_page_size()-16));
3110     jccb(Assembler::belowEqual, BIG_STRINGS);
3111 
3112     subptr(rsp, 16);
3113     int stk_offset = -(1<<scale1);
3114     if (int_cnt2 < 0) { // not constant
3115       push(cnt2);
3116       stk_offset += wordSize;
3117     }
3118     movl(cnt2, cnt1);
3119 
3120     bind(COPY_STR);
3121     if (ae == StrIntrinsicNode::LL) {
3122       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3123       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3124     } else {
3125       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3126       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3127     }
3128     decrement(cnt2);
3129     jccb(Assembler::notZero, COPY_STR);
3130 
3131     if (int_cnt2 < 0) { // not constant
3132       pop(cnt2);
3133     }
3134     movptr(str1, rsp);  // New string address
3135 
3136     bind(BIG_STRINGS);
3137     // Load substring.
3138     if (int_cnt2 < 0) { // -1
3139       if (ae == StrIntrinsicNode::UL) {
3140         pmovzxbw(vec, Address(str2, 0));
3141       } else {
3142         movdqu(vec, Address(str2, 0));
3143       }
3144       push(cnt2);       // substr count
3145       push(str2);       // substr addr
3146       push(str1);       // string addr
3147     } else {
3148       // Small (< 8 chars) constant substrings are loaded already.
3149       movl(cnt2, int_cnt2);
3150     }
3151     push(tmp);  // original SP
3152 
3153   } // Finished loading
3154 
3155   //========================================================
3156   // Start search
3157   //
3158 
3159   movptr(result, str1); // string addr
3160 
3161   if (int_cnt2  < 0) {  // Only for non constant substring
3162     jmpb(SCAN_TO_SUBSTR);
3163 
3164     // SP saved at sp+0
3165     // String saved at sp+1*wordSize
3166     // Substr saved at sp+2*wordSize
3167     // Substr count saved at sp+3*wordSize
3168 
3169     // Reload substr for rescan; this code
3170     // is executed only for large substrings (> 8 chars)
3171     bind(RELOAD_SUBSTR);
3172     movptr(str2, Address(rsp, 2*wordSize));
3173     movl(cnt2, Address(rsp, 3*wordSize));
3174     if (ae == StrIntrinsicNode::UL) {
3175       pmovzxbw(vec, Address(str2, 0));
3176     } else {
3177       movdqu(vec, Address(str2, 0));
3178     }
3179     // We came here after the beginning of the substring was
3180     // matched but the rest of it was not, so we need to search
3181     // again. Start from the next element after the previous match.
3182     subptr(str1, result); // Restore counter
3183     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3184       shrl(str1, 1);
3185     }
3186     addl(cnt1, str1);
3187     decrementl(cnt1);   // Shift to next element
3188     cmpl(cnt1, cnt2);
3189     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3190 
3191     addptr(result, (1<<scale1));
3192   } // non constant
3193 
3194   // Scan string for start of substr in 16-byte vectors
3195   bind(SCAN_TO_SUBSTR);
3196   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3197   pcmpestri(vec, Address(result, 0), mode);
3198   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3199   subl(cnt1, stride);
3200   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3201   cmpl(cnt1, cnt2);
3202   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3203   addptr(result, 16);
3204 
3205   bind(ADJUST_STR);
3206   cmpl(cnt1, stride); // Do not read beyond string
3207   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3208   // Back-up string to avoid reading beyond string.
3209   lea(result, Address(result, cnt1, scale1, -16));
3210   movl(cnt1, stride);
3211   jmpb(SCAN_TO_SUBSTR);
3212 
3213   // Found a potential substr
3214   bind(FOUND_CANDIDATE);
3215   // After pcmpestri tmp(rcx) contains matched element index
3216 
3217   // Make sure string is still long enough
3218   subl(cnt1, tmp);
3219   cmpl(cnt1, cnt2);
3220   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3221   // Left less than substring.
3222 
3223   bind(RET_NOT_FOUND);
3224   movl(result, -1);
3225   jmp(CLEANUP);
3226 
3227   bind(FOUND_SUBSTR);
3228   // Compute start addr of substr
3229   lea(result, Address(result, tmp, scale1));
3230   if (int_cnt2 > 0) { // Constant substring
3231     // Repeat search for small substring (< 8 chars)
3232     // from new point without reloading substring.
3233     // Have to check that we don't read beyond string.
3234     cmpl(tmp, stride-int_cnt2);
3235     jccb(Assembler::greater, ADJUST_STR);
3236     // Fall through if matched whole substring.
3237   } else { // non constant
3238     assert(int_cnt2 == -1, "should be != 0");
3239 
3240     addl(tmp, cnt2);
3241     // Found result if we matched whole substring.
3242     cmpl(tmp, stride);
3243     jcc(Assembler::lessEqual, RET_FOUND);
3244 
3245     // Repeat search for small substring (<= 8 chars)
3246     // from new point 'str1' without reloading substring.
3247     cmpl(cnt2, stride);
3248     // Have to check that we don't read beyond string.
3249     jccb(Assembler::lessEqual, ADJUST_STR);
3250 
3251     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3252     // Compare the rest of substring (> 8 chars).
3253     movptr(str1, result);
3254 
3255     cmpl(tmp, cnt2);
3256     // First 8 chars are already matched.
3257     jccb(Assembler::equal, CHECK_NEXT);
3258 
3259     bind(SCAN_SUBSTR);
3260     pcmpestri(vec, Address(str1, 0), mode);
3261     // Need to reload string pointers if we did not match the whole vector
3262     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3263 
3264     bind(CHECK_NEXT);
3265     subl(cnt2, stride);
3266     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3267     addptr(str1, 16);
3268     if (ae == StrIntrinsicNode::UL) {
3269       addptr(str2, 8);
3270     } else {
3271       addptr(str2, 16);
3272     }
3273     subl(cnt1, stride);
3274     cmpl(cnt2, stride); // Do not read beyond substring
3275     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3276     // Back-up strings to avoid reading beyond substring.
3277 
3278     if (ae == StrIntrinsicNode::UL) {
3279       lea(str2, Address(str2, cnt2, scale2, -8));
3280       lea(str1, Address(str1, cnt2, scale1, -16));
3281     } else {
3282       lea(str2, Address(str2, cnt2, scale2, -16));
3283       lea(str1, Address(str1, cnt2, scale1, -16));
3284     }
3285     subl(cnt1, cnt2);
3286     movl(cnt2, stride);
3287     addl(cnt1, stride);
3288     bind(CONT_SCAN_SUBSTR);
3289     if (ae == StrIntrinsicNode::UL) {
3290       pmovzxbw(vec, Address(str2, 0));
3291     } else {
3292       movdqu(vec, Address(str2, 0));
3293     }
3294     jmp(SCAN_SUBSTR);
3295 
3296     bind(RET_FOUND_LONG);
3297     movptr(str1, Address(rsp, wordSize));
3298   } // non constant
3299 
3300   bind(RET_FOUND);
3301   // Compute substr offset
3302   subptr(result, str1);
3303   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3304     shrl(result, 1); // index
3305   }
3306   bind(CLEANUP);
3307   pop(rsp); // restore SP
3308 
3309 } // string_indexof
3310 
3311 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3312                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3313   ShortBranchVerifier sbv(this);
3314   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3315 
3316   int stride = 8;
3317 
3318   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3319         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3320         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3321         FOUND_SEQ_CHAR, DONE_LABEL;
3322 
3323   movptr(result, str1);
3324   if (UseAVX >= 2) {
3325     cmpl(cnt1, stride);
3326     jcc(Assembler::less, SCAN_TO_CHAR);
3327     cmpl(cnt1, 2*stride);
3328     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3329     movdl(vec1, ch);
3330     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3331     vpxor(vec2, vec2);
3332     movl(tmp, cnt1);
3333     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3334     andl(cnt1,0x0000000F);  //tail count (in chars)
3335 
3336     bind(SCAN_TO_16_CHAR_LOOP);
3337     vmovdqu(vec3, Address(result, 0));
3338     vpcmpeqw(vec3, vec3, vec1, 1);
3339     vptest(vec2, vec3);
3340     jcc(Assembler::carryClear, FOUND_CHAR);
3341     addptr(result, 32);
3342     subl(tmp, 2*stride);
3343     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3344     jmp(SCAN_TO_8_CHAR);
3345     bind(SCAN_TO_8_CHAR_INIT);
3346     movdl(vec1, ch);
3347     pshuflw(vec1, vec1, 0x00);
3348     pshufd(vec1, vec1, 0);
3349     pxor(vec2, vec2);
3350   }
3351   bind(SCAN_TO_8_CHAR);
3352   cmpl(cnt1, stride);
3353   jcc(Assembler::less, SCAN_TO_CHAR);
3354   if (UseAVX < 2) {
3355     movdl(vec1, ch);
3356     pshuflw(vec1, vec1, 0x00);
3357     pshufd(vec1, vec1, 0);
3358     pxor(vec2, vec2);
3359   }
3360   movl(tmp, cnt1);
3361   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3362   andl(cnt1,0x00000007);  //tail count (in chars)
3363 
3364   bind(SCAN_TO_8_CHAR_LOOP);
3365   movdqu(vec3, Address(result, 0));
3366   pcmpeqw(vec3, vec1);
3367   ptest(vec2, vec3);
3368   jcc(Assembler::carryClear, FOUND_CHAR);
3369   addptr(result, 16);
3370   subl(tmp, stride);
3371   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3372   bind(SCAN_TO_CHAR);
3373   testl(cnt1, cnt1);
3374   jcc(Assembler::zero, RET_NOT_FOUND);
3375   bind(SCAN_TO_CHAR_LOOP);
3376   load_unsigned_short(tmp, Address(result, 0));
3377   cmpl(ch, tmp);
3378   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3379   addptr(result, 2);
3380   subl(cnt1, 1);
3381   jccb(Assembler::zero, RET_NOT_FOUND);
3382   jmp(SCAN_TO_CHAR_LOOP);
3383 
3384   bind(RET_NOT_FOUND);
3385   movl(result, -1);
3386   jmpb(DONE_LABEL);
3387 
3388   bind(FOUND_CHAR);
3389   if (UseAVX >= 2) {
3390     vpmovmskb(tmp, vec3);
3391   } else {
3392     pmovmskb(tmp, vec3);
3393   }
3394   bsfl(ch, tmp);
3395   addptr(result, ch);
3396 
3397   bind(FOUND_SEQ_CHAR);
3398   subptr(result, str1);
3399   shrl(result, 1);
3400 
3401   bind(DONE_LABEL);
3402 } // string_indexof_char
3403 
3404 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3405                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3406   ShortBranchVerifier sbv(this);
3407   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3408 
3409   int stride = 16;
3410 
3411   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3412         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3413         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3414         FOUND_SEQ_CHAR, DONE_LABEL;
3415 
3416   movptr(result, str1);
3417   if (UseAVX >= 2) {
3418     cmpl(cnt1, stride);
3419     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3420     cmpl(cnt1, stride*2);
3421     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3422     movdl(vec1, ch);
3423     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3424     vpxor(vec2, vec2);
3425     movl(tmp, cnt1);
3426     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3427     andl(cnt1,0x0000001F);  //tail count (in chars)
3428 
3429     bind(SCAN_TO_32_CHAR_LOOP);
3430     vmovdqu(vec3, Address(result, 0));
3431     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3432     vptest(vec2, vec3);
3433     jcc(Assembler::carryClear, FOUND_CHAR);
3434     addptr(result, 32);
3435     subl(tmp, stride*2);
3436     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3437     jmp(SCAN_TO_16_CHAR);
3438 
3439     bind(SCAN_TO_16_CHAR_INIT);
3440     movdl(vec1, ch);
3441     pxor(vec2, vec2);
3442     pshufb(vec1, vec2);
3443   }
3444 
3445   bind(SCAN_TO_16_CHAR);
3446   cmpl(cnt1, stride);
3447   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3448   if (UseAVX < 2) {
3449     movdl(vec1, ch);
3450     pxor(vec2, vec2);
3451     pshufb(vec1, vec2);
3452   }
3453   movl(tmp, cnt1);
3454   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3455   andl(cnt1,0x0000000F);  //tail count (in bytes)
3456 
3457   bind(SCAN_TO_16_CHAR_LOOP);
3458   movdqu(vec3, Address(result, 0));
3459   pcmpeqb(vec3, vec1);
3460   ptest(vec2, vec3);
3461   jcc(Assembler::carryClear, FOUND_CHAR);
3462   addptr(result, 16);
3463   subl(tmp, stride);
3464   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3465 
3466   bind(SCAN_TO_CHAR_INIT);
3467   testl(cnt1, cnt1);
3468   jcc(Assembler::zero, RET_NOT_FOUND);
3469   bind(SCAN_TO_CHAR_LOOP);
3470   load_unsigned_byte(tmp, Address(result, 0));
3471   cmpl(ch, tmp);
3472   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3473   addptr(result, 1);
3474   subl(cnt1, 1);
3475   jccb(Assembler::zero, RET_NOT_FOUND);
3476   jmp(SCAN_TO_CHAR_LOOP);
3477 
3478   bind(RET_NOT_FOUND);
3479   movl(result, -1);
3480   jmpb(DONE_LABEL);
3481 
3482   bind(FOUND_CHAR);
3483   if (UseAVX >= 2) {
3484     vpmovmskb(tmp, vec3);
3485   } else {
3486     pmovmskb(tmp, vec3);
3487   }
3488   bsfl(ch, tmp);
3489   addptr(result, ch);
3490 
3491   bind(FOUND_SEQ_CHAR);
3492   subptr(result, str1);
3493 
3494   bind(DONE_LABEL);
3495 } // stringL_indexof_char
3496 
3497 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3498   switch (eltype) {
3499   case T_BOOLEAN: return sizeof(jboolean);
3500   case T_BYTE:  return sizeof(jbyte);
3501   case T_SHORT: return sizeof(jshort);
3502   case T_CHAR:  return sizeof(jchar);
3503   case T_INT:   return sizeof(jint);
3504   default:
3505     ShouldNotReachHere();
3506     return -1;
3507   }
3508 }
3509 
3510 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3511   switch (eltype) {
3512   // T_BOOLEAN used as surrogate for unsigned byte
3513   case T_BOOLEAN: movzbl(dst, src);   break;
3514   case T_BYTE:    movsbl(dst, src);   break;
3515   case T_SHORT:   movswl(dst, src);   break;
3516   case T_CHAR:    movzwl(dst, src);   break;
3517   case T_INT:     movl(dst, src);     break;
3518   default:
3519     ShouldNotReachHere();
3520   }
3521 }
3522 
3523 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3524   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3525 }
3526 
3527 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3528   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3529 }
3530 
3531 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3532   const int vlen = Assembler::AVX_256bit;
3533   switch (eltype) {
3534   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3535   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3536   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3537   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3538   case T_INT:
3539     // do nothing
3540     break;
3541   default:
3542     ShouldNotReachHere();
3543   }
3544 }
3545 
3546 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3547                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3548                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3549                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3550                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3551                                         BasicType eltype) {
3552   ShortBranchVerifier sbv(this);
3553   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3554   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3555   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3556 
3557   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3558         SHORT_UNROLLED_LOOP_EXIT,
3559         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3560         UNROLLED_VECTOR_LOOP_BEGIN,
3561         END;
3562   switch (eltype) {
3563   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3564   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3565   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3566   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3567   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3568   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3569   }
3570 
3571   // For "renaming" for readibility of the code
3572   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3573                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3574                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3575 
3576   const int elsize = arrays_hashcode_elsize(eltype);
3577 
3578   /*
3579     if (cnt1 >= 2) {
3580       if (cnt1 >= 32) {
3581         UNROLLED VECTOR LOOP
3582       }
3583       UNROLLED SCALAR LOOP
3584     }
3585     SINGLE SCALAR
3586    */
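       // The vector loop below relies on the usual polynomial-hash split:
       //   h(a[0..n)) == h(a[0..32)) * 31^(n-32) + h(a[32..n))
       // Each 32-element block multiplies the running scalar result by 31^32 and
       // accumulates the block elements into four 8-lane vectors; after the loop the
       // lanes are scaled by the remaining powers of 31 and reduced into result.
       // (This assumes arrays_hashcode_powers_of_31() holds 31^32, 31^31, ..., 31^0,
       // which matches the indexing below.)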
3587 
3588   cmpl(cnt1, 32);
3589   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3590 
3591   // cnt1 >= 32 && generate_vectorized_loop
3592   xorl(index, index);
3593 
3594   // vresult = IntVector.zero(I256);
3595   for (int idx = 0; idx < 4; idx++) {
3596     vpxor(vresult[idx], vresult[idx]);
3597   }
3598   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3599   Register bound = tmp2;
3600   Register next = tmp3;
3601   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3602   movl(next, Address(tmp2, 0));
3603   movdl(vnext, next);
3604   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3605 
3606   // index = 0;
3607   // bound = cnt1 & ~(32 - 1);
3608   movl(bound, cnt1);
3609   andl(bound, ~(32 - 1));
3610   // for (; index < bound; index += 32) {
3611   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3612   // result *= next;
3613   imull(result, next);
3614   // Loop fission to front-load the cost of fetching from memory; OOO execution
3615   // can then hopefully do a better job of prefetching
3616   for (int idx = 0; idx < 4; idx++) {
3617     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3618   }
3619   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3620   for (int idx = 0; idx < 4; idx++) {
3621     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3622     arrays_hashcode_elvcast(vtmp[idx], eltype);
3623     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3624   }
3625   // index += 32;
3626   addl(index, 32);
3627   // index < bound;
3628   cmpl(index, bound);
3629   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3630   // }
3631 
3632   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3633   subl(cnt1, bound);
3634   // release bound
3635 
3636   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3637   for (int idx = 0; idx < 4; idx++) {
3638     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3639     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3640     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3641   }
3642   // result += vresult.reduceLanes(ADD);
3643   for (int idx = 0; idx < 4; idx++) {
3644     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3645   }
3646 
3647   // } else if (cnt1 < 32) {
3648 
3649   bind(SHORT_UNROLLED_BEGIN);
3650   // int i = 1;
3651   movl(index, 1);
3652   cmpl(index, cnt1);
3653   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3654 
3655   // for (; i < cnt1 ; i += 2) {
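       // Each iteration performs two steps of h = 31*h + a[i]:
       //   result = result*31*31 + a[i-1]*31 + a[i]
       // using 961 == 31*31 and (x << 5) - x == 31*x.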
3656   bind(SHORT_UNROLLED_LOOP_BEGIN);
3657   movl(tmp3, 961);
3658   imull(result, tmp3);
3659   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3660   movl(tmp3, tmp2);
3661   shll(tmp3, 5);
3662   subl(tmp3, tmp2);
3663   addl(result, tmp3);
3664   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3665   addl(result, tmp3);
3666   addl(index, 2);
3667   cmpl(index, cnt1);
3668   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3669 
3670   // }
3671   // if (i >= cnt1) {
3672   bind(SHORT_UNROLLED_LOOP_EXIT);
3673   jccb(Assembler::greater, END);
3674   movl(tmp2, result);
3675   shll(result, 5);
3676   subl(result, tmp2);
3677   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3678   addl(result, tmp3);
3679   // }
3680   bind(END);
3681 
3682   BLOCK_COMMENT("} // arrays_hashcode");
3683 
3684 } // arrays_hashcode
3685 
3686 // helper function for string_compare
3687 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3688                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3689                                            Address::ScaleFactor scale2, Register index, int ae) {
3690   if (ae == StrIntrinsicNode::LL) {
3691     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3692     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3693   } else if (ae == StrIntrinsicNode::UU) {
3694     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3695     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3696   } else {
3697     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3698     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3699   }
3700 }
3701 
3702 // Compare strings, used for char[] and byte[].
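     // The result has String.compareTo-style semantics: the difference of the first
     // mismatching pair of elements, or the length difference (cnt1 - cnt2) when one
     // string is a prefix of the other; for the UL encoding the sign is flipped at
     // the end (see DONE_LABEL).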
3703 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3704                                        Register cnt1, Register cnt2, Register result,
3705                                        XMMRegister vec1, int ae, KRegister mask) {
3706   ShortBranchVerifier sbv(this);
3707   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3708   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3709   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3710   int stride2x2 = 0x40;
3711   Address::ScaleFactor scale = Address::no_scale;
3712   Address::ScaleFactor scale1 = Address::no_scale;
3713   Address::ScaleFactor scale2 = Address::no_scale;
3714 
3715   if (ae != StrIntrinsicNode::LL) {
3716     stride2x2 = 0x20;
3717   }
3718 
3719   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3720     shrl(cnt2, 1);
3721   }
3722   // Compute the minimum of the string lengths and the
3723   // difference of the string lengths (stack).
3724   // Do the conditional move stuff
3725   movl(result, cnt1);
3726   subl(cnt1, cnt2);
3727   push(cnt1);
3728   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
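       // At this point result holds cnt1, the pushed word holds cnt1 - cnt2 (the
       // length difference), and cnt2 holds min(cnt1, cnt2).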
3729 
3730   // Is the minimum length zero?
3731   testl(cnt2, cnt2);
3732   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3733   if (ae == StrIntrinsicNode::LL) {
3734     // Load first bytes
3735     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3736     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3737   } else if (ae == StrIntrinsicNode::UU) {
3738     // Load first characters
3739     load_unsigned_short(result, Address(str1, 0));
3740     load_unsigned_short(cnt1, Address(str2, 0));
3741   } else {
3742     load_unsigned_byte(result, Address(str1, 0));
3743     load_unsigned_short(cnt1, Address(str2, 0));
3744   }
3745   subl(result, cnt1);
3746   jcc(Assembler::notZero,  POP_LABEL);
3747 
3748   if (ae == StrIntrinsicNode::UU) {
3749     // Divide length by 2 to get number of chars
3750     shrl(cnt2, 1);
3751   }
3752   cmpl(cnt2, 1);
3753   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3754 
3755   // Check if the strings start at the same location and set up scale and stride
3756   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3757     cmpptr(str1, str2);
3758     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3759     if (ae == StrIntrinsicNode::LL) {
3760       scale = Address::times_1;
3761       stride = 16;
3762     } else {
3763       scale = Address::times_2;
3764       stride = 8;
3765     }
3766   } else {
3767     scale1 = Address::times_1;
3768     scale2 = Address::times_2;
3769     // scale not used
3770     stride = 8;
3771   }
3772 
3773   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3774     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3775     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3776     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3777     Label COMPARE_TAIL_LONG;
3778     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3779 
3780     int pcmpmask = 0x19;
3781     if (ae == StrIntrinsicNode::LL) {
3782       pcmpmask &= ~0x01;
3783     }
3784 
3785     // Set up to compare 16-char (32-byte) vectors;
3786     // start from the first character again because it has an aligned address.
3787     if (ae == StrIntrinsicNode::LL) {
3788       stride2 = 32;
3789     } else {
3790       stride2 = 16;
3791     }
3792     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3793       adr_stride = stride << scale;
3794     } else {
3795       adr_stride1 = 8;  //stride << scale1;
3796       adr_stride2 = 16; //stride << scale2;
3797     }
3798 
3799     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3800     // rax and rdx are used by pcmpestri as element counters
3801     movl(result, cnt2);
3802     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3803     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3804 
3805     // fast path : compare first 2 8-char vectors.
3806     bind(COMPARE_16_CHARS);
3807     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3808       movdqu(vec1, Address(str1, 0));
3809     } else {
3810       pmovzxbw(vec1, Address(str1, 0));
3811     }
3812     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3813     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3814 
3815     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816       movdqu(vec1, Address(str1, adr_stride));
3817       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3818     } else {
3819       pmovzxbw(vec1, Address(str1, adr_stride1));
3820       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3821     }
3822     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3823     addl(cnt1, stride);
3824 
3825     // Compare the characters at index in cnt1
3826     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3827     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3828     subl(result, cnt2);
3829     jmp(POP_LABEL);
3830 
3831     // Setup the registers to start vector comparison loop
3832     bind(COMPARE_WIDE_VECTORS);
3833     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3834       lea(str1, Address(str1, result, scale));
3835       lea(str2, Address(str2, result, scale));
3836     } else {
3837       lea(str1, Address(str1, result, scale1));
3838       lea(str2, Address(str2, result, scale2));
3839     }
3840     subl(result, stride2);
3841     subl(cnt2, stride2);
3842     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3843     negptr(result);
3844 
3845     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3846     bind(COMPARE_WIDE_VECTORS_LOOP);
3847 
3848 #ifdef _LP64
3849     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3850       cmpl(cnt2, stride2x2);
3851       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3852       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3853       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3854 
3855       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3856       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3857         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3858         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3859       } else {
3860         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3861         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3862       }
3863       kortestql(mask, mask);
3864       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3865       addptr(result, stride2x2);  // update since we already compared at this addr
3866       subl(cnt2, stride2x2);      // and sub the size too
3867       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3868 
3869       vpxor(vec1, vec1);
3870       jmpb(COMPARE_WIDE_TAIL);
3871     }//if (VM_Version::supports_avx512vlbw())
3872 #endif // _LP64
3873 
3874 
3875     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3876     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3877       vmovdqu(vec1, Address(str1, result, scale));
3878       vpxor(vec1, Address(str2, result, scale));
3879     } else {
3880       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3881       vpxor(vec1, Address(str2, result, scale2));
3882     }
3883     vptest(vec1, vec1);
3884     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3885     addptr(result, stride2);
3886     subl(cnt2, stride2);
3887     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3888     // clean upper bits of YMM registers
3889     vpxor(vec1, vec1);
3890 
3891     // compare wide vectors tail
3892     bind(COMPARE_WIDE_TAIL);
3893     testptr(result, result);
3894     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3895 
3896     movl(result, stride2);
3897     movl(cnt2, result);
3898     negptr(result);
3899     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3900 
3901     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3902     bind(VECTOR_NOT_EQUAL);
3903     // clean upper bits of YMM registers
3904     vpxor(vec1, vec1);
3905     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3906       lea(str1, Address(str1, result, scale));
3907       lea(str2, Address(str2, result, scale));
3908     } else {
3909       lea(str1, Address(str1, result, scale1));
3910       lea(str2, Address(str2, result, scale2));
3911     }
3912     jmp(COMPARE_16_CHARS);
3913 
3914     // Compare tail chars, length between 1 and 15 chars
3915     bind(COMPARE_TAIL_LONG);
3916     movl(cnt2, result);
3917     cmpl(cnt2, stride);
3918     jcc(Assembler::less, COMPARE_SMALL_STR);
3919 
3920     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3921       movdqu(vec1, Address(str1, 0));
3922     } else {
3923       pmovzxbw(vec1, Address(str1, 0));
3924     }
3925     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3926     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3927     subptr(cnt2, stride);
3928     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3929     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3930       lea(str1, Address(str1, result, scale));
3931       lea(str2, Address(str2, result, scale));
3932     } else {
3933       lea(str1, Address(str1, result, scale1));
3934       lea(str2, Address(str2, result, scale2));
3935     }
3936     negptr(cnt2);
3937     jmpb(WHILE_HEAD_LABEL);
3938 
3939     bind(COMPARE_SMALL_STR);
3940   } else if (UseSSE42Intrinsics) {
3941     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3942     int pcmpmask = 0x19;
3943     // Set up to compare 8-char (16-byte) vectors;
3944     // start from the first character again because it has an aligned address.
3945     movl(result, cnt2);
3946     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3947     if (ae == StrIntrinsicNode::LL) {
3948       pcmpmask &= ~0x01;
3949     }
3950     jcc(Assembler::zero, COMPARE_TAIL);
3951     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3952       lea(str1, Address(str1, result, scale));
3953       lea(str2, Address(str2, result, scale));
3954     } else {
3955       lea(str1, Address(str1, result, scale1));
3956       lea(str2, Address(str2, result, scale2));
3957     }
3958     negptr(result);
3959 
3960     // pcmpestri
3961     //   inputs:
3962     //     vec1- substring
3963     //     rax - negative string length (elements count)
3964     //     mem - scanned string
3965     //     rdx - string length (elements count)
3966     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3967     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3968     //   outputs:
3969     //     rcx - first mismatched element index
3970     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3971 
3972     bind(COMPARE_WIDE_VECTORS);
3973     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3974       movdqu(vec1, Address(str1, result, scale));
3975       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3976     } else {
3977       pmovzxbw(vec1, Address(str1, result, scale1));
3978       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3979     }
3980     // After pcmpestri cnt1(rcx) contains mismatched element index
3981 
3982     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3983     addptr(result, stride);
3984     subptr(cnt2, stride);
3985     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3986 
3987     // compare wide vectors tail
3988     testptr(result, result);
3989     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3990 
3991     movl(cnt2, stride);
3992     movl(result, stride);
3993     negptr(result);
3994     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3995       movdqu(vec1, Address(str1, result, scale));
3996       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3997     } else {
3998       pmovzxbw(vec1, Address(str1, result, scale1));
3999       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4000     }
4001     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4002 
4003     // Mismatched characters in the vectors
4004     bind(VECTOR_NOT_EQUAL);
4005     addptr(cnt1, result);
4006     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4007     subl(result, cnt2);
4008     jmpb(POP_LABEL);
4009 
4010     bind(COMPARE_TAIL); // limit is zero
4011     movl(cnt2, result);
4012     // Fallthru to tail compare
4013   }
4014   // Shift str2 and str1 to the end of the arrays, negate min
4015   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4016     lea(str1, Address(str1, cnt2, scale));
4017     lea(str2, Address(str2, cnt2, scale));
4018   } else {
4019     lea(str1, Address(str1, cnt2, scale1));
4020     lea(str2, Address(str2, cnt2, scale2));
4021   }
4022   decrementl(cnt2);  // first character was compared already
4023   negptr(cnt2);
4024 
4025   // Compare the rest of the elements
4026   bind(WHILE_HEAD_LABEL);
4027   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4028   subl(result, cnt1);
4029   jccb(Assembler::notZero, POP_LABEL);
4030   increment(cnt2);
4031   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4032 
4033   // Strings are equal up to min length.  Return the length difference.
4034   bind(LENGTH_DIFF_LABEL);
4035   pop(result);
4036   if (ae == StrIntrinsicNode::UU) {
4037     // Divide diff by 2 to get number of chars
4038     sarl(result, 1);
4039   }
4040   jmpb(DONE_LABEL);
4041 
4042 #ifdef _LP64
4043   if (VM_Version::supports_avx512vlbw()) {
4044 
4045     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4046 
4047     kmovql(cnt1, mask);
4048     notq(cnt1);
4049     bsfq(cnt2, cnt1);
4050     if (ae != StrIntrinsicNode::LL) {
4051       // Divide diff by 2 to get number of chars
4052       sarl(cnt2, 1);
4053     }
4054     addq(result, cnt2);
4055     if (ae == StrIntrinsicNode::LL) {
4056       load_unsigned_byte(cnt1, Address(str2, result));
4057       load_unsigned_byte(result, Address(str1, result));
4058     } else if (ae == StrIntrinsicNode::UU) {
4059       load_unsigned_short(cnt1, Address(str2, result, scale));
4060       load_unsigned_short(result, Address(str1, result, scale));
4061     } else {
4062       load_unsigned_short(cnt1, Address(str2, result, scale2));
4063       load_unsigned_byte(result, Address(str1, result, scale1));
4064     }
4065     subl(result, cnt1);
4066     jmpb(POP_LABEL);
4067   }//if (VM_Version::supports_avx512vlbw())
4068 #endif // _LP64
4069 
4070   // Discard the stored length difference
4071   bind(POP_LABEL);
4072   pop(cnt1);
4073 
4074   // That's it
4075   bind(DONE_LABEL);
4076   if (ae == StrIntrinsicNode::UL) {
4077     negl(result);
4078   }
4079 
4080 }
4081 
4082 // Search for Non-ASCII character (Negative byte value) in a byte array,
4083 // return the index of the first such character, otherwise the length
4084 // of the array segment searched.
4085 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4086 //   @IntrinsicCandidate
4087 //   public static int countPositives(byte[] ba, int off, int len) {
4088 //     for (int i = off; i < off + len; i++) {
4089 //       if (ba[i] < 0) {
4090 //         return i - off;
4091 //       }
4092 //     }
4093 //     return len;
4094 //   }
4095 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4096   Register result, Register tmp1,
4097   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4098   // rsi: byte array
4099   // rcx: len
4100   // rax: result
4101   ShortBranchVerifier sbv(this);
4102   assert_different_registers(ary1, len, result, tmp1);
4103   assert_different_registers(vec1, vec2);
4104   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4105 
4106   movl(result, len); // copy
4107   // len == 0
4108   testl(len, len);
4109   jcc(Assembler::zero, DONE);
4110 
4111   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4112     VM_Version::supports_avx512vlbw() &&
4113     VM_Version::supports_bmi2()) {
4114 
4115     Label test_64_loop, test_tail, BREAK_LOOP;
4116     movl(tmp1, len);
4117     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4118 
4119     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4120     andl(len,  0xffffffc0); // vector count (in chars)
4121     jccb(Assembler::zero, test_tail);
4122 
4123     lea(ary1, Address(ary1, len, Address::times_1));
4124     negptr(len);
4125 
4126     bind(test_64_loop);
4127     // Check whether our 64 elements of size byte contain negatives
4128     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4129     kortestql(mask1, mask1);
4130     jcc(Assembler::notZero, BREAK_LOOP);
4131 
4132     addptr(len, 64);
4133     jccb(Assembler::notZero, test_64_loop);
4134 
4135     bind(test_tail);
4136     // bail out when there is nothing to be done
4137     testl(tmp1, -1);
4138     jcc(Assembler::zero, DONE);
4139 
4140 
4141     // check the tail for absence of negatives
4142     // ~(~0 << len) applied up to two times (for 32-bit scenario)
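         // e.g. tmp1 == 5: ~0 << 5 == ...11100000b, inverting gives 11111b, i.e. a
         // kmask selecting the five tail bytes.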
4143 #ifdef _LP64
4144     {
4145       Register tmp3_aliased = len;
4146       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4147       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4148       notq(tmp3_aliased);
4149       kmovql(mask2, tmp3_aliased);
4150     }
4151 #else
4152     Label k_init;
4153     jmp(k_init);
4154 
    // We cannot read 64 bits from a general purpose register on 32-bit, so we
    // move the data required to compose 64 ones into the instruction stream.
    // We emit a 64-byte series of elements 0..63 which is later used as the
    // compare target against the tail count held in the tmp1 register.
    // The result is a k register with tmp1 consecutive 1s, counting from the
    // least significant bit.
4161     address tmp = pc();
4162     emit_int64(0x0706050403020100);
4163     emit_int64(0x0F0E0D0C0B0A0908);
4164     emit_int64(0x1716151413121110);
4165     emit_int64(0x1F1E1D1C1B1A1918);
4166     emit_int64(0x2726252423222120);
4167     emit_int64(0x2F2E2D2C2B2A2928);
4168     emit_int64(0x3736353433323130);
4169     emit_int64(0x3F3E3D3C3B3A3938);
4170 
4171     bind(k_init);
4172     lea(len, InternalAddress(tmp));
4173     // create mask to test for negative byte inside a vector
4174     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4175     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4176 
4177 #endif
4178     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4179     ktestq(mask1, mask2);
4180     jcc(Assembler::zero, DONE);
4181 
4182     // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
                     // ary1 is already pointing to the right place
4185     jmpb(TAIL_START);
4186 
4187     bind(BREAK_LOOP);
4188     // At least one byte in the last 64 byte block was negative.
4189     // Set up to look at the last 64 bytes as if they were a tail
4190     lea(ary1, Address(ary1, len, Address::times_1));
4191     addptr(result, len);
4192     // Ignore the very last byte: if all others are positive,
4193     // it must be negative, so we can skip right to the 2+1 byte
4194     // end comparison at this point
4195     orl(result, 63);
4196     movl(len, 63);
4197     // Fallthru to tail compare
4198   } else {
4199 
4200     if (UseAVX >= 2 && UseSSE >= 2) {
4201       // With AVX2, use 32-byte vector compare
4202       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4203 
4204       // Compare 32-byte vectors
4205       testl(len, 0xffffffe0);   // vector count (in bytes)
4206       jccb(Assembler::zero, TAIL_START);
4207 
4208       andl(len, 0xffffffe0);
4209       lea(ary1, Address(ary1, len, Address::times_1));
4210       negptr(len);
4211 
      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in the vector
4213       movdl(vec2, tmp1);
4214       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
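      // vptest sets ZF iff (vec1 & vec2) == 0, so a not-zero result below
      // means some byte in the 32-byte chunk has its sign bit set.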
4215 
4216       bind(COMPARE_WIDE_VECTORS);
4217       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4218       vptest(vec1, vec2);
4219       jccb(Assembler::notZero, BREAK_LOOP);
4220       addptr(len, 32);
4221       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4222 
4223       testl(result, 0x0000001f);   // any bytes remaining?
4224       jcc(Assembler::zero, DONE);
4225 
4226       // Quick test using the already prepared vector mask
4227       movl(len, result);
4228       andl(len, 0x0000001f);
4229       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4230       vptest(vec1, vec2);
4231       jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
4233       jmpb(TAIL_START);
4234 
4235       bind(BREAK_LOOP);
4236       // At least one byte in the last 32-byte vector is negative.
4237       // Set up to look at the last 32 bytes as if they were a tail
4238       lea(ary1, Address(ary1, len, Address::times_1));
4239       addptr(result, len);
4240       // Ignore the very last byte: if all others are positive,
4241       // it must be negative, so we can skip right to the 2+1 byte
4242       // end comparison at this point
4243       orl(result, 31);
4244       movl(len, 31);
4245       // Fallthru to tail compare
4246     } else if (UseSSE42Intrinsics) {
4247       // With SSE4.2, use double quad vector compare
4248       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4249 
4250       // Compare 16-byte vectors
4251       testl(len, 0xfffffff0);   // vector count (in bytes)
4252       jcc(Assembler::zero, TAIL_START);
4253 
4254       andl(len, 0xfffffff0);
4255       lea(ary1, Address(ary1, len, Address::times_1));
4256       negptr(len);
4257 
4258       movl(tmp1, 0x80808080);
4259       movdl(vec2, tmp1);
4260       pshufd(vec2, vec2, 0);
4261 
4262       bind(COMPARE_WIDE_VECTORS);
4263       movdqu(vec1, Address(ary1, len, Address::times_1));
4264       ptest(vec1, vec2);
4265       jccb(Assembler::notZero, BREAK_LOOP);
4266       addptr(len, 16);
4267       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4268 
4269       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4270       jcc(Assembler::zero, DONE);
4271 
4272       // Quick test using the already prepared vector mask
4273       movl(len, result);
4274       andl(len, 0x0000000f);   // tail count (in bytes)
4275       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4276       ptest(vec1, vec2);
4277       jcc(Assembler::zero, DONE);
4278       jmpb(TAIL_START);
4279 
4280       bind(BREAK_LOOP);
4281       // At least one byte in the last 16-byte vector is negative.
4282       // Set up and look at the last 16 bytes as if they were a tail
4283       lea(ary1, Address(ary1, len, Address::times_1));
4284       addptr(result, len);
4285       // Ignore the very last byte: if all others are positive,
4286       // it must be negative, so we can skip right to the 2+1 byte
4287       // end comparison at this point
4288       orl(result, 15);
4289       movl(len, 15);
4290       // Fallthru to tail compare
4291     }
4292   }
4293 
4294   bind(TAIL_START);
4295   // Compare 4-byte vectors
4296   andl(len, 0xfffffffc); // vector count (in bytes)
4297   jccb(Assembler::zero, COMPARE_CHAR);
4298 
4299   lea(ary1, Address(ary1, len, Address::times_1));
4300   negptr(len);
4301 
4302   bind(COMPARE_VECTORS);
4303   movl(tmp1, Address(ary1, len, Address::times_1));
4304   andl(tmp1, 0x80808080);
4305   jccb(Assembler::notZero, TAIL_ADJUST);
4306   addptr(len, 4);
4307   jccb(Assembler::notZero, COMPARE_VECTORS);
4308 
4309   // Compare trailing char (final 2-3 bytes), if any
4310   bind(COMPARE_CHAR);
4311 
4312   testl(result, 0x2);   // tail  char
4313   jccb(Assembler::zero, COMPARE_BYTE);
4314   load_unsigned_short(tmp1, Address(ary1, 0));
4315   andl(tmp1, 0x00008080);
4316   jccb(Assembler::notZero, CHAR_ADJUST);
4317   lea(ary1, Address(ary1, 2));
4318 
4319   bind(COMPARE_BYTE);
4320   testl(result, 0x1);   // tail  byte
4321   jccb(Assembler::zero, DONE);
4322   load_unsigned_byte(tmp1, Address(ary1, 0));
4323   testl(tmp1, 0x00000080);
4324   jccb(Assembler::zero, DONE);
4325   subptr(result, 1);
4326   jmpb(DONE);
4327 
4328   bind(TAIL_ADJUST);
  // There is at least one negative byte in the last 4-byte block.
  // Adjust result and check the next three bytes
4331   addptr(result, len);
4332   orl(result, 3);
4333   lea(ary1, Address(ary1, len, Address::times_1));
4334   jmpb(COMPARE_CHAR);
4335 
4336   bind(CHAR_ADJUST);
4337   // We are looking at a char + optional byte tail, and found that one
4338   // of the bytes in the char is negative. Adjust the result, check the
4339   // first byte and readjust if needed.
4340   andl(result, 0xfffffffc);
4341   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4342   jccb(Assembler::notZero, DONE);
4343   addptr(result, 1);
4344 
4345   // That's it
4346   bind(DONE);
4347   if (UseAVX >= 2 && UseSSE >= 2) {
4348     // clean upper bits of YMM registers
4349     vpxor(vec1, vec1);
4350     vpxor(vec2, vec2);
4351   }
4352 }
4353 
4354 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
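// Sets 'result' to 1 if the compared ranges are equal and to 0 otherwise.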
4355 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4356                                       Register limit, Register result, Register chr,
4357                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4358                                       KRegister mask, bool expand_ary2) {
4359   // for expand_ary2, limit is the (smaller) size of the second array.
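  // When expand_ary2 is true, ary1 holds 16-bit elements while ary2 holds bytes
  // that are zero-extended to 16 bits before comparison (hence the times_2
  // scale on ary1 and the vpmovzxbw loads from ary2).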
4360   ShortBranchVerifier sbv(this);
4361   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4362 
4363   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4364          "Expansion only implemented for AVX2");
4365 
4366   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4367   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4368 
4369   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4370   int scaleIncr = expand_ary2 ? 8 : 16;
4371 
4372   if (is_array_equ) {
4373     // Check the input args
4374     cmpoop(ary1, ary2);
4375     jcc(Assembler::equal, TRUE_LABEL);
4376 
4377     // Need additional checks for arrays_equals.
4378     testptr(ary1, ary1);
4379     jcc(Assembler::zero, FALSE_LABEL);
4380     testptr(ary2, ary2);
4381     jcc(Assembler::zero, FALSE_LABEL);
4382 
4383     // Check the lengths
4384     movl(limit, Address(ary1, length_offset));
4385     cmpl(limit, Address(ary2, length_offset));
4386     jcc(Assembler::notEqual, FALSE_LABEL);
4387   }
4388 
4389   // count == 0
4390   testl(limit, limit);
4391   jcc(Assembler::zero, TRUE_LABEL);
4392 
4393   if (is_array_equ) {
4394     // Load array address
4395     lea(ary1, Address(ary1, base_offset));
4396     lea(ary2, Address(ary2, base_offset));
4397   }
4398 
4399   if (is_array_equ && is_char) {
4400     // arrays_equals when used for char[].
    shll(limit, 1);      // convert char count to byte count (still != 0)
4402   }
4403   movl(result, limit); // copy
4404 
4405   if (UseAVX >= 2) {
4406     // With AVX2, use 32-byte vector compare
4407     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4408 
4409     // Compare 32-byte vectors
4410     if (expand_ary2) {
4411       andl(result, 0x0000000f);  //   tail count (in bytes)
4412       andl(limit, 0xfffffff0);   // vector count (in bytes)
4413       jcc(Assembler::zero, COMPARE_TAIL);
4414     } else {
4415       andl(result, 0x0000001f);  //   tail count (in bytes)
4416       andl(limit, 0xffffffe0);   // vector count (in bytes)
4417       jcc(Assembler::zero, COMPARE_TAIL_16);
4418     }
4419 
4420     lea(ary1, Address(ary1, limit, scaleFactor));
4421     lea(ary2, Address(ary2, limit, Address::times_1));
4422     negptr(limit);
4423 
4424 #ifdef _LP64
4425     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4426       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4427 
4428       cmpl(limit, -64);
4429       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4430 
4431       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4432 
4433       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4434       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4435       kortestql(mask, mask);
4436       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4437       addptr(limit, 64);  // update since we already compared at this addr
4438       cmpl(limit, -64);
4439       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4440 
      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit, which are
      // not farther than 64 bytes from the ends of the arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe because we just came from this area
4451       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4452       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4453       kortestql(mask, mask);
4454       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4455 
4456       jmp(TRUE_LABEL);
4457 
4458       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4459 
4460     }//if (VM_Version::supports_avx512vlbw())
4461 #endif //_LP64
4462     bind(COMPARE_WIDE_VECTORS);
4463     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4464     if (expand_ary2) {
4465       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4466     } else {
4467       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4468     }
4469     vpxor(vec1, vec2);
4470 
4471     vptest(vec1, vec1);
4472     jcc(Assembler::notZero, FALSE_LABEL);
4473     addptr(limit, scaleIncr * 2);
4474     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4475 
4476     testl(result, result);
4477     jcc(Assembler::zero, TRUE_LABEL);
4478 
4479     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4480     if (expand_ary2) {
4481       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4482     } else {
4483       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4484     }
4485     vpxor(vec1, vec2);
4486 
4487     vptest(vec1, vec1);
4488     jcc(Assembler::notZero, FALSE_LABEL);
4489     jmp(TRUE_LABEL);
4490 
4491     bind(COMPARE_TAIL_16); // limit is zero
4492     movl(limit, result);
4493 
4494     // Compare 16-byte chunks
4495     andl(result, 0x0000000f);  //   tail count (in bytes)
4496     andl(limit, 0xfffffff0);   // vector count (in bytes)
4497     jcc(Assembler::zero, COMPARE_TAIL);
4498 
4499     lea(ary1, Address(ary1, limit, scaleFactor));
4500     lea(ary2, Address(ary2, limit, Address::times_1));
4501     negptr(limit);
4502 
4503     bind(COMPARE_WIDE_VECTORS_16);
4504     movdqu(vec1, Address(ary1, limit, scaleFactor));
4505     if (expand_ary2) {
4506       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4507     } else {
4508       movdqu(vec2, Address(ary2, limit, Address::times_1));
4509     }
4510     pxor(vec1, vec2);
4511 
4512     ptest(vec1, vec1);
4513     jcc(Assembler::notZero, FALSE_LABEL);
4514     addptr(limit, scaleIncr);
4515     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4516 
4517     bind(COMPARE_TAIL); // limit is zero
4518     movl(limit, result);
4519     // Fallthru to tail compare
4520   } else if (UseSSE42Intrinsics) {
4521     // With SSE4.2, use double quad vector compare
4522     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4523 
4524     // Compare 16-byte vectors
4525     andl(result, 0x0000000f);  //   tail count (in bytes)
4526     andl(limit, 0xfffffff0);   // vector count (in bytes)
4527     jcc(Assembler::zero, COMPARE_TAIL);
4528 
4529     lea(ary1, Address(ary1, limit, Address::times_1));
4530     lea(ary2, Address(ary2, limit, Address::times_1));
4531     negptr(limit);
4532 
4533     bind(COMPARE_WIDE_VECTORS);
4534     movdqu(vec1, Address(ary1, limit, Address::times_1));
4535     movdqu(vec2, Address(ary2, limit, Address::times_1));
4536     pxor(vec1, vec2);
4537 
4538     ptest(vec1, vec1);
4539     jcc(Assembler::notZero, FALSE_LABEL);
4540     addptr(limit, 16);
4541     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4542 
4543     testl(result, result);
4544     jcc(Assembler::zero, TRUE_LABEL);
4545 
4546     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4547     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4548     pxor(vec1, vec2);
4549 
4550     ptest(vec1, vec1);
4551     jccb(Assembler::notZero, FALSE_LABEL);
4552     jmpb(TRUE_LABEL);
4553 
4554     bind(COMPARE_TAIL); // limit is zero
4555     movl(limit, result);
4556     // Fallthru to tail compare
4557   }
4558 
4559   // Compare 4-byte vectors
4560   if (expand_ary2) {
4561     testl(result, result);
4562     jccb(Assembler::zero, TRUE_LABEL);
4563   } else {
4564     andl(limit, 0xfffffffc); // vector count (in bytes)
4565     jccb(Assembler::zero, COMPARE_CHAR);
4566   }
4567 
4568   lea(ary1, Address(ary1, limit, scaleFactor));
4569   lea(ary2, Address(ary2, limit, Address::times_1));
4570   negptr(limit);
4571 
4572   bind(COMPARE_VECTORS);
4573   if (expand_ary2) {
4574     // There are no "vector" operations for bytes to shorts
4575     movzbl(chr, Address(ary2, limit, Address::times_1));
4576     cmpw(Address(ary1, limit, Address::times_2), chr);
4577     jccb(Assembler::notEqual, FALSE_LABEL);
4578     addptr(limit, 1);
4579     jcc(Assembler::notZero, COMPARE_VECTORS);
4580     jmp(TRUE_LABEL);
4581   } else {
4582     movl(chr, Address(ary1, limit, Address::times_1));
4583     cmpl(chr, Address(ary2, limit, Address::times_1));
4584     jccb(Assembler::notEqual, FALSE_LABEL);
4585     addptr(limit, 4);
4586     jcc(Assembler::notZero, COMPARE_VECTORS);
4587   }
4588 
4589   // Compare trailing char (final 2 bytes), if any
4590   bind(COMPARE_CHAR);
4591   testl(result, 0x2);   // tail  char
4592   jccb(Assembler::zero, COMPARE_BYTE);
4593   load_unsigned_short(chr, Address(ary1, 0));
4594   load_unsigned_short(limit, Address(ary2, 0));
4595   cmpl(chr, limit);
4596   jccb(Assembler::notEqual, FALSE_LABEL);
4597 
4598   if (is_array_equ && is_char) {
4599     bind(COMPARE_BYTE);
4600   } else {
4601     lea(ary1, Address(ary1, 2));
4602     lea(ary2, Address(ary2, 2));
4603 
4604     bind(COMPARE_BYTE);
4605     testl(result, 0x1);   // tail  byte
4606     jccb(Assembler::zero, TRUE_LABEL);
4607     load_unsigned_byte(chr, Address(ary1, 0));
4608     load_unsigned_byte(limit, Address(ary2, 0));
4609     cmpl(chr, limit);
4610     jccb(Assembler::notEqual, FALSE_LABEL);
4611   }
4612   bind(TRUE_LABEL);
4613   movl(result, 1);   // return true
4614   jmpb(DONE);
4615 
4616   bind(FALSE_LABEL);
4617   xorl(result, result); // return false
4618 
4619   // That's it
4620   bind(DONE);
4621   if (UseAVX >= 2) {
4622     // clean upper bits of YMM registers
4623     vpxor(vec1, vec1);
4624     vpxor(vec2, vec2);
4625   }
4626 }
4627 
4628 #ifdef _LP64
4629 
4630 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4631 #define __ masm.
4632   Register dst = stub.data<0>();
4633   XMMRegister src = stub.data<1>();
4634   address target = stub.data<2>();
4635   __ bind(stub.entry());
4636   __ subptr(rsp, 8);
4637   __ movdbl(Address(rsp), src);
4638   __ call(RuntimeAddress(target));
4639   __ pop(dst);
4640   __ jmp(stub.continuation());
4641 #undef __
4642 }
4643 
4644 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4645   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4646   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4647 
4648   address slowpath_target;
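  // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 or
  // 0x8000000000000000) when the source is NaN or out of range, so compare
  // against that sentinel and fix the result up in a slow-path stub call.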
4649   if (dst_bt == T_INT) {
4650     if (src_bt == T_FLOAT) {
4651       cvttss2sil(dst, src);
4652       cmpl(dst, 0x80000000);
4653       slowpath_target = StubRoutines::x86::f2i_fixup();
4654     } else {
4655       cvttsd2sil(dst, src);
4656       cmpl(dst, 0x80000000);
4657       slowpath_target = StubRoutines::x86::d2i_fixup();
4658     }
4659   } else {
4660     if (src_bt == T_FLOAT) {
4661       cvttss2siq(dst, src);
4662       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4663       slowpath_target = StubRoutines::x86::f2l_fixup();
4664     } else {
4665       cvttsd2siq(dst, src);
4666       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4667       slowpath_target = StubRoutines::x86::d2l_fixup();
4668     }
4669   }
4670 
4671   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4672   jcc(Assembler::equal, stub->entry());
4673   bind(stub->continuation());
4674 }
4675 
4676 #endif // _LP64
4677 
4678 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4679                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4680   switch(ideal_opc) {
4681     case Op_LShiftVS:
4682       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4683     case Op_LShiftVI:
4684       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4685     case Op_LShiftVL:
4686       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4687     case Op_RShiftVS:
4688       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4689     case Op_RShiftVI:
4690       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4691     case Op_RShiftVL:
4692       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4693     case Op_URShiftVS:
4694       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4695     case Op_URShiftVI:
4696       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4697     case Op_URShiftVL:
4698       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4699     case Op_RotateRightV:
4700       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4701     case Op_RotateLeftV:
4702       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4703     default:
4704       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4705       break;
4706   }
4707 }
4708 
4709 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4710                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4711   if (is_unsigned) {
4712     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4713   } else {
4714     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4715   }
4716 }
4717 
4718 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4719                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4720   switch (elem_bt) {
4721     case T_BYTE:
4722       if (ideal_opc == Op_SaturatingAddV) {
4723         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4724       } else {
4725         assert(ideal_opc == Op_SaturatingSubV, "");
4726         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4727       }
4728       break;
4729     case T_SHORT:
4730       if (ideal_opc == Op_SaturatingAddV) {
4731         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4732       } else {
4733         assert(ideal_opc == Op_SaturatingSubV, "");
4734         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4735       }
4736       break;
4737     default:
4738       fatal("Unsupported type %s", type2name(elem_bt));
4739       break;
4740   }
4741 }
4742 
4743 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4744                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4745   switch (elem_bt) {
4746     case T_BYTE:
4747       if (ideal_opc == Op_SaturatingAddV) {
4748         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4749       } else {
4750         assert(ideal_opc == Op_SaturatingSubV, "");
4751         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4752       }
4753       break;
4754     case T_SHORT:
4755       if (ideal_opc == Op_SaturatingAddV) {
4756         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4757       } else {
4758         assert(ideal_opc == Op_SaturatingSubV, "");
4759         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4760       }
4761       break;
4762     default:
4763       fatal("Unsupported type %s", type2name(elem_bt));
4764       break;
4765   }
4766 }
4767 
4768 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4769                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4770   if (is_unsigned) {
4771     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4772   } else {
4773     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4774   }
4775 }
4776 
4777 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4778                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4779   switch (elem_bt) {
4780     case T_BYTE:
4781       if (ideal_opc == Op_SaturatingAddV) {
4782         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4783       } else {
4784         assert(ideal_opc == Op_SaturatingSubV, "");
4785         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4786       }
4787       break;
4788     case T_SHORT:
4789       if (ideal_opc == Op_SaturatingAddV) {
4790         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4791       } else {
4792         assert(ideal_opc == Op_SaturatingSubV, "");
4793         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4794       }
4795       break;
4796     default:
4797       fatal("Unsupported type %s", type2name(elem_bt));
4798       break;
4799   }
4800 }
4801 
4802 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4803                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4804   switch (elem_bt) {
4805     case T_BYTE:
4806       if (ideal_opc == Op_SaturatingAddV) {
4807         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4808       } else {
4809         assert(ideal_opc == Op_SaturatingSubV, "");
4810         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4811       }
4812       break;
4813     case T_SHORT:
4814       if (ideal_opc == Op_SaturatingAddV) {
4815         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4816       } else {
4817         assert(ideal_opc == Op_SaturatingSubV, "");
4818         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4819       }
4820       break;
4821     default:
4822       fatal("Unsupported type %s", type2name(elem_bt));
4823       break;
4824   }
4825 }
4826 
4827 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4828                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4829                                     bool is_varshift) {
4830   switch (ideal_opc) {
4831     case Op_AddVB:
4832       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4833     case Op_AddVS:
4834       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4835     case Op_AddVI:
4836       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4837     case Op_AddVL:
4838       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4839     case Op_AddVF:
4840       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4841     case Op_AddVD:
4842       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4843     case Op_SubVB:
4844       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4845     case Op_SubVS:
4846       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4847     case Op_SubVI:
4848       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4849     case Op_SubVL:
4850       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4851     case Op_SubVF:
4852       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4853     case Op_SubVD:
4854       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4855     case Op_MulVS:
4856       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4857     case Op_MulVI:
4858       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4859     case Op_MulVL:
4860       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4861     case Op_MulVF:
4862       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4863     case Op_MulVD:
4864       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4865     case Op_DivVF:
4866       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4867     case Op_DivVD:
4868       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4869     case Op_SqrtVF:
4870       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4871     case Op_SqrtVD:
4872       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_AbsVB:
4874       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4875     case Op_AbsVS:
4876       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4877     case Op_AbsVI:
4878       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4879     case Op_AbsVL:
4880       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4881     case Op_FmaVF:
4882       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_FmaVD:
4884       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4885     case Op_VectorRearrange:
4886       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4887     case Op_LShiftVS:
4888       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4889     case Op_LShiftVI:
4890       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4891     case Op_LShiftVL:
4892       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4893     case Op_RShiftVS:
4894       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4895     case Op_RShiftVI:
4896       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4897     case Op_RShiftVL:
4898       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4899     case Op_URShiftVS:
4900       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4901     case Op_URShiftVI:
4902       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4903     case Op_URShiftVL:
4904       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4905     case Op_RotateLeftV:
4906       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4907     case Op_RotateRightV:
4908       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4909     case Op_MaxV:
4910       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4911     case Op_MinV:
4912       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4913     case Op_UMinV:
4914       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4915     case Op_UMaxV:
4916       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4917     case Op_XorV:
4918       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4919     case Op_OrV:
4920       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4921     case Op_AndV:
4922       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4923     default:
4924       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4925       break;
4926   }
4927 }
4928 
4929 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4930                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4931   switch (ideal_opc) {
4932     case Op_AddVB:
4933       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4934     case Op_AddVS:
4935       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4936     case Op_AddVI:
4937       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4938     case Op_AddVL:
4939       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4940     case Op_AddVF:
4941       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4942     case Op_AddVD:
4943       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4944     case Op_SubVB:
4945       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4946     case Op_SubVS:
4947       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4948     case Op_SubVI:
4949       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4950     case Op_SubVL:
4951       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4952     case Op_SubVF:
4953       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4954     case Op_SubVD:
4955       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4956     case Op_MulVS:
4957       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4958     case Op_MulVI:
4959       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4960     case Op_MulVL:
4961       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4962     case Op_MulVF:
4963       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4964     case Op_MulVD:
4965       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4966     case Op_DivVF:
4967       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4968     case Op_DivVD:
4969       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4970     case Op_FmaVF:
4971       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4972     case Op_FmaVD:
4973       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4974     case Op_MaxV:
4975       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4976     case Op_MinV:
4977       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4978     case Op_UMaxV:
4979       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4980     case Op_UMinV:
4981       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4982     case Op_XorV:
4983       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4984     case Op_OrV:
4985       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4986     case Op_AndV:
4987       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4988     default:
4989       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4990       break;
4991   }
4992 }
4993 
4994 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4995                                   KRegister src1, KRegister src2) {
4996   BasicType etype = T_ILLEGAL;
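  // Pick the narrowest k-register operation width (8/16/32/64 bits) that
  // covers mask_len lanes; masks of 2 or 4 lanes still use byte-wide ops.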
4997   switch(mask_len) {
4998     case 2:
4999     case 4:
5000     case 8:  etype = T_BYTE; break;
5001     case 16: etype = T_SHORT; break;
5002     case 32: etype = T_INT; break;
5003     case 64: etype = T_LONG; break;
5004     default: fatal("Unsupported type"); break;
5005   }
5006   assert(etype != T_ILLEGAL, "");
5007   switch(ideal_opc) {
5008     case Op_AndVMask:
5009       kand(etype, dst, src1, src2); break;
5010     case Op_OrVMask:
5011       kor(etype, dst, src1, src2); break;
5012     case Op_XorVMask:
5013       kxor(etype, dst, src1, src2); break;
5014     default:
5015       fatal("Unsupported masked operation"); break;
5016   }
5017 }
5018 
5019 /*
5020  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5021  * If src is NaN, the result is 0.
5022  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5023  * the result is equal to the value of Integer.MIN_VALUE.
5024  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5025  * the result is equal to the value of Integer.MAX_VALUE.
5026  */
5027 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5028                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5029                                                                    Register rscratch, AddressLiteral float_sign_flip,
5030                                                                    int vec_enc) {
5031   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
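  // dst holds the raw converted result; lanes equal to float_sign_flip
  // (0x80000000) flag source values that were NaN, infinite or out of range.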
5032   Label done;
5033   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5034   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5035   vptest(xtmp2, xtmp2, vec_enc);
5036   jccb(Assembler::equal, done);
5037 
5038   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5039   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5040 
5041   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5042   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5043   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5044 
  // Recompute the mask for the remaining special values.
5046   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5047   // Extract SRC values corresponding to TRUE mask lanes.
5048   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of mask lanes corresponding to positive
  // special values is set.
5051   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5052 
5053   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5054   bind(done);
5055 }
5056 
5057 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5058                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5059                                                                     Register rscratch, AddressLiteral float_sign_flip,
5060                                                                     int vec_enc) {
5061   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5062   Label done;
5063   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5064   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5065   kortestwl(ktmp1, ktmp1);
5066   jccb(Assembler::equal, done);
5067 
5068   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5069   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5070   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5071 
5072   kxorwl(ktmp1, ktmp1, ktmp2);
5073   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5074   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5075   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5076   bind(done);
5077 }
5078 
5079 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5080                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5081                                                                      Register rscratch, AddressLiteral double_sign_flip,
5082                                                                      int vec_enc) {
5083   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5084 
5085   Label done;
5086   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5087   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5088   kortestwl(ktmp1, ktmp1);
5089   jccb(Assembler::equal, done);
5090 
5091   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5092   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5093   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5094 
5095   kxorwl(ktmp1, ktmp1, ktmp2);
5096   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5097   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5098   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5099   bind(done);
5100 }
5101 
5102 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5103                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5104                                                                      Register rscratch, AddressLiteral float_sign_flip,
5105                                                                      int vec_enc) {
5106   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5107   Label done;
5108   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5109   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5110   kortestwl(ktmp1, ktmp1);
5111   jccb(Assembler::equal, done);
5112 
5113   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5114   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5115   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5116 
5117   kxorwl(ktmp1, ktmp1, ktmp2);
5118   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5119   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5120   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5121   bind(done);
5122 }
5123 
5124 /*
5125  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5126  * If src is NaN, the result is 0.
5127  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5128  * the result is equal to the value of Long.MIN_VALUE.
5129  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5130  * the result is equal to the value of Long.MAX_VALUE.
5131  */
5132 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5133                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5134                                                                       Register rscratch, AddressLiteral double_sign_flip,
5135                                                                       int vec_enc) {
5136   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5137 
5138   Label done;
5139   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5140   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5141   kortestwl(ktmp1, ktmp1);
5142   jccb(Assembler::equal, done);
5143 
5144   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5145   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5146   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5147 
5148   kxorwl(ktmp1, ktmp1, ktmp2);
5149   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5150   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5151   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5152   bind(done);
5153 }
5154 
5155 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5156                                                              XMMRegister xtmp, int index, int vec_enc) {
5157    assert(vec_enc < Assembler::AVX_512bit, "");
5158    if (vec_enc == Assembler::AVX_256bit) {
5159      vextractf128_high(xtmp, src);
5160      vshufps(dst, src, xtmp, index, vec_enc);
5161    } else {
5162      vshufps(dst, src, zero, index, vec_enc);
5163    }
5164 }
5165 
5166 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5167                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5168                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5169   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5170 
5171   Label done;
5172   // Compare the destination lanes with float_sign_flip
5173   // value to get mask for all special values.
5174   movdqu(xtmp1, float_sign_flip, rscratch);
5175   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5176   ptest(xtmp2, xtmp2);
5177   jccb(Assembler::equal, done);
5178 
5179   // Flip float_sign_flip to get max integer value.
5180   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5181   pxor(xtmp1, xtmp4);
5182 
  // Set destination lanes corresponding to unordered source lanes to zero.
5184   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5185   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5186 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5188   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5189   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5190 
  // Recompute the mask for the remaining special values.
5192   pxor(xtmp2, xtmp3);
5193   // Extract mask corresponding to non-negative source lanes.
5194   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5195 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5197   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5198   pand(xtmp3, xtmp2);
5199 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
5202   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5203   bind(done);
5204 }
5205 
5206 
5207 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5208                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
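  // The lanes are masked to the target width first, so the unsigned saturating
  // packs below behave as a plain truncation of each 32-bit lane.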
5209   switch(to_elem_bt) {
5210     case T_SHORT:
5211       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5212       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5213       vpackusdw(dst, dst, zero, vec_enc);
5214       if (vec_enc == Assembler::AVX_256bit) {
5215         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5216       }
5217       break;
5218     case  T_BYTE:
5219       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5220       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5221       vpackusdw(dst, dst, zero, vec_enc);
5222       if (vec_enc == Assembler::AVX_256bit) {
5223         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5224       }
5225       vpackuswb(dst, dst, zero, vec_enc);
5226       break;
5227     default: assert(false, "%s", type2name(to_elem_bt));
5228   }
5229 }
5230 
5231 /*
5232  * Algorithm for vector D2L and F2I conversions:-
5233  * a) Perform vector D2L/F2I cast.
5234  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5235  *    It signifies that source value could be any of the special floating point
5236  *    values(NaN,-Inf,Inf,Max,-Min).
5237  * c) Set destination to zero if source is NaN value.
5238  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5239  */
5240 
5241 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5242                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5243                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5244   int to_elem_sz = type2aelembytes(to_elem_bt);
5245   assert(to_elem_sz <= 4, "");
5246   vcvttps2dq(dst, src, vec_enc);
5247   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5248   if (to_elem_sz < 4) {
5249     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5250     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5251   }
5252 }
5253 
5254 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5255                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5256                                             Register rscratch, int vec_enc) {
5257   int to_elem_sz = type2aelembytes(to_elem_bt);
5258   assert(to_elem_sz <= 4, "");
5259   vcvttps2dq(dst, src, vec_enc);
5260   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5261   switch(to_elem_bt) {
5262     case T_INT:
5263       break;
5264     case T_SHORT:
5265       evpmovdw(dst, dst, vec_enc);
5266       break;
5267     case T_BYTE:
5268       evpmovdb(dst, dst, vec_enc);
5269       break;
5270     default: assert(false, "%s", type2name(to_elem_bt));
5271   }
5272 }
5273 
5274 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5275                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5276                                             Register rscratch, int vec_enc) {
5277   evcvttps2qq(dst, src, vec_enc);
5278   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5279 }
5280 
5281 // Handling for downcasting from double to integer or sub-word types on AVX2.
5282 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5283                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5284                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5285   int to_elem_sz = type2aelembytes(to_elem_bt);
5286   assert(to_elem_sz < 8, "");
5287   vcvttpd2dq(dst, src, vec_enc);
5288   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5289                                               float_sign_flip, vec_enc);
5290   if (to_elem_sz < 4) {
5291     // xtmp4 holds all zero lanes.
5292     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5293   }
5294 }
5295 
5296 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5297                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5298                                             KRegister ktmp2, AddressLiteral sign_flip,
5299                                             Register rscratch, int vec_enc) {
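  // With AVX512DQ a direct double->long conversion is available; otherwise the
  // doubles are truncated to 32-bit ints first and then narrowed as needed.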
5300   if (VM_Version::supports_avx512dq()) {
5301     evcvttpd2qq(dst, src, vec_enc);
5302     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5303     switch(to_elem_bt) {
5304       case T_LONG:
5305         break;
5306       case T_INT:
5307         evpmovsqd(dst, dst, vec_enc);
5308         break;
5309       case T_SHORT:
5310         evpmovsqd(dst, dst, vec_enc);
5311         evpmovdw(dst, dst, vec_enc);
5312         break;
5313       case T_BYTE:
5314         evpmovsqd(dst, dst, vec_enc);
5315         evpmovdb(dst, dst, vec_enc);
5316         break;
5317       default: assert(false, "%s", type2name(to_elem_bt));
5318     }
5319   } else {
5320     assert(type2aelembytes(to_elem_bt) <= 4, "");
5321     vcvttpd2dq(dst, src, vec_enc);
5322     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5323     switch(to_elem_bt) {
5324       case T_INT:
5325         break;
5326       case T_SHORT:
5327         evpmovdw(dst, dst, vec_enc);
5328         break;
5329       case T_BYTE:
5330         evpmovdb(dst, dst, vec_enc);
5331         break;
5332       default: assert(false, "%s", type2name(to_elem_bt));
5333     }
5334   }
5335 }
5336 
5337 #ifdef _LP64
5338 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5339                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5340                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5343   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
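  // floor(val + 0.5) under round-towards -inf: e.g. -2.5 + 0.5 = -2.0 stays -2,
  // while 2.5 + 0.5 = 3.0 gives 3, so halfway values round towards +inf.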
5344 
5345   mov64(tmp, julong_cast(0.5L));
5346   evpbroadcastq(xtmp1, tmp, vec_enc);
5347   vaddpd(xtmp1, src , xtmp1, vec_enc);
5348   evcvtpd2qq(dst, xtmp1, vec_enc);
5349   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5351 
5352   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5353 }
5354 
5355 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5356                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5357                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5360   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5361 
5362   movl(tmp, jint_cast(0.5));
5363   movq(xtmp1, tmp);
5364   vbroadcastss(xtmp1, xtmp1, vec_enc);
5365   vaddps(xtmp1, src , xtmp1, vec_enc);
5366   vcvtps2dq(dst, xtmp1, vec_enc);
5367   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5368                                               float_sign_flip, vec_enc);
5369 
5370   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5371 }
5372 
5373 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5374                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5375                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5378   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5379 
5380   movl(tmp, jint_cast(0.5));
5381   movq(xtmp1, tmp);
5382   vbroadcastss(xtmp1, xtmp1, vec_enc);
5383   vaddps(xtmp1, src , xtmp1, vec_enc);
5384   vcvtps2dq(dst, xtmp1, vec_enc);
5385   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5386 
5387   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5388 }
5389 #endif // _LP64
5390 
5391 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5392                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5393   switch (from_elem_bt) {
5394     case T_BYTE:
5395       switch (to_elem_bt) {
5396         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5397         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5398         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5399         default: ShouldNotReachHere();
5400       }
5401       break;
5402     case T_SHORT:
5403       switch (to_elem_bt) {
5404         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5405         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5406         default: ShouldNotReachHere();
5407       }
5408       break;
5409     case T_INT:
5410       assert(to_elem_bt == T_LONG, "");
5411       vpmovzxdq(dst, src, vlen_enc);
5412       break;
5413     default:
5414       ShouldNotReachHere();
5415   }
5416 }
5417 
5418 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5419                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5420   switch (from_elem_bt) {
5421     case T_BYTE:
5422       switch (to_elem_bt) {
5423         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5424         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5425         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5426         default: ShouldNotReachHere();
5427       }
5428       break;
5429     case T_SHORT:
5430       switch (to_elem_bt) {
5431         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5432         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5433         default: ShouldNotReachHere();
5434       }
5435       break;
5436     case T_INT:
5437       assert(to_elem_bt == T_LONG, "");
5438       vpmovsxdq(dst, src, vlen_enc);
5439       break;
5440     default:
5441       ShouldNotReachHere();
5442   }
5443 }
5444 
5445 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5446                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5447   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5448   assert(vlen_enc != AVX_512bit, "");
5449 
5450   int dst_bt_size = type2aelembytes(dst_bt);
5451   int src_bt_size = type2aelembytes(src_bt);
5452   if (dst_bt_size > src_bt_size) {
5453     switch (dst_bt_size / src_bt_size) {
5454       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5455       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5456       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5457       default: ShouldNotReachHere();
5458     }
5459   } else {
5460     assert(dst_bt_size < src_bt_size, "");
5461     switch (src_bt_size / dst_bt_size) {
5462       case 2: {
5463         if (vlen_enc == AVX_128bit) {
5464           vpacksswb(dst, src, src, vlen_enc);
5465         } else {
5466           vpacksswb(dst, src, src, vlen_enc);
5467           vpermq(dst, dst, 0x08, vlen_enc);
5468         }
5469         break;
5470       }
5471       case 4: {
5472         if (vlen_enc == AVX_128bit) {
5473           vpackssdw(dst, src, src, vlen_enc);
5474           vpacksswb(dst, dst, dst, vlen_enc);
5475         } else {
5476           vpackssdw(dst, src, src, vlen_enc);
5477           vpermq(dst, dst, 0x08, vlen_enc);
5478           vpacksswb(dst, dst, dst, AVX_128bit);
5479         }
5480         break;
5481       }
5482       case 8: {
5483         if (vlen_enc == AVX_128bit) {
5484           vpshufd(dst, src, 0x08, vlen_enc);
5485           vpackssdw(dst, dst, dst, vlen_enc);
5486           vpacksswb(dst, dst, dst, vlen_enc);
5487         } else {
5488           vpshufd(dst, src, 0x08, vlen_enc);
5489           vpermq(dst, dst, 0x08, vlen_enc);
5490           vpackssdw(dst, dst, dst, AVX_128bit);
5491           vpacksswb(dst, dst, dst, AVX_128bit);
5492         }
5493         break;
5494       }
5495       default: ShouldNotReachHere();
5496     }
5497   }
5498 }
5499 
5500 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5501                                    bool merge, BasicType bt, int vlen_enc) {
5502   if (bt == T_INT) {
5503     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5504   } else {
5505     assert(bt == T_LONG, "");
5506     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5507   }
5508 }
5509 
5510 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5511                                    bool merge, BasicType bt, int vlen_enc) {
5512   if (bt == T_INT) {
5513     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5514   } else {
5515     assert(bt == T_LONG, "");
5516     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5517   }
5518 }
5519 
5520 #ifdef _LP64
5521 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5522                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5523                                                int vec_enc) {
5524   int index = 0;
5525   int vindex = 0;
5526   mov64(rtmp1, 0x0101010101010101L);
5527   pdepq(rtmp1, src, rtmp1);
5528   if (mask_len > 8) {
5529     movq(rtmp2, src);
5530     vpxor(xtmp, xtmp, xtmp, vec_enc);
5531     movq(xtmp, rtmp1);
5532   }
5533   movq(dst, rtmp1);
5534 
5535   mask_len -= 8;
5536   while (mask_len > 0) {
5537     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5538     index++;
5539     if ((index % 2) == 0) {
5540       pxor(xtmp, xtmp);
5541     }
5542     mov64(rtmp1, 0x0101010101010101L);
5543     shrq(rtmp2, 8);
5544     pdepq(rtmp1, rtmp2, rtmp1);
5545     pinsrq(xtmp, rtmp1, index % 2);
5546     vindex = index / 2;
5547     if (vindex) {
5548       // Write the entire 16 byte vector when both 64 bit
5549       // lanes are updated to save redundant instructions.
5550       if (index % 2) {
5551         vinsertf128(dst, dst, xtmp, vindex);
5552       }
5553     } else {
5554       vmovdqu(dst, xtmp);
5555     }
5556     mask_len -= 8;
5557   }
5558 }
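     //
     // The PDEP step above spreads one mask bit per byte lane: depositing the scalar mask into the
     // pattern 0x0101010101010101 places bit i of the mask at bit position 8*i, so each byte of the
     // result is 0x00 or 0x01. Illustrative scalar sketch (BMI2 intrinsic; helper name is hypothetical):
     //
     //   #include <immintrin.h>
     //   uint64_t mask_bits_to_bytes(uint64_t mask) {
     //     return _pdep_u64(mask, 0x0101010101010101ULL);  // bit i -> bit 8*i
     //   }
     //   // e.g. mask = 0b1011 -> bytes {01, 01, 00, 01, 00, 00, 00, 00} (lowest byte first)
     //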
5559 
5560 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5561   switch(opc) {
5562     case Op_VectorMaskTrueCount:
5563       popcntq(dst, tmp);
5564       break;
5565     case Op_VectorMaskLastTrue:
5566       if (VM_Version::supports_lzcnt()) {
5567         lzcntq(tmp, tmp);
5568         movl(dst, 63);
5569         subl(dst, tmp);
5570       } else {
5571         movl(dst, -1);
5572         bsrq(tmp, tmp);
5573         cmov32(Assembler::notZero, dst, tmp);
5574       }
5575       break;
5576     case Op_VectorMaskFirstTrue:
5577       if (VM_Version::supports_bmi1()) {
5578         if (masklen < 32) {
5579           orl(tmp, 1 << masklen);
5580           tzcntl(dst, tmp);
5581         } else if (masklen == 32) {
5582           tzcntl(dst, tmp);
5583         } else {
5584           assert(masklen == 64, "");
5585           tzcntq(dst, tmp);
5586         }
5587       } else {
5588         if (masklen < 32) {
5589           orl(tmp, 1 << masklen);
5590           bsfl(dst, tmp);
5591         } else {
5592           assert(masklen == 32 || masklen == 64, "");
5593           movl(dst, masklen);
5594           if (masklen == 32)  {
5595             bsfl(tmp, tmp);
5596           } else {
5597             bsfq(tmp, tmp);
5598           }
5599           cmov32(Assembler::notZero, dst, tmp);
5600         }
5601       }
5602       break;
5603     case Op_VectorMaskToLong:
5604       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5605       break;
5606     default: assert(false, "Unhandled mask operation");
5607   }
5608 }
5609 
5610 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5611                                               int masklen, int masksize, int vec_enc) {
5612   assert(VM_Version::supports_popcnt(), "");
5613 
5614   if (VM_Version::supports_avx512bw()) {
5615     kmovql(tmp, mask);
5616   } else {
5617     assert(masklen <= 16, "");
5618     kmovwl(tmp, mask);
5619   }
5620 
5621   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5622   // operations needs to be clipped.
5623   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5624     andq(tmp, (1 << masklen) - 1);
5625   }
5626 
5627   vector_mask_operation_helper(opc, dst, tmp, masklen);
5628 }
5629 
5630 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5631                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5632   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5633          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5634   assert(VM_Version::supports_popcnt(), "");
5635 
5636   bool need_clip = false;
5637   switch(bt) {
5638     case T_BOOLEAN:
5639       // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
5640       vpxor(xtmp, xtmp, xtmp, vec_enc);
5641       vpsubb(xtmp, xtmp, mask, vec_enc);
5642       vpmovmskb(tmp, xtmp, vec_enc);
5643       need_clip = masklen < 16;
5644       break;
5645     case T_BYTE:
5646       vpmovmskb(tmp, mask, vec_enc);
5647       need_clip = masklen < 16;
5648       break;
5649     case T_SHORT:
5650       vpacksswb(xtmp, mask, mask, vec_enc);
5651       if (masklen >= 16) {
5652         vpermpd(xtmp, xtmp, 8, vec_enc);
5653       }
5654       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5655       need_clip = masklen < 16;
5656       break;
5657     case T_INT:
5658     case T_FLOAT:
5659       vmovmskps(tmp, mask, vec_enc);
5660       need_clip = masklen < 4;
5661       break;
5662     case T_LONG:
5663     case T_DOUBLE:
5664       vmovmskpd(tmp, mask, vec_enc);
5665       need_clip = masklen < 2;
5666       break;
5667     default: assert(false, "Unhandled type, %s", type2name(bt));
5668   }
5669 
5670   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5671   // operations needs to be clipped.
5672   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5673     // need_clip implies masklen < 32
5674     andq(tmp, (1 << masklen) - 1);
5675   }
5676 
5677   vector_mask_operation_helper(opc, dst, tmp, masklen);
5678 }
5679 
5680 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5681                                              Register rtmp2, int mask_len) {
5682   kmov(rtmp1, src);
5683   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5684   mov64(rtmp2, -1L);
5685   pextq(rtmp2, rtmp2, rtmp1);
5686   kmov(dst, rtmp2);
5687 }
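     //
     // The PEXT step above turns an arbitrary mask into a "compressed" mask of contiguous low bits:
     // extracting from an all-ones source at the set positions of rtmp1 yields a mask of
     // popcount(rtmp1) low bits. Illustrative scalar sketch (BMI2 intrinsic; helper name is hypothetical):
     //
     //   #include <immintrin.h>
     //   uint64_t compress_mask(uint64_t m, int mask_len) {
     //     m &= ~0ULL >> (64 - mask_len);   // clip to mask_len bits
     //     return _pext_u64(~0ULL, m);      // a run of popcount(m) low set bits
     //   }
     //   // e.g. m = 0b1010 -> 0b0011: the two surviving lanes map to the two lowest mask bits
     //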
5688 
5689 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5690                                                     XMMRegister mask, Register rtmp, Register rscratch,
5691                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5692                                                     int vec_enc) {
5693   assert(type2aelembytes(bt) >= 4, "");
5694   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5695   address compress_perm_table = nullptr;
5696   address expand_perm_table = nullptr;
5697   if (type2aelembytes(bt) == 8) {
5698     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5699     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5700     vmovmskpd(rtmp, mask, vec_enc);
5701   } else {
5702     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5703     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5704     vmovmskps(rtmp, mask, vec_enc);
5705   }
5706   shlq(rtmp, 5); // for 32 byte permute row.
5707   if (opcode == Op_CompressV) {
5708     lea(rscratch, ExternalAddress(compress_perm_table));
5709   } else {
5710     lea(rscratch, ExternalAddress(expand_perm_table));
5711   }
5712   addptr(rtmp, rscratch);
5713   vmovdqu(permv, Address(rtmp));
5714   vpermps(dst, permv, src, Assembler::AVX_256bit);
5715   vpxor(xtmp, xtmp, xtmp, vec_enc);
5716   // Blend the result with zero vector using permute mask, each column entry
5717   // in a permute table row contains either a valid permute index or a -1 (default)
5718   // value, this can potentially be used as a blending mask after
5719   // compressing/expanding the source vector lanes.
5720   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5721 }
5722 
5723 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5724                                                bool merge, BasicType bt, int vec_enc) {
5725   if (opcode == Op_CompressV) {
5726     switch(bt) {
5727     case T_BYTE:
5728       evpcompressb(dst, mask, src, merge, vec_enc);
5729       break;
5730     case T_CHAR:
5731     case T_SHORT:
5732       evpcompressw(dst, mask, src, merge, vec_enc);
5733       break;
5734     case T_INT:
5735       evpcompressd(dst, mask, src, merge, vec_enc);
5736       break;
5737     case T_FLOAT:
5738       evcompressps(dst, mask, src, merge, vec_enc);
5739       break;
5740     case T_LONG:
5741       evpcompressq(dst, mask, src, merge, vec_enc);
5742       break;
5743     case T_DOUBLE:
5744       evcompresspd(dst, mask, src, merge, vec_enc);
5745       break;
5746     default:
5747       fatal("Unsupported type %s", type2name(bt));
5748       break;
5749     }
5750   } else {
5751     assert(opcode == Op_ExpandV, "");
5752     switch(bt) {
5753     case T_BYTE:
5754       evpexpandb(dst, mask, src, merge, vec_enc);
5755       break;
5756     case T_CHAR:
5757     case T_SHORT:
5758       evpexpandw(dst, mask, src, merge, vec_enc);
5759       break;
5760     case T_INT:
5761       evpexpandd(dst, mask, src, merge, vec_enc);
5762       break;
5763     case T_FLOAT:
5764       evexpandps(dst, mask, src, merge, vec_enc);
5765       break;
5766     case T_LONG:
5767       evpexpandq(dst, mask, src, merge, vec_enc);
5768       break;
5769     case T_DOUBLE:
5770       evexpandpd(dst, mask, src, merge, vec_enc);
5771       break;
5772     default:
5773       fatal("Unsupported type %s", type2name(bt));
5774       break;
5775     }
5776   }
5777 }
5778 #endif
5779 
5780 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5781                                            KRegister ktmp1, int vec_enc) {
5782   if (opcode == Op_SignumVD) {
5783     vsubpd(dst, zero, one, vec_enc);
5784     // if src < 0 ? -1 : 1
5785     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5786     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5787     // if src == NaN, -0.0 or 0.0 return src.
5788     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5789     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5790   } else {
5791     assert(opcode == Op_SignumVF, "");
5792     vsubps(dst, zero, one, vec_enc);
5793     // if src < 0 ? -1 : 1
5794     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5795     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5796     // if src == NaN, -0.0 or 0.0 return src.
5797     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5798     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5799   }
5800 }
5801 
5802 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5803                                           XMMRegister xtmp1, int vec_enc) {
5804   if (opcode == Op_SignumVD) {
5805     vsubpd(dst, zero, one, vec_enc);
5806     // if src < 0 ? -1 : 1
5807     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5808     // if src == NaN, -0.0 or 0.0 return src.
5809     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5810     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5811   } else {
5812     assert(opcode == Op_SignumVF, "");
5813     vsubps(dst, zero, one, vec_enc);
5814     // if src < 0 ? -1 : 1
5815     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5816     // if src == NaN, -0.0 or 0.0 return src.
5817     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5818     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5819   }
5820 }
5821 
5822 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5823   if (VM_Version::supports_avx512bw()) {
5824     if (mask_len > 32) {
5825       kmovql(dst, src);
5826     } else {
5827       kmovdl(dst, src);
5828       if (mask_len != 32) {
5829         kshiftrdl(dst, dst, 32 - mask_len);
5830       }
5831     }
5832   } else {
5833     assert(mask_len <= 16, "");
5834     kmovwl(dst, src);
5835     if (mask_len != 16) {
5836       kshiftrwl(dst, dst, 16 - mask_len);
5837     }
5838   }
5839 }
5840 
5841 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5842   int lane_size = type2aelembytes(bt);
5843   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5844   if ((is_LP64 || lane_size < 8) &&
5845       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5846        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5847     movptr(rtmp, imm32);
5848     switch(lane_size) {
5849       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5850       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5851       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5852       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5853       default: fatal("Unsupported lane size %d", lane_size);
5854                break;
5855     }
5856   } else {
5857     movptr(rtmp, imm32);
5858     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5859     switch(lane_size) {
5860       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5861       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5862       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5863       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5864       default: fatal("Unsupported lane size %d", lane_size);
5865                break;
5866     }
5867   }
5868 }
5869 
5870 //
5871 // Following is a lookup table based popcount computation algorithm:-
5872 //       Index   Bit set count
5873 //     [ 0000 ->   0,
5874 //       0001 ->   1,
5875 //       0010 ->   1,
5876 //       0011 ->   2,
5877 //       0100 ->   1,
5878 //       0101 ->   2,
5879 //       0110 ->   2,
5880 //       0111 ->   3,
5881 //       1000 ->   1,
5882 //       1001 ->   2,
5883 //       1010 ->   2,
5884 //       1011 ->   3,
5885 //       1100 ->   2,
5886 //       1101 ->   3,
     //       1110 ->   3,
5887 //       1111 ->   4 ]
5888 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5889 //     shuffle indices for lookup table access.
5890 //  b. Right shift each byte of vector lane by 4 positions.
5891 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5892 //     shuffle indices for lookup table access.
5893 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5894 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5895 //     count of all the bytes of a quadword.
5896 //  f. Perform step e. for upper 128bit vector lane.
5897 //  g. Pack the bitset count of quadwords back to double word.
5898 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
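     //
     // Illustrative scalar sketch of steps a.-d. for a single byte (the vector code below performs the
     // same lookup for every byte lane via vpshufb against a 16-entry table held in xtmp2; the helper
     // name is for exposition only):
     //
     //   static const uint8_t popcnt_lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
     //   uint8_t popcount_byte(uint8_t b) {
     //     return popcnt_lut[b & 0x0F] + popcnt_lut[b >> 4];   // steps a.-d.
     //   }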
5899 
5900 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5901                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5902   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5903   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5904   vpsrlw(dst, src, 4, vec_enc);
5905   vpand(dst, dst, xtmp1, vec_enc);
5906   vpand(xtmp1, src, xtmp1, vec_enc);
5907   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5908   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5909   vpshufb(dst, xtmp2, dst, vec_enc);
5910   vpaddb(dst, dst, xtmp1, vec_enc);
5911 }
5912 
5913 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5914                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5915   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5916   // Following code is as per steps e, f, g and h of the above algorithm.
5917   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5918   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5919   vpsadbw(dst, dst, xtmp2, vec_enc);
5920   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5921   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5922   vpackuswb(dst, xtmp1, dst, vec_enc);
5923 }
5924 
5925 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5926                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5927   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5928   // Add the popcount of upper and lower bytes of word.
5929   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5930   vpsrlw(dst, xtmp1, 8, vec_enc);
5931   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5932   vpaddw(dst, dst, xtmp1, vec_enc);
5933 }
5934 
5935 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5936                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5937   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5938   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5939   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5940 }
5941 
5942 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5943                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5944   switch(bt) {
5945     case T_LONG:
5946       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5947       break;
5948     case T_INT:
5949       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5950       break;
5951     case T_CHAR:
5952     case T_SHORT:
5953       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5954       break;
5955     case T_BYTE:
5956     case T_BOOLEAN:
5957       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5958       break;
5959     default:
5960       fatal("Unsupported type %s", type2name(bt));
5961       break;
5962   }
5963 }
5964 
5965 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5966                                                       KRegister mask, bool merge, int vec_enc) {
5967   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5968   switch(bt) {
5969     case T_LONG:
5970       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5971       evpopcntq(dst, mask, src, merge, vec_enc);
5972       break;
5973     case T_INT:
5974       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5975       evpopcntd(dst, mask, src, merge, vec_enc);
5976       break;
5977     case T_CHAR:
5978     case T_SHORT:
5979       assert(VM_Version::supports_avx512_bitalg(), "");
5980       evpopcntw(dst, mask, src, merge, vec_enc);
5981       break;
5982     case T_BYTE:
5983     case T_BOOLEAN:
5984       assert(VM_Version::supports_avx512_bitalg(), "");
5985       evpopcntb(dst, mask, src, merge, vec_enc);
5986       break;
5987     default:
5988       fatal("Unsupported type %s", type2name(bt));
5989       break;
5990   }
5991 }
5992 
5993 #ifndef _LP64
5994 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5995   assert(VM_Version::supports_avx512bw(), "");
5996   kmovdl(tmp, src);
5997   kunpckdql(dst, tmp, tmp);
5998 }
5999 #endif
6000 
6001 // Bit reversal algorithm first reverses the bits of each byte followed by
6002 // a byte level reversal for multi-byte primitive types (short/int/long).
6003 // Algorithm performs a lookup table access to get reverse bit sequence
6004 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
6005 // is obtained by swapping the reverse bit sequences of upper and lower
6006 // nibble of a byte.
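     //
     // Illustrative scalar sketch of the nibble lookup (the vector code performs the same lookup for
     // every byte lane via vpshufb against vector_reverse_bit_lut(); names are for exposition only):
     //
     //   static const uint8_t rev4[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
     //                                    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
     //   uint8_t reverse_bits_in_byte(uint8_t b) {
     //     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
     //   }
     //   // For short/int/long, a byte level reversal (vector_reverse_byte) then completes the job.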
6007 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6008                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6009   if (VM_Version::supports_avx512vlbw()) {
6010 
6011     // Get the reverse bit sequence of lower nibble of each byte.
6012     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6013     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6014     evpandq(dst, xtmp2, src, vec_enc);
6015     vpshufb(dst, xtmp1, dst, vec_enc);
6016     vpsllq(dst, dst, 4, vec_enc);
6017 
6018     // Get the reverse bit sequence of upper nibble of each byte.
6019     vpandn(xtmp2, xtmp2, src, vec_enc);
6020     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6021     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6022 
6023     // Perform a logical OR operation between the left shifted reverse bit sequence of the lower
6024     // nibble and the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6025     evporq(xtmp2, dst, xtmp2, vec_enc);
6026     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6027 
6028   } else if (vec_enc == Assembler::AVX_512bit) {
6029     // Shift based bit reversal.
6030     assert(bt == T_LONG || bt == T_INT, "");
6031 
6032     // Swap lower and upper nibble of each byte.
6033     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6034 
6035     // Swap two least and most significant bits of each nibble.
6036     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6037 
6038     // Swap adjacent pair of bits.
6039     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6040     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6041 
6042     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6043     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6044   } else {
6045     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6046     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6047 
6048     // Get the reverse bit sequence of lower nibble of each byte.
6049     vpand(dst, xtmp2, src, vec_enc);
6050     vpshufb(dst, xtmp1, dst, vec_enc);
6051     vpsllq(dst, dst, 4, vec_enc);
6052 
6053     // Get the reverse bit sequence of upper nibble of each byte.
6054     vpandn(xtmp2, xtmp2, src, vec_enc);
6055     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6056     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6057 
6058     // Perform a logical OR operation between the left shifted reverse bit sequence of the lower
6059     // nibble and the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6060     vpor(xtmp2, dst, xtmp2, vec_enc);
6061     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6062   }
6063 }
6064 
6065 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6066                                                 XMMRegister xtmp, Register rscratch) {
6067   assert(VM_Version::supports_gfni(), "");
6068   assert(rscratch != noreg || always_reachable(mask), "missing");
6069 
6070   // Galois field instruction based bit reversal based on following algorithm.
6071   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6072   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6073   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6074   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6075 }
6076 
6077 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6078                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6079   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6080   evpandq(dst, xtmp1, src, vec_enc);
6081   vpsllq(dst, dst, nbits, vec_enc);
6082   vpandn(xtmp1, xtmp1, src, vec_enc);
6083   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6084   evporq(dst, dst, xtmp1, vec_enc);
6085 }
6086 
6087 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6088                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6089   // Shift based bit reversal.
6090   assert(VM_Version::supports_evex(), "");
6091   switch(bt) {
6092     case T_LONG:
6093       // Swap upper and lower double word of each quad word.
6094       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6095       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6096       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6097       break;
6098     case T_INT:
6099       // Swap upper and lower word of each double word.
6100       evprord(xtmp1, k0, src, 16, true, vec_enc);
6101       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6102       break;
6103     case T_CHAR:
6104     case T_SHORT:
6105       // Swap upper and lower byte of each word.
6106       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6107       break;
6108     case T_BYTE:
6109       evmovdquq(dst, k0, src, true, vec_enc);
6110       break;
6111     default:
6112       fatal("Unsupported type %s", type2name(bt));
6113       break;
6114   }
6115 }
6116 
6117 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6118   if (bt == T_BYTE) {
6119     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6120       evmovdquq(dst, k0, src, true, vec_enc);
6121     } else {
6122       vmovdqu(dst, src);
6123     }
6124     return;
6125   }
6126   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6127   // pre-computed shuffle indices.
6128   switch(bt) {
6129     case T_LONG:
6130       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6131       break;
6132     case T_INT:
6133       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6134       break;
6135     case T_CHAR:
6136     case T_SHORT:
6137       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6138       break;
6139     default:
6140       fatal("Unsupported type %s", type2name(bt));
6141       break;
6142   }
6143   vpshufb(dst, src, dst, vec_enc);
6144 }
6145 
6146 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6147                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6148                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6149   assert(is_integral_type(bt), "");
6150   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6151   assert(VM_Version::supports_avx512cd(), "");
6152   switch(bt) {
6153     case T_LONG:
6154       evplzcntq(dst, ktmp, src, merge, vec_enc);
6155       break;
6156     case T_INT:
6157       evplzcntd(dst, ktmp, src, merge, vec_enc);
6158       break;
6159     case T_SHORT:
6160       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6161       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6162       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6163       vpunpckhwd(dst, xtmp1, src, vec_enc);
6164       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6165       vpackusdw(dst, xtmp2, dst, vec_enc);
6166       break;
6167     case T_BYTE:
6168       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6169       // accessing the lookup table.
6170       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6171       // accessing the lookup table.
6172       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6173       assert(VM_Version::supports_avx512bw(), "");
6174       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6175       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6176       vpand(xtmp2, dst, src, vec_enc);
6177       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6178       vpsrlw(xtmp3, src, 4, vec_enc);
6179       vpand(xtmp3, dst, xtmp3, vec_enc);
6180       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6181       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6182       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6183       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6184       break;
6185     default:
6186       fatal("Unsupported type %s", type2name(bt));
6187       break;
6188   }
6189 }
6190 
6191 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6192                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6193   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6194   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6195   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6196   // accessing the lookup table.
6197   vpand(dst, xtmp2, src, vec_enc);
6198   vpshufb(dst, xtmp1, dst, vec_enc);
6199   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6200   // accessing the lookup table.
6201   vpsrlw(xtmp3, src, 4, vec_enc);
6202   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6203   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6204   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6205   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6206   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6207   vpaddb(dst, dst, xtmp2, vec_enc);
6208   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6209 }
6210 
6211 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6212                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6213   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6214   // Add zero counts of lower byte and upper byte of a word if
6215   // upper byte holds a zero value.
6216   vpsrlw(xtmp3, src, 8, vec_enc);
6217   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6218   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6219   vpsllw(xtmp2, dst, 8, vec_enc);
6220   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6221   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6222   vpsrlw(dst, dst, 8, vec_enc);
6223 }
6224 
6225 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6226                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6227   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
6228   // the biased exponent can be used to compute the leading zero count as per the
6229   // following formula:-
6230   // LZCNT = 31 - (biased_exp - 127), computed below as 32 - ((biased_exp - 127) + 1)
6231   // Special handling has been introduced for Zero, Max_Int and -ve source values.
6232 
6233   // Broadcast 0xFF
6234   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6235   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6236 
6237   // Extract biased exponent.
6238   vcvtdq2ps(dst, src, vec_enc);
6239   vpsrld(dst, dst, 23, vec_enc);
6240   vpand(dst, dst, xtmp1, vec_enc);
6241 
6242   // Broadcast 127.
6243   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6244   // Exponent = biased_exp - 127
6245   vpsubd(dst, dst, xtmp1, vec_enc);
6246 
6247   // Exponent = Exponent  + 1
6248   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6249   vpaddd(dst, dst, xtmp3, vec_enc);
6250 
6251   // Replace -ve exponent with zero, exponent is -ve when src
6252   // lane contains a zero value.
6253   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6254   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6255 
6256   // Rematerialize broadcast 32.
6257   vpslld(xtmp1, xtmp3, 5, vec_enc);
6258   // Exponent is 32 if corresponding source lane contains max_int value.
6259   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6260   // LZCNT = 32 - exponent
6261   vpsubd(dst, xtmp1, dst, vec_enc);
6262 
6263   // Replace LZCNT with a value 1 if corresponding source lane
6264   // contains max_int value.
6265   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6266 
6267   // Replace biased_exp with 0 if source lane value is less than zero.
6268   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6269   vblendvps(dst, dst, xtmp2, src, vec_enc);
6270 }
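     //
     // Illustrative scalar sketch of the exponent trick used above (exact when the int-to-float
     // conversion is exact; zero, MAX_INT and negative inputs receive the special handling shown in the
     // function; the helper name is for exposition only):
     //
     //   #include <cstring>
     //   int clz32_via_float(uint32_t x) {
     //     float f = (float) x;                      // mantissa is 1.m, so exponent == floor(log2(x))
     //     uint32_t bits;
     //     std::memcpy(&bits, &f, sizeof(bits));
     //     int biased_exp = (bits >> 23) & 0xFF;
     //     return 31 - (biased_exp - 127);
     //   }
     //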
6271 
6272 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6273                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6274   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6275   // Add zero counts of lower word and upper word of a double word if
6276   // upper word holds a zero value.
6277   vpsrld(xtmp3, src, 16, vec_enc);
6278   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6279   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6280   vpslld(xtmp2, dst, 16, vec_enc);
6281   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6282   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6283   vpsrld(dst, dst, 16, vec_enc);
6284   // Add zero counts of lower doubleword and upper doubleword of a
6285   // quadword if upper doubleword holds a zero value.
6286   vpsrlq(xtmp3, src, 32, vec_enc);
6287   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6288   vpsllq(xtmp2, dst, 32, vec_enc);
6289   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6290   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6291   vpsrlq(dst, dst, 32, vec_enc);
6292 }
6293 
6294 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6295                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6296                                                        Register rtmp, int vec_enc) {
6297   assert(is_integral_type(bt), "unexpected type");
6298   assert(vec_enc < Assembler::AVX_512bit, "");
6299   switch(bt) {
6300     case T_LONG:
6301       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6302       break;
6303     case T_INT:
6304       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6305       break;
6306     case T_SHORT:
6307       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6308       break;
6309     case T_BYTE:
6310       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6311       break;
6312     default:
6313       fatal("Unsupported type %s", type2name(bt));
6314       break;
6315   }
6316 }
6317 
6318 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6319   switch(bt) {
6320     case T_BYTE:
6321       vpsubb(dst, src1, src2, vec_enc);
6322       break;
6323     case T_SHORT:
6324       vpsubw(dst, src1, src2, vec_enc);
6325       break;
6326     case T_INT:
6327       vpsubd(dst, src1, src2, vec_enc);
6328       break;
6329     case T_LONG:
6330       vpsubq(dst, src1, src2, vec_enc);
6331       break;
6332     default:
6333       fatal("Unsupported type %s", type2name(bt));
6334       break;
6335   }
6336 }
6337 
6338 // Trailing zero count computation is based on the leading zero count operation as per the
6339 // following equation. All AVX3 targets support the AVX512CD feature, which offers a
6340 // direct vector instruction to compute the leading zero count.
6341 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
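     //
     // Illustrative scalar sketch of the same identity (assumes a clz32 helper that, like vplzcntd,
     // returns 32 for a zero input; names are for exposition only):
     //
     //   int ctz32(uint32_t x) {
     //     uint32_t t = (x - 1) & ~x;   // isolates the trailing-zero run as a block of low set bits
     //     return 32 - clz32(t);        // e.g. x = 0b...01000 -> t = 0b00111 -> 32 - 29 = 3
     //   }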
6342 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6343                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6344                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6345   assert(is_integral_type(bt), "");
6346   // xtmp = -1
6347   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6348   // xtmp = xtmp + src
6349   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6350   // xtmp = xtmp & ~src
6351   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6352   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6353   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6354   vpsub(bt, dst, xtmp4, dst, vec_enc);
6355 }
6356 
6357 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
6358 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
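     //
     // Illustrative scalar sketch (x | -x keeps the lowest set bit and every bit above it set;
     // __builtin_popcount is used here purely for exposition):
     //
     //   int ctz32_via_popcnt(uint32_t x) {
     //     return 32 - __builtin_popcount(x | (0u - x));   // x == 0 -> 32 - 0 == 32
     //   }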
6359 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6360                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6361   assert(is_integral_type(bt), "");
6362   // xtmp = 0
6363   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6364   // xtmp = 0 - src
6365   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6366   // xtmp = xtmp | src
6367   vpor(xtmp3, xtmp3, src, vec_enc);
6368   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6369   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6370   vpsub(bt, dst, xtmp1, dst, vec_enc);
6371 }
6372 
6373 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6374   Label done;
6375   Label neg_divisor_fastpath;
6376   cmpl(divisor, 0);
6377   jccb(Assembler::less, neg_divisor_fastpath);
6378   xorl(rdx, rdx);
6379   divl(divisor);
6380   jmpb(done);
6381   bind(neg_divisor_fastpath);
6382   // Fastpath for divisor < 0:
6383   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6384   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6385   movl(rdx, rax);
6386   subl(rdx, divisor);
6387   if (VM_Version::supports_bmi1()) {
6388     andnl(rax, rdx, rax);
6389   } else {
6390     notl(rdx);
6391     andl(rax, rdx);
6392   }
6393   shrl(rax, 31);
6394   bind(done);
6395 }
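     //
     // When the divisor has its sign bit set it is at least 2^31 as an unsigned value, so the unsigned
     // quotient can only be 0 or 1; the bit trick above evaluates dividend >=u divisor without a branch.
     // Illustrative scalar sketch (helper name is hypothetical):
     //
     //   uint32_t udiv_by_negative_divisor(uint32_t dividend, uint32_t divisor) {
     //     return (dividend & ~(dividend - divisor)) >> 31;   // 1 iff dividend >=u divisor
     //   }
     //   // umodI/udivmodI below reuse the same value: remainder = dividend - (quotient ? divisor : 0)
     //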
6396 
6397 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6398   Label done;
6399   Label neg_divisor_fastpath;
6400   cmpl(divisor, 0);
6401   jccb(Assembler::less, neg_divisor_fastpath);
6402   xorl(rdx, rdx);
6403   divl(divisor);
6404   jmpb(done);
6405   bind(neg_divisor_fastpath);
6406   // Fastpath when divisor < 0:
6407   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6408   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6409   movl(rdx, rax);
6410   subl(rax, divisor);
6411   if (VM_Version::supports_bmi1()) {
6412     andnl(rax, rax, rdx);
6413   } else {
6414     notl(rax);
6415     andl(rax, rdx);
6416   }
6417   sarl(rax, 31);
6418   andl(rax, divisor);
6419   subl(rdx, rax);
6420   bind(done);
6421 }
6422 
6423 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6424   Label done;
6425   Label neg_divisor_fastpath;
6426 
6427   cmpl(divisor, 0);
6428   jccb(Assembler::less, neg_divisor_fastpath);
6429   xorl(rdx, rdx);
6430   divl(divisor);
6431   jmpb(done);
6432   bind(neg_divisor_fastpath);
6433   // Fastpath for divisor < 0:
6434   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6435   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6436   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6437   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6438   movl(rdx, rax);
6439   subl(rax, divisor);
6440   if (VM_Version::supports_bmi1()) {
6441     andnl(rax, rax, rdx);
6442   } else {
6443     notl(rax);
6444     andl(rax, rdx);
6445   }
6446   movl(tmp, rax);
6447   shrl(rax, 31); // quotient
6448   sarl(tmp, 31);
6449   andl(tmp, divisor);
6450   subl(rdx, tmp); // remainder
6451   bind(done);
6452 }
6453 
6454 #ifdef _LP64
6455 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6456                                  XMMRegister xtmp2, Register rtmp) {
6457   if (VM_Version::supports_gfni()) {
6458     // Galois field instruction based bit reversal based on following algorithm.
6459     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6460     mov64(rtmp, 0x8040201008040201L);
6461     movq(xtmp1, src);
6462     movq(xtmp2, rtmp);
6463     gf2p8affineqb(xtmp1, xtmp2, 0);
6464     movq(dst, xtmp1);
6465   } else {
6466     // Swap even and odd numbered bits.
6467     movl(rtmp, src);
6468     andl(rtmp, 0x55555555);
6469     shll(rtmp, 1);
6470     movl(dst, src);
6471     andl(dst, 0xAAAAAAAA);
6472     shrl(dst, 1);
6473     orl(dst, rtmp);
6474 
6475     // Swap LSB and MSB 2 bits of each nibble.
6476     movl(rtmp, dst);
6477     andl(rtmp, 0x33333333);
6478     shll(rtmp, 2);
6479     andl(dst, 0xCCCCCCCC);
6480     shrl(dst, 2);
6481     orl(dst, rtmp);
6482 
6483     // Swap LSB and MSB 4 bits of each byte.
6484     movl(rtmp, dst);
6485     andl(rtmp, 0x0F0F0F0F);
6486     shll(rtmp, 4);
6487     andl(dst, 0xF0F0F0F0);
6488     shrl(dst, 4);
6489     orl(dst, rtmp);
6490   }
6491   bswapl(dst);
6492 }
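     //
     // Illustrative scalar sketch of the non-GFNI path above (the GFNI path performs the whole in-byte
     // reversal with a single affine transform; __builtin_bswap32 stands in for the final bswapl):
     //
     //   uint32_t reverse_bits32(uint32_t x) {
     //     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);   // swap adjacent bits
     //     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);   // swap 2-bit pairs in each nibble
     //     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);   // swap nibbles in each byte
     //     return __builtin_bswap32(x);                             // finally swap the bytes
     //   }
     //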
6493 
6494 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6495                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6496   if (VM_Version::supports_gfni()) {
6497     // Galois field instruction based bit reversal based on following algorithm.
6498     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6499     mov64(rtmp1, 0x8040201008040201L);
6500     movq(xtmp1, src);
6501     movq(xtmp2, rtmp1);
6502     gf2p8affineqb(xtmp1, xtmp2, 0);
6503     movq(dst, xtmp1);
6504   } else {
6505     // Swap even and odd numbered bits.
6506     movq(rtmp1, src);
6507     mov64(rtmp2, 0x5555555555555555L);
6508     andq(rtmp1, rtmp2);
6509     shlq(rtmp1, 1);
6510     movq(dst, src);
6511     notq(rtmp2);
6512     andq(dst, rtmp2);
6513     shrq(dst, 1);
6514     orq(dst, rtmp1);
6515 
6516     // Swap LSB and MSB 2 bits of each nibble.
6517     movq(rtmp1, dst);
6518     mov64(rtmp2, 0x3333333333333333L);
6519     andq(rtmp1, rtmp2);
6520     shlq(rtmp1, 2);
6521     notq(rtmp2);
6522     andq(dst, rtmp2);
6523     shrq(dst, 2);
6524     orq(dst, rtmp1);
6525 
6526     // Swap LSB and MSB 4 bits of each byte.
6527     movq(rtmp1, dst);
6528     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6529     andq(rtmp1, rtmp2);
6530     shlq(rtmp1, 4);
6531     notq(rtmp2);
6532     andq(dst, rtmp2);
6533     shrq(dst, 4);
6534     orq(dst, rtmp1);
6535   }
6536   bswapq(dst);
6537 }
6538 
6539 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6540   Label done;
6541   Label neg_divisor_fastpath;
6542   cmpq(divisor, 0);
6543   jccb(Assembler::less, neg_divisor_fastpath);
6544   xorl(rdx, rdx);
6545   divq(divisor);
6546   jmpb(done);
6547   bind(neg_divisor_fastpath);
6548   // Fastpath for divisor < 0:
6549   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6550   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6551   movq(rdx, rax);
6552   subq(rdx, divisor);
6553   if (VM_Version::supports_bmi1()) {
6554     andnq(rax, rdx, rax);
6555   } else {
6556     notq(rdx);
6557     andq(rax, rdx);
6558   }
6559   shrq(rax, 63);
6560   bind(done);
6561 }
6562 
6563 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6564   Label done;
6565   Label neg_divisor_fastpath;
6566   cmpq(divisor, 0);
6567   jccb(Assembler::less, neg_divisor_fastpath);
6568   xorq(rdx, rdx);
6569   divq(divisor);
6570   jmp(done);
6571   bind(neg_divisor_fastpath);
6572   // Fastpath when divisor < 0:
6573   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6574   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6575   movq(rdx, rax);
6576   subq(rax, divisor);
6577   if (VM_Version::supports_bmi1()) {
6578     andnq(rax, rax, rdx);
6579   } else {
6580     notq(rax);
6581     andq(rax, rdx);
6582   }
6583   sarq(rax, 63);
6584   andq(rax, divisor);
6585   subq(rdx, rax);
6586   bind(done);
6587 }
6588 
6589 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6590   Label done;
6591   Label neg_divisor_fastpath;
6592   cmpq(divisor, 0);
6593   jccb(Assembler::less, neg_divisor_fastpath);
6594   xorq(rdx, rdx);
6595   divq(divisor);
6596   jmp(done);
6597   bind(neg_divisor_fastpath);
6598   // Fastpath for divisor < 0:
6599   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6600   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6601   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6602   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6603   movq(rdx, rax);
6604   subq(rax, divisor);
6605   if (VM_Version::supports_bmi1()) {
6606     andnq(rax, rax, rdx);
6607   } else {
6608     notq(rax);
6609     andq(rax, rdx);
6610   }
6611   movq(tmp, rax);
6612   shrq(rax, 63); // quotient
6613   sarq(tmp, 63);
6614   andq(tmp, divisor);
6615   subq(rdx, tmp); // remainder
6616   bind(done);
6617 }
6618 #endif
6619 
6620 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6621                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6622                                         int vlen_enc) {
6623   assert(VM_Version::supports_avx512bw(), "");
6624   // Byte shuffles are in-lane operations and indices are determined using
6625   // the lower 4 bits of each shuffle byte, thus all shuffle indices are
6626   // effectively normalized to the index range 0-15. This makes sure that all
6627   // multiples of 16 map to the same relative position within a 128 bit
6628   // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
6629   // all select the 0th element of their respective 128 bit lanes.
6630   movl(rtmp, 16);
6631   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6632 
6633   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6634   // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
6635   // original shuffle indices and move the shuffled lanes corresponding to a true
6636   // mask to the destination vector.
6637   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6638   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6639   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6640 
6641   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6642   // and broadcasting second 128 bit lane.
6643   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6644   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6645   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6646   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6647   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6648 
6649   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6650   // and broadcasting third 128 bit lane.
6651   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6652   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6653   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6654   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6655   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6656 
6657   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6658   // and broadcasting fourth 128 bit lane.
6659   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6660   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6661   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6662   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6663   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6664 }
6665 
6666 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6667                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6668   if (vlen_enc == AVX_128bit) {
6669     vpermilps(dst, src, shuffle, vlen_enc);
6670   } else if (bt == T_INT) {
6671     vpermd(dst, shuffle, src, vlen_enc);
6672   } else {
6673     assert(bt == T_FLOAT, "");
6674     vpermps(dst, shuffle, src, vlen_enc);
6675   }
6676 }
6677 
6678 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6679   switch(opcode) {
6680     case Op_AddHF: vaddsh(dst, src1, src2); break;
6681     case Op_SubHF: vsubsh(dst, src1, src2); break;
6682     case Op_MulHF: vmulsh(dst, src1, src2); break;
6683     case Op_DivHF: vdivsh(dst, src1, src2); break;
6684     case Op_MaxHF: vmaxsh(dst, src1, src2); break;
6685     case Op_MinHF: vminsh(dst, src1, src2); break;
6686     default: assert(false, "%s", NodeClassNames[opcode]); break;
6687   }
6688 }
6689 
6690 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6691   switch(elem_bt) {
6692     case T_BYTE:
6693       if (ideal_opc == Op_SaturatingAddV) {
6694         vpaddsb(dst, src1, src2, vlen_enc);
6695       } else {
6696         assert(ideal_opc == Op_SaturatingSubV, "");
6697         vpsubsb(dst, src1, src2, vlen_enc);
6698       }
6699       break;
6700     case T_SHORT:
6701       if (ideal_opc == Op_SaturatingAddV) {
6702         vpaddsw(dst, src1, src2, vlen_enc);
6703       } else {
6704         assert(ideal_opc == Op_SaturatingSubV, "");
6705         vpsubsw(dst, src1, src2, vlen_enc);
6706       }
6707       break;
6708     default:
6709       fatal("Unsupported type %s", type2name(elem_bt));
6710       break;
6711   }
6712 }
6713 
6714 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6715   switch(elem_bt) {
6716     case T_BYTE:
6717       if (ideal_opc == Op_SaturatingAddV) {
6718         vpaddusb(dst, src1, src2, vlen_enc);
6719       } else {
6720         assert(ideal_opc == Op_SaturatingSubV, "");
6721         vpsubusb(dst, src1, src2, vlen_enc);
6722       }
6723       break;
6724     case T_SHORT:
6725       if (ideal_opc == Op_SaturatingAddV) {
6726         vpaddusw(dst, src1, src2, vlen_enc);
6727       } else {
6728         assert(ideal_opc == Op_SaturatingSubV, "");
6729         vpsubusw(dst, src1, src2, vlen_enc);
6730       }
6731       break;
6732     default:
6733       fatal("Unsupported type %s", type2name(elem_bt));
6734       break;
6735   }
6736 }
6737 
6738 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6739                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6740   // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6741   // overflow_mask = Inp1 <u Inp2
6742   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6743   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6744   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6745 }
6746 
6747 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6748                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6749   // Emulate unsigned comparison using signed comparison
6750   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6751   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6752   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6753   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6754 
6755   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6756 
6757   // Res = INP1 - INP2 (non-commutative and non-associative)
6758   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6759   // Res = Mask ? Zero : Res
6760   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6761   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6762 }
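     //
     // The MIN_VALUE bias above emulates an unsigned compare with the signed vpcmpgt: adding (or,
     // equivalently, XORing) the sign bit maps the unsigned order onto the signed order. Illustrative
     // scalar sketch (helper name is hypothetical):
     //
     //   bool lt_unsigned_via_signed(uint32_t a, uint32_t b) {
     //     return (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);   // == a <u b
     //   }
     //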
6763 
6764 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6765                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6766   // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6767   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6768   // Res = Signed Add INP1, INP2
6769   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6770   // T1 = SRC1 | SRC2
6771   vpor(xtmp1, src1, src2, vlen_enc);
6772   // Max_Unsigned = -1
6773   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6774   // Unsigned compare:  Mask = Res <u T1
6775   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6776   // res  = Mask ? Max_Unsigned : Res
6777   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6778 }
6779 
6780 //
6781 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6782 // unsigned addition operation.
6783 //    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6784 //
6785 // We empirically determined its semantic equivalence to the following reduced expression
6786 //    overflow_mask =  (a + b) <u (a | b)
6787 //
6788 // and also verified it through the Alive2 solver.
6789 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6790 //
6791 
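     // Illustrative scalar sketch of the reduced check (helper name is hypothetical):
     //
     //   uint32_t saturating_add_u32(uint32_t a, uint32_t b) {
     //     uint32_t sum = a + b;                        // wrapping add
     //     return (sum < (a | b)) ? 0xFFFFFFFFu : sum;  // overflow -> clamp to the unsigned maximum
     //   }
     //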
6792 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6793                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6794   // Res = Signed Add INP1, INP2
6795   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6796   // Compute T1 = INP1 | INP2
6797   vpor(xtmp3, src1, src2, vlen_enc);
6798   // xtmp2 = Minimum signed value; the helper also leaves -1 (unsigned max) in every lane of xtmp1.
6799   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6800   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6801   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6802   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6803   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6804   // Compute overflow detection mask = Res<s> <s T1  (i.e. Res <u T1)
6805   if (elem_bt == T_INT) {
6806     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6807   } else {
6808     assert(elem_bt == T_LONG, "");
6809     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6810   }
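       // Res = overflow_mask ? Max_Unsigned (-1 held in xtmp1) : Res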
6811   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6812 }
6813 
6814 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6815                                       int vlen_enc, bool xtmp2_hold_M1) {
6816   if (VM_Version::supports_avx512dq()) {
6817     evpmovq2m(ktmp, src, vlen_enc);
6818   } else {
6819     assert(VM_Version::supports_evex(), "");
6820     if (!xtmp2_hold_M1) {
6821       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6822     }
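         // Broadcast the sign bit across every bit of each lane (xtmp1 = src >>s 63), then set
         // ktmp bits for lanes equal to -1, i.e. lanes whose sign bit was set.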
6823     evpsraq(xtmp1, src, 63, vlen_enc);
6824     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6825   }
6826 }
6827 
6828 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6829                                       int vlen_enc, bool xtmp2_hold_M1) {
6830   if (VM_Version::supports_avx512dq()) {
6831     evpmovd2m(ktmp, src, vlen_enc);
6832   } else {
6833     assert(VM_Version::supports_evex(), "");
6834     if (!xtmp2_hold_M1) {
6835       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6836     }
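         // Broadcast the sign bit across every bit of each lane (xtmp1 = src >>s 31), then set
         // ktmp bits for lanes equal to -1, i.e. lanes whose sign bit was set.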
6837     vpsrad(xtmp1, src, 31, vlen_enc);
6838     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6839   }
6840 }
6841 
6842 
6843 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6844   if (elem_bt == T_LONG) {
6845     if (VM_Version::supports_evex()) {
6846       evpsraq(dst, src, 63, vlen_enc);
6847     } else {
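           // No 64-bit arithmetic right shift without EVEX: shift every dword right by 31, then
           // replicate the upper (sign) dword of each quadword into both halves (shuffle 0xF5).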
6848       vpsrad(dst, src, 31, vlen_enc);
6849       vpshufd(dst, dst, 0xF5, vlen_enc);
6850     }
6851   } else {
6852     assert(elem_bt == T_INT, "");
6853     vpsrad(dst, src, 31, vlen_enc);
6854   }
6855 }
6856 
6857 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6858   if (compute_allones) {
6859     if (vlen_enc == Assembler::AVX_512bit) {
6860       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6861     } else {
6862       vpcmpeqq(allones, allones, allones, vlen_enc);
6863     }
6864   }
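       // MAX_VALUE = all-ones >>> 1, i.e. 0x7FFF...FF in every lane.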
6865   if (elem_bt == T_LONG) {
6866     vpsrlq(dst, allones, 1, vlen_enc);
6867   } else {
6868     assert(elem_bt == T_INT, "");
6869     vpsrld(dst, allones, 1, vlen_enc);
6870   }
6871 }
6872 
6873 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6874   if (compute_allones) {
6875     if (vlen_enc == Assembler::AVX_512bit) {
6876       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6877     } else {
6878       vpcmpeqq(allones, allones, allones, vlen_enc);
6879     }
6880   }
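       // MIN_VALUE = all-ones << (lane bits - 1), i.e. 0x8000...00 in every lane.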
6881   if (elem_bt == T_LONG) {
6882     vpsllq(dst, allones, 63, vlen_enc);
6883   } else {
6884     assert(elem_bt == T_INT, "");
6885     vpslld(dst, allones, 31, vlen_enc);
6886   }
6887 }
6888 
6889 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6890                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6891   switch(elem_bt) {
6892     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6893     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6894     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6895     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6896     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6897   }
6898 }
6899 
6900 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6901   switch(elem_bt) {
6902     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6903     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6904     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6905     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6906     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6907   }
6908 }
6909 
6910 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6911                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6912   if (elem_bt == T_LONG) {
6913     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6914   } else {
6915     assert(elem_bt == T_INT, "");
6916     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6917   }
6918 }
6919 
6920 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6921                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6922                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6923   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6924   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6925   // Overflow detection is based on Hacker's Delight, section 2-13.
6926   if (ideal_opc == Op_SaturatingAddV) {
6927     // res = src1 + src2
6928     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6929     // Overflow occurs when both inputs have the same sign and the result's sign differs from theirs.
6930     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6931     vpxor(xtmp1, dst, src1, vlen_enc);
6932     vpxor(xtmp2, dst, src2, vlen_enc);
6933     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6934   } else {
6935     assert(ideal_opc == Op_SaturatingSubV, "");
6936     // res = src1 - src2
6937     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6938     // Overflow occurs when the inputs have opposite signs and the
6939     // result's sign differs from that of the first input.
6940     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6941     vpxor(xtmp1, src1, src2, vlen_enc);
6942     vpxor(xtmp2, dst, src1, vlen_enc);
6943     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6944   }
6945 
6946   // Compute overflow detection mask.
6947   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6948   // Note: xtmp1 holds -1 in all lanes after the call above.
6949 
6950   // Compute mask based on first input polarity.
6951   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6952 
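       // Generate per-lane saturating constants: xtmp2 = MAX_VALUE, xtmp1 = MIN_VALUE.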
6953   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6954   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6955 
6956   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6957   // set bits in the first-input polarity mask hold the MIN value.
6958   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6959   // Blend destination lanes with saturated values using overflow detection mask.
6960   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6961 }
6962 
6963 
6964 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6965                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6966                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6967   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6968   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6969   // Overflow detection is based on Hacker's Delight, section 2-13.
6970   if (ideal_opc == Op_SaturatingAddV) {
6971     // res = src1 + src2
6972     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6973     // Overflow occurs when both inputs have the same sign and the result's sign differs from theirs.
6974     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6975     vpxor(xtmp1, dst, src1, vlen_enc);
6976     vpxor(xtmp2, dst, src2, vlen_enc);
6977     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6978   } else {
6979     assert(ideal_opc == Op_SaturatingSubV, "");
6980     // res = src1 - src2
6981     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6982     // Overflow occurs when the inputs have opposite signs and the
6983     // result's sign differs from that of the first input.
6984     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6985     vpxor(xtmp1, src1, src2, vlen_enc);
6986     vpxor(xtmp2, dst, src1, vlen_enc);
6987     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6988   }
6989 
6990   // Sign-extend to compute overflow detection mask.
6991   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6992 
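       // xtmp1 = -1 (all bits set) in every lane; consumed by the MAX/MIN value generators below.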
6993   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6994   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6995   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6996 
6997   // Compose saturating min/max vector using first input polarity mask.
6998   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6999   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7000 
7001   // Blend result with saturating vector using overflow detection mask.
7002   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7003 }
7004 
7005 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7006   switch(elem_bt) {
7007     case T_BYTE:
7008       if (ideal_opc == Op_SaturatingAddV) {
7009         vpaddsb(dst, src1, src2, vlen_enc);
7010       } else {
7011         assert(ideal_opc == Op_SaturatingSubV, "");
7012         vpsubsb(dst, src1, src2, vlen_enc);
7013       }
7014       break;
7015     case T_SHORT:
7016       if (ideal_opc == Op_SaturatingAddV) {
7017         vpaddsw(dst, src1, src2, vlen_enc);
7018       } else {
7019         assert(ideal_opc == Op_SaturatingSubV, "");
7020         vpsubsw(dst, src1, src2, vlen_enc);
7021       }
7022       break;
7023     default:
7024       fatal("Unsupported type %s", type2name(elem_bt));
7025       break;
7026   }
7027 }
7028 
7029 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7030   switch(elem_bt) {
7031     case T_BYTE:
7032       if (ideal_opc == Op_SaturatingAddV) {
7033         vpaddusb(dst, src1, src2, vlen_enc);
7034       } else {
7035         assert(ideal_opc == Op_SaturatingSubV, "");
7036         vpsubusb(dst, src1, src2, vlen_enc);
7037       }
7038       break;
7039     case T_SHORT:
7040       if (ideal_opc == Op_SaturatingAddV) {
7041         vpaddusw(dst, src1, src2, vlen_enc);
7042       } else {
7043         assert(ideal_opc == Op_SaturatingSubV, "");
7044         vpsubusw(dst, src1, src2, vlen_enc);
7045       }
7046       break;
7047     default:
7048       fatal("Unsupported type %s", type2name(elem_bt));
7049       break;
7050   }
7051 }
7052 
7053 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7054                                                      XMMRegister src2, int vlen_enc) {
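       // dst holds the selection indices on entry; evpermi2* overwrites it with elements picked
       // from the two-table concatenation of src1 and src2 according to those indices.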
7055   switch(elem_bt) {
7056     case T_BYTE:
7057       evpermi2b(dst, src1, src2, vlen_enc);
7058       break;
7059     case T_SHORT:
7060       evpermi2w(dst, src1, src2, vlen_enc);
7061       break;
7062     case T_INT:
7063       evpermi2d(dst, src1, src2, vlen_enc);
7064       break;
7065     case T_LONG:
7066       evpermi2q(dst, src1, src2, vlen_enc);
7067       break;
7068     case T_FLOAT:
7069       evpermi2ps(dst, src1, src2, vlen_enc);
7070       break;
7071     case T_DOUBLE:
7072       evpermi2pd(dst, src1, src2, vlen_enc);
7073       break;
7074     default:
7075       fatal("Unsupported type %s", type2name(elem_bt));
7076       break;
7077   }
7078 }
7079 
7080 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7081   if (is_unsigned) {
7082     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7083   } else {
7084     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7085   }
7086 }
7087 
7088 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7089   if (is_unsigned) {
7090     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7091   } else {
7092     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7093   }
7094 }