1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
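  //
  // Frame shape, in outline: the prolog below leaves the return address and the
  // saved rbp at the top of the frame and lowers rsp by the remaining framesize,
  // keeping rsp StackAlignmentInBytes-aligned (re-checked under VerifyStackAtCalls).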
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
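// Map a vector length in bytes to the AVX length encoding used by the assembler;
// sub-16-byte vectors are emitted with the 128-bit encoding.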
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
 227 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
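//
// In outline, the LM_LEGACY fast path emitted below is roughly:
//   tmp = obj->mark();
//   if (tmp & monitor_value) goto IsInflated;                      // already inflated
//   box->displaced_header = tmp | unlocked_value;                  // anticipate success
//   if (CAS(&obj->mark, tmp | unlocked_value, box)) goto COUNT;    // stack-lock acquired
//   otherwise test sp-proximity of the mark to detect a recursive stack-lock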
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 281     jcc(Assembler::notZero, DONE_LABEL);
 282   }
 283 
 284   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 285   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 286   jcc(Assembler::notZero, IsInflated);
 287 
 288   if (LockingMode == LM_MONITOR) {
 289     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 290     testptr(objReg, objReg);
 291   } else {
 292     assert(LockingMode == LM_LEGACY, "must be");
 293     // Attempt stack-locking ...
 294     orptr (tmpReg, markWord::unlocked_value);
 295     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 296     lock();
 297     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 298     jcc(Assembler::equal, COUNT);           // Success
 299 
 300     // Recursive locking.
 301     // The object is stack-locked: markword contains stack pointer to BasicLock.
 302     // Locked by current thread if difference with current SP is less than one page.
 303     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 305     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 306     movptr(Address(boxReg, 0), tmpReg);
 307   }
 308   jmp(DONE_LABEL);
 309 
 310   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 312 
 313 #ifndef _LP64
 314   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 315   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 316 #else
 317   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 318   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 319   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 320 
 321   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 322   movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
 323   movq(scrReg, tmpReg);
 324   xorq(tmpReg, tmpReg);
 325   lock();
 326   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 327 
 328   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 329   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 330 
 331   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 332   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 333   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 334   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 335 #endif // _LP64
 336   bind(DONE_LABEL);
 337 
 338   // ZFlag == 1 count in fast path
 339   // ZFlag == 0 count in slow path
 340   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 341 
 342   bind(COUNT);
 343   if (LockingMode == LM_LEGACY) {
 344 #ifdef _LP64
 345     // Count monitors in fast path
 346     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 347 #endif
 348   }
 349   xorl(tmpReg, tmpReg); // Set ZF == 1
 350 
 351   bind(NO_COUNT);
 352 
 353   // At NO_COUNT the icc ZFlag is set as follows ...
 354   // fast_unlock uses the same protocol.
 355   // ZFlag == 1 -> Success
 356   // ZFlag == 0 -> Failure - force control through the slow path
 357 }
 358 
 359 // obj: object to unlock
 360 // box: box address (displaced header location), killed.  Must be EAX.
 361 // tmp: killed, cannot be obj nor box.
 362 //
 363 // Some commentary on balanced locking:
 364 //
 365 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 366 // Methods that don't have provably balanced locking are forced to run in the
 367 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 368 // The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 375 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 377 // B() doesn't have provably balanced locking so it runs in the interpreter.
 378 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 379 // is still locked by A().
 380 //
 381 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 382 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 383 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 384 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our
// implementation could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 388 // A perfectly viable alternative is to elide the owner check except when
 389 // Xcheck:jni is enabled.
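//
// In outline, the inflated (LP64) exit path below is roughly:
//   if (m->recursions != 0) { m->recursions--; succeed; }
//   m->owner = nullptr;                                   // release the lock
//   StoreLoad fence;                                      // avoid stranding a successor
//   if ((EntryList|cxq) == 0 || succ != null) succeed;
//   otherwise record the monitor in the thread and fail into the slow path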
 390 
 391 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 392   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 393   assert(boxReg == rax, "");
 394   assert_different_registers(objReg, boxReg, tmpReg);
 395 
 396   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 397 
 398   if (LockingMode == LM_LEGACY) {
 399     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 400     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 401   }
 402   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 403   if (LockingMode != LM_MONITOR) {
 404     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 405     jcc(Assembler::zero, Stacked);
 406   }
 407 
 408   // It's inflated.
 409 
 410 #ifndef _LP64
 411   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 412   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 413   jmpb(DONE_LABEL);
 414 #else
 415   // Despite our balanced locking property we still check that m->_owner == Self
 416   // as java routines or native JNI code called by this thread might
 417   // have released the lock.
 418   // Refer to the comments in synchronizer.cpp for how we might encode extra
 419   // state in _succ so we can avoid fetching EntryList|cxq.
 420   //
 421   // If there's no contention try a 1-0 exit.  That is, exit without
 422   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 423   // we detect and recover from the race that the 1-0 exit admits.
 424   //
 425   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 426   // before it STs null into _owner, releasing the lock.  Updates
 427   // to data protected by the critical section must be visible before
 428   // we drop the lock (and thus before any other thread could acquire
 429   // the lock and observe the fields protected by the lock).
 430   // IA32's memory-model is SPO, so STs are ordered with respect to
 431   // each other and there's no need for an explicit barrier (fence).
 432   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 433   Label LSuccess, LNotRecursive;
 434 
 435   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 436   jccb(Assembler::equal, LNotRecursive);
 437 
 438   // Recursive inflated unlock
 439   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 440   jmpb(LSuccess);
 441 
 442   bind(LNotRecursive);
 443 
 444   // Set owner to null.
 445   // Release to satisfy the JMM
 446   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 447   // We need a full fence after clearing owner to avoid stranding.
 448   // StoreLoad achieves this.
 449   membar(StoreLoad);
 450 
 451   // Check if the entry lists are empty (EntryList first - by convention).
 452   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 453   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 454   jccb(Assembler::zero, LSuccess);    // If so we are done.
 455 
 456   // Check if there is a successor.
 457   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 458   jccb(Assembler::notZero, LSuccess); // If so we are done.
 459 
 460   // Save the monitor pointer in the current thread, so we can try to
 461   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 462   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 463   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 464 
 465   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 466   jmpb  (DONE_LABEL);
 467 
 468   bind  (LSuccess);
 469   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 470   jmpb  (DONE_LABEL);
 471 #endif  // _LP64
 472 
 473   if (LockingMode == LM_LEGACY) {
 474     bind  (Stacked);
 475     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 476     lock();
 477     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 478     // Intentional fall-thru into DONE_LABEL
 479   }
 480 
 481   bind(DONE_LABEL);
 482 
 483   // ZFlag == 1 count in fast path
 484   // ZFlag == 0 count in slow path
 485   jccb(Assembler::notZero, NO_COUNT);
 486 
 487   bind(COUNT);
 488 
 489   if (LockingMode == LM_LEGACY) {
 490     // Count monitors in fast path
 491 #ifdef _LP64
 492     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 493 #endif
 494   }
 495 
 496   xorl(tmpReg, tmpReg); // Set ZF == 1
 497 
 498   bind(NO_COUNT);
 499 }
 500 
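// Lightweight locking fast path (LM_LIGHTWEIGHT).
// obj:     object to lock
// box:     on-stack BasicLock; caches the ObjectMonitor* when UseObjectMonitorTable - clobbered
// rax_reg: must be rax, used for CAS - clobbered
// t:       temp - clobbered
// thread:  current thread
// On exit ZF == 1 indicates success, ZF == 0 forces control through the slow path.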
 501 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 502                                               Register t, Register thread) {
 503   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 504   assert(rax_reg == rax, "Used for CAS");
 505   assert_different_registers(obj, box, rax_reg, t, thread);
 506 
 507   // Handle inflated monitor.
 508   Label inflated;
 509   // Finish fast lock successfully. ZF value is irrelevant.
 510   Label locked;
 511   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 512   Label slow_path;
 513 
 514   if (UseObjectMonitorTable) {
 515     // Clear cache in case fast locking succeeds.
 516     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 517   }
 518 
 519   if (DiagnoseSyncOnValueBasedClasses != 0) {
 520     load_klass(rax_reg, obj, t);
 521     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 522     jcc(Assembler::notZero, slow_path);
 523   }
 524 
 525   const Register mark = t;
 526 
 527   { // Lightweight Lock
 528 
 529     Label push;
 530 
 531     const Register top = UseObjectMonitorTable ? rax_reg : box;
 532 
 533     // Load the mark.
 534     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 535 
 536     // Prefetch top.
 537     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 538 
 539     // Check for monitor (0b10).
 540     testptr(mark, markWord::monitor_value);
 541     jcc(Assembler::notZero, inflated);
 542 
 543     // Check if lock-stack is full.
 544     cmpl(top, LockStack::end_offset() - 1);
 545     jcc(Assembler::greater, slow_path);
 546 
 547     // Check if recursive.
 548     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 549     jccb(Assembler::equal, push);
 550 
 551     // Try to lock. Transition lock bits 0b01 => 0b00
 552     movptr(rax_reg, mark);
 553     orptr(rax_reg, markWord::unlocked_value);
 554     andptr(mark, ~(int32_t)markWord::unlocked_value);
 555     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 556     jcc(Assembler::notEqual, slow_path);
 557 
 558     if (UseObjectMonitorTable) {
 559       // Need to reload top, clobbered by CAS.
 560       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 561     }
 562     bind(push);
 563     // After successful lock, push object on lock-stack.
 564     movptr(Address(thread, top), obj);
 565     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 566     jmpb(locked);
 567   }
 568 
 569   { // Handle inflated monitor.
 570     bind(inflated);
 571 
 572 #ifndef _LP64
 573     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 574     orl(box, 1);  // set ICC.ZF=0 to indicate failure
 575     jmpb(slow_path);
 576 #else
 577     const Register monitor = t;
 578 
 579     if (!UseObjectMonitorTable) {
 580       assert(mark == monitor, "should be the same here");
 581     } else {
 582       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 583       // Fetch ObjectMonitor* from the cache or take the slow-path.
 584       Label monitor_found;
 585 
 586       // Load cache address
 587       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 588 
 589       const int num_unrolled = 2;
 590       for (int i = 0; i < num_unrolled; i++) {
 591         cmpptr(obj, Address(t));
 592         jccb(Assembler::equal, monitor_found);
 593         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 594       }
 595 
 596       Label loop;
 597 
 598       // Search for obj in cache.
 599       bind(loop);
 600 
 601       // Check for match.
 602       cmpptr(obj, Address(t));
 603       jccb(Assembler::equal, monitor_found);
 604 
 605       // Search until null encountered, guaranteed _null_sentinel at end.
 606       cmpptr(Address(t), 1);
 607       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 608       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 609       jmpb(loop);
 610 
 611       // Cache hit.
 612       bind(monitor_found);
 613       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 614     }
 615     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 616     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 617     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 618 
 619     Label monitor_locked;
 620     // Lock the monitor.
 621 
 622     if (UseObjectMonitorTable) {
 623       // Cache the monitor for unlock before trashing box. On failure to acquire
 624       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 625       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 626     }
 627 
 628     // Try to CAS owner (no owner => current thread's _lock_id).
 629     xorptr(rax_reg, rax_reg);
 630     movptr(box, Address(thread, JavaThread::lock_id_offset()));
 631     lock(); cmpxchgptr(box, owner_address);
 632     jccb(Assembler::equal, monitor_locked);
 633 
 634     // Check if recursive.
 635     cmpptr(box, rax_reg);
 636     jccb(Assembler::notEqual, slow_path);
 637 
 638     // Recursive.
 639     increment(recursions_address);
 640 
 641     bind(monitor_locked);
 642 #endif  // _LP64
 643   }
 644 
 645   bind(locked);
 646   // Set ZF = 1
 647   xorl(rax_reg, rax_reg);
 648 
 649 #ifdef ASSERT
 650   // Check that locked label is reached with ZF set.
 651   Label zf_correct;
 652   Label zf_bad_zero;
 653   jcc(Assembler::zero, zf_correct);
 654   jmp(zf_bad_zero);
 655 #endif
 656 
 657   bind(slow_path);
 658 #ifdef ASSERT
 659   // Check that slow_path label is reached with ZF not set.
 660   jcc(Assembler::notZero, zf_correct);
 661   stop("Fast Lock ZF != 0");
 662   bind(zf_bad_zero);
 663   stop("Fast Lock ZF != 1");
 664   bind(zf_correct);
 665 #endif
 666   // C2 uses the value of ZF to determine the continuation.
 667 }
 668 
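// Lightweight unlocking fast path (LM_LIGHTWEIGHT).
// obj:     object to unlock
// reg_rax: must be rax, used for CAS; also serves as box (and as top when the
//          ObjectMonitorTable is not used) - clobbered
// t:       temp holding the mark/monitor - clobbered
// thread:  current thread
// On exit ZF == 1 indicates success, ZF == 0 forces control through the slow path.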
 669 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 670   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 671   assert(reg_rax == rax, "Used for CAS");
 672   assert_different_registers(obj, reg_rax, t);
 673 
 674   // Handle inflated monitor.
 675   Label inflated, inflated_check_lock_stack;
 676   // Finish fast unlock successfully.  MUST jump with ZF == 1
 677   Label unlocked, slow_path;
 678 
 679   const Register mark = t;
 680   const Register monitor = t;
 681   const Register top = UseObjectMonitorTable ? t : reg_rax;
 682   const Register box = reg_rax;
 683 
 684   Label dummy;
 685   C2FastUnlockLightweightStub* stub = nullptr;
 686 
 687   if (!Compile::current()->output()->in_scratch_emit_size()) {
 688     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 689     Compile::current()->output()->add_stub(stub);
 690   }
 691 
 692   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 693 
 694   { // Lightweight Unlock
 695 
 696     // Load top.
 697     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 698 
 699     if (!UseObjectMonitorTable) {
 700       // Prefetch mark.
 701       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 702     }
 703 
 704     // Check if obj is top of lock-stack.
 705     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 706     // Top of lock stack was not obj. Must be monitor.
 707     jcc(Assembler::notEqual, inflated_check_lock_stack);
 708 
 709     // Pop lock-stack.
 710     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 711     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 712 
 713     // Check if recursive.
 714     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 715     jcc(Assembler::equal, unlocked);
 716 
 717     // We elide the monitor check, let the CAS fail instead.
 718 
 719     if (UseObjectMonitorTable) {
 720       // Load mark.
 721       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 722     }
 723 
 724     // Try to unlock. Transition lock bits 0b00 => 0b01
 725     movptr(reg_rax, mark);
 726     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 727     orptr(mark, markWord::unlocked_value);
 728     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 729     jcc(Assembler::notEqual, push_and_slow_path);
 730     jmp(unlocked);
 731   }
 732 
 733 
 734   { // Handle inflated monitor.
 735     bind(inflated_check_lock_stack);
 736 #ifdef ASSERT
 737     Label check_done;
 738     subl(top, oopSize);
 739     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 740     jcc(Assembler::below, check_done);
 741     cmpptr(obj, Address(thread, top));
 742     jccb(Assembler::notEqual, inflated_check_lock_stack);
 743     stop("Fast Unlock lock on stack");
 744     bind(check_done);
 745     if (UseObjectMonitorTable) {
 746       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 747     }
 748     testptr(mark, markWord::monitor_value);
 749     jccb(Assembler::notZero, inflated);
 750     stop("Fast Unlock not monitor");
 751 #endif
 752 
 753     bind(inflated);
 754 
 755 #ifndef _LP64
 756     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 757     orl(t, 1);  // set ICC.ZF=0 to indicate failure
 758     jmpb(slow_path);
 759 #else
 760     if (!UseObjectMonitorTable) {
 761       assert(mark == monitor, "should be the same here");
 762     } else {
 763       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 764       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 765       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 766       cmpptr(monitor, alignof(ObjectMonitor*));
 767       jcc(Assembler::below, slow_path);
 768     }
 769     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 770     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 771     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 772     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 773     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 774     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 775 
 776     Label recursive;
 777 
 778     // Check if recursive.
 779     cmpptr(recursions_address, 0);
 780     jccb(Assembler::notZero, recursive);
 781 
 782     // Set owner to null.
 783     // Release to satisfy the JMM
 784     movptr(owner_address, NULL_WORD);
 785     // We need a full fence after clearing owner to avoid stranding.
 786     // StoreLoad achieves this.
 787     membar(StoreLoad);
 788 
 789     // Check if the entry lists are empty (EntryList first - by convention).
 790     movptr(reg_rax, EntryList_address);
 791     orptr(reg_rax, cxq_address);
 792     jccb(Assembler::zero, unlocked);    // If so we are done.
 793 
 794     // Check if there is a successor.
 795     cmpptr(succ_address, NULL_WORD);
 796     jccb(Assembler::notZero, unlocked); // If so we are done.
 797 
 798     // Save the monitor pointer in the current thread, so we can try to
 799     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 800     if (!UseObjectMonitorTable) {
 801       andptr(monitor, ~(int32_t)markWord::monitor_value);
 802     }
 803     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 804 
 805     orl(t, 1); // Fast Unlock ZF = 0
 806     jmpb(slow_path);
 807 
 808     // Recursive unlock.
 809     bind(recursive);
 810     decrement(recursions_address);
 811 #endif  // _LP64
 812   }
 813 
 814   bind(unlocked);
 815   xorl(t, t); // Fast Unlock ZF = 1
 816 
 817 #ifdef ASSERT
 818   // Check that unlocked label is reached with ZF set.
 819   Label zf_correct;
 820   Label zf_bad_zero;
 821   jcc(Assembler::zero, zf_correct);
 822   jmp(zf_bad_zero);
 823 #endif
 824 
 825   bind(slow_path);
 826   if (stub != nullptr) {
 827     bind(stub->slow_path_continuation());
 828   }
 829 #ifdef ASSERT
 830   // Check that stub->continuation() label is reached with ZF not set.
 831   jcc(Assembler::notZero, zf_correct);
 832   stop("Fast Unlock ZF != 0");
 833   bind(zf_bad_zero);
 834   stop("Fast Unlock ZF != 1");
 835   bind(zf_correct);
 836 #endif
 837   // C2 uses the value of ZF to determine the continuation.
 838 }
 839 
 840 //-------------------------------------------------------------------------------------------
 841 // Generic instructions support for use in .ad files C2 code generation
 842 
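// Vector float/double abs/neg: abs clears the sign bit by ANDing with a sign mask,
// neg toggles it by XORing with a sign-flip mask (the StubRoutines constants
// referenced below).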
 843 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 844   if (dst != src) {
 845     movdqu(dst, src);
 846   }
 847   if (opcode == Op_AbsVD) {
 848     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 849   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 851     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 852   }
 853 }
 854 
 855 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 856   if (opcode == Op_AbsVD) {
 857     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 858   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 860     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 861   }
 862 }
 863 
 864 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 865   if (dst != src) {
 866     movdqu(dst, src);
 867   }
 868   if (opcode == Op_AbsVF) {
 869     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 870   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 872     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 873   }
 874 }
 875 
 876 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 877   if (opcode == Op_AbsVF) {
 878     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 879   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 881     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 882   }
 883 }
 884 
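// Legacy (non-AVX) vector min/max. For T_LONG there is no packed signed min/max in
// SSE, so it is emulated with pcmpgtq + blendvpd; the legacy blendvpd encoding uses
// xmm0 as the implicit mask register, hence the tmp == xmm0 requirement below.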
 885 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 886   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 887   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 888 
 889   if (opcode == Op_MinV) {
 890     if (elem_bt == T_BYTE) {
 891       pminsb(dst, src);
 892     } else if (elem_bt == T_SHORT) {
 893       pminsw(dst, src);
 894     } else if (elem_bt == T_INT) {
 895       pminsd(dst, src);
 896     } else {
 897       assert(elem_bt == T_LONG, "required");
 898       assert(tmp == xmm0, "required");
 899       assert_different_registers(dst, src, tmp);
 900       movdqu(xmm0, dst);
 901       pcmpgtq(xmm0, src);
 902       blendvpd(dst, src);  // xmm0 as mask
 903     }
 904   } else { // opcode == Op_MaxV
 905     if (elem_bt == T_BYTE) {
 906       pmaxsb(dst, src);
 907     } else if (elem_bt == T_SHORT) {
 908       pmaxsw(dst, src);
 909     } else if (elem_bt == T_INT) {
 910       pmaxsd(dst, src);
 911     } else {
 912       assert(elem_bt == T_LONG, "required");
 913       assert(tmp == xmm0, "required");
 914       assert_different_registers(dst, src, tmp);
 915       movdqu(xmm0, src);
 916       pcmpgtq(xmm0, dst);
 917       blendvpd(dst, src);  // xmm0 as mask
 918     }
 919   }
 920 }
 921 
 922 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 923                                   XMMRegister src1, Address src2, int vlen_enc) {
 924   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 925   if (opcode == Op_UMinV) {
 926     switch(elem_bt) {
 927       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 928       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 929       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 930       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 931       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 932     }
 933   } else {
 934     assert(opcode == Op_UMaxV, "required");
 935     switch(elem_bt) {
 936       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 937       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 938       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 939       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 940       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 941     }
 942   }
 943 }
 944 
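// Unsigned 64-bit min/max. On EVEX targets without AVX512VL the operation is simply
// widened to 512 bits; otherwise it is emulated by biasing both operands by 2^63
// (flipping the sign bit) so that the signed vpcmpgtq yields the unsigned ordering,
// and then blending the original operands with the resulting mask.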
 945 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 946   // For optimality, leverage a full vector width of 512 bits
 947   // for operations over smaller vector sizes on AVX512 targets.
 948   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 949     if (opcode == Op_UMaxV) {
 950       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 951     } else {
 952       assert(opcode == Op_UMinV, "required");
 953       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 954     }
 955   } else {
 956     // T1 = -1
 957     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 958     // T1 = -1 << 63
 959     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 960     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 961     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 962     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 963     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 964     // Mask = T2 > T1
 965     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 966     if (opcode == Op_UMaxV) {
 967       // Res = Mask ? Src2 : Src1
 968       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 969     } else {
 970       // Res = Mask ? Src1 : Src2
 971       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 972     }
 973   }
 974 }
 975 
 976 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 977                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 978   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 979   if (opcode == Op_UMinV) {
 980     switch(elem_bt) {
 981       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 982       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 983       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 984       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 985       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 986     }
 987   } else {
 988     assert(opcode == Op_UMaxV, "required");
 989     switch(elem_bt) {
 990       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 991       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 992       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 993       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 994       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 995     }
 996   }
 997 }
 998 
 999 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1000                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1001                                  int vlen_enc) {
1002   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1003 
1004   if (opcode == Op_MinV) {
1005     if (elem_bt == T_BYTE) {
1006       vpminsb(dst, src1, src2, vlen_enc);
1007     } else if (elem_bt == T_SHORT) {
1008       vpminsw(dst, src1, src2, vlen_enc);
1009     } else if (elem_bt == T_INT) {
1010       vpminsd(dst, src1, src2, vlen_enc);
1011     } else {
1012       assert(elem_bt == T_LONG, "required");
1013       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1014         vpminsq(dst, src1, src2, vlen_enc);
1015       } else {
1016         assert_different_registers(dst, src1, src2);
1017         vpcmpgtq(dst, src1, src2, vlen_enc);
1018         vblendvpd(dst, src1, src2, dst, vlen_enc);
1019       }
1020     }
1021   } else { // opcode == Op_MaxV
1022     if (elem_bt == T_BYTE) {
1023       vpmaxsb(dst, src1, src2, vlen_enc);
1024     } else if (elem_bt == T_SHORT) {
1025       vpmaxsw(dst, src1, src2, vlen_enc);
1026     } else if (elem_bt == T_INT) {
1027       vpmaxsd(dst, src1, src2, vlen_enc);
1028     } else {
1029       assert(elem_bt == T_LONG, "required");
1030       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1031         vpmaxsq(dst, src1, src2, vlen_enc);
1032       } else {
1033         assert_different_registers(dst, src1, src2);
1034         vpcmpgtq(dst, src1, src2, vlen_enc);
1035         vblendvpd(dst, src2, src1, dst, vlen_enc);
1036       }
1037     }
1038   }
1039 }
1040 
1041 // Float/Double min max
1042 
1043 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1044                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1045                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1046                                    int vlen_enc) {
1047   assert(UseAVX > 0, "required");
1048   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1049          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1050   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1051   assert_different_registers(a, tmp, atmp, btmp);
1052   assert_different_registers(b, tmp, atmp, btmp);
1053 
1054   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1055   bool is_double_word = is_double_word_type(elem_bt);
1056 
1057   /* Note on 'non-obvious' assembly sequence:
1058    *
1059    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1060    * and Java on how they handle floats:
1061    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1063    *
1064    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1065    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1066    *                (only useful when signs differ, noop otherwise)
1067    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
1069    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1070    *   btmp = (b < +0.0) ? a : b
1071    *   atmp = (b < +0.0) ? b : a
1072    *   Tmp  = Max_Float(atmp , btmp)
1073    *   Res  = (atmp == NaN) ? atmp : Tmp
1074    */
1075 
1076   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1077   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1078   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1079   XMMRegister mask;
1080 
1081   if (!is_double_word && is_min) {
1082     mask = a;
1083     vblend = &MacroAssembler::vblendvps;
1084     vmaxmin = &MacroAssembler::vminps;
1085     vcmp = &MacroAssembler::vcmpps;
1086   } else if (!is_double_word && !is_min) {
1087     mask = b;
1088     vblend = &MacroAssembler::vblendvps;
1089     vmaxmin = &MacroAssembler::vmaxps;
1090     vcmp = &MacroAssembler::vcmpps;
1091   } else if (is_double_word && is_min) {
1092     mask = a;
1093     vblend = &MacroAssembler::vblendvpd;
1094     vmaxmin = &MacroAssembler::vminpd;
1095     vcmp = &MacroAssembler::vcmppd;
1096   } else {
1097     assert(is_double_word && !is_min, "sanity");
1098     mask = b;
1099     vblend = &MacroAssembler::vblendvpd;
1100     vmaxmin = &MacroAssembler::vmaxpd;
1101     vcmp = &MacroAssembler::vcmppd;
1102   }
1103 
1104   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1105   XMMRegister maxmin, scratch;
1106   if (dst == btmp) {
1107     maxmin = btmp;
1108     scratch = tmp;
1109   } else {
1110     maxmin = tmp;
1111     scratch = btmp;
1112   }
1113 
1114   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1115   if (precompute_mask && !is_double_word) {
1116     vpsrad(tmp, mask, 32, vlen_enc);
1117     mask = tmp;
1118   } else if (precompute_mask && is_double_word) {
1119     vpxor(tmp, tmp, tmp, vlen_enc);
1120     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1121     mask = tmp;
1122   }
1123 
1124   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1125   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1126   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1127   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1128   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1129 }
1130 
1131 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1132                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1133                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1134                                     int vlen_enc) {
1135   assert(UseAVX > 2, "required");
1136   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1137          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1138   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1139   assert_different_registers(dst, a, atmp, btmp);
1140   assert_different_registers(dst, b, atmp, btmp);
1141 
1142   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1143   bool is_double_word = is_double_word_type(elem_bt);
1144   bool merge = true;
1145 
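  // evpmovd2m/evpmovq2m copy each lane's sign bit into ktmp, flagging the negative
  // lanes (including -0.0). That mask is used to order the operands fed to vmin/vmax
  // and, after the UNORD compare, to re-select NaN inputs, mirroring the
  // vblendv-based sequence in vminmax_fp above so Java min/max semantics hold.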
1146   if (!is_double_word && is_min) {
1147     evpmovd2m(ktmp, a, vlen_enc);
1148     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1149     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1150     vminps(dst, atmp, btmp, vlen_enc);
1151     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1152     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1153   } else if (!is_double_word && !is_min) {
1154     evpmovd2m(ktmp, b, vlen_enc);
1155     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1156     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1157     vmaxps(dst, atmp, btmp, vlen_enc);
1158     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1159     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1160   } else if (is_double_word && is_min) {
1161     evpmovq2m(ktmp, a, vlen_enc);
1162     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1163     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1164     vminpd(dst, atmp, btmp, vlen_enc);
1165     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1166     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1167   } else {
1168     assert(is_double_word && !is_min, "sanity");
1169     evpmovq2m(ktmp, b, vlen_enc);
1170     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1171     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1172     vmaxpd(dst, atmp, btmp, vlen_enc);
1173     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1174     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1175   }
1176 }
1177 
1178 // Float/Double signum
1179 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1180   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1181 
1182   Label DONE_LABEL;
1183 
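  // Compare dst against +0.0: if equal (+/-0.0) or unordered (NaN) return the argument
  // unchanged; otherwise load 1.0 and, when the argument was below zero, flip its sign
  // to produce -1.0.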
1184   if (opcode == Op_SignumF) {
1185     assert(UseSSE > 0, "required");
1186     ucomiss(dst, zero);
1187     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1188     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1189     movflt(dst, one);
1190     jcc(Assembler::above, DONE_LABEL);
1191     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1192   } else if (opcode == Op_SignumD) {
1193     assert(UseSSE > 1, "required");
1194     ucomisd(dst, zero);
1195     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1196     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1197     movdbl(dst, one);
1198     jcc(Assembler::above, DONE_LABEL);
1199     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1200   }
1201 
1202   bind(DONE_LABEL);
1203 }
1204 
1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1206   if (sign) {
1207     pmovsxbw(dst, src);
1208   } else {
1209     pmovzxbw(dst, src);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1214   if (sign) {
1215     vpmovsxbw(dst, src, vector_len);
1216   } else {
1217     vpmovzxbw(dst, src, vector_len);
1218   }
1219 }
1220 
1221 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1222   if (sign) {
1223     vpmovsxbd(dst, src, vector_len);
1224   } else {
1225     vpmovzxbd(dst, src, vector_len);
1226   }
1227 }
1228 
1229 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1230   if (sign) {
1231     vpmovsxwd(dst, src, vector_len);
1232   } else {
1233     vpmovzxwd(dst, src, vector_len);
1234   }
1235 }
1236 
1237 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1238                                      int shift, int vector_len) {
1239   if (opcode == Op_RotateLeftV) {
1240     if (etype == T_INT) {
1241       evprold(dst, src, shift, vector_len);
1242     } else {
1243       assert(etype == T_LONG, "expected type T_LONG");
1244       evprolq(dst, src, shift, vector_len);
1245     }
1246   } else {
1247     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1248     if (etype == T_INT) {
1249       evprord(dst, src, shift, vector_len);
1250     } else {
1251       assert(etype == T_LONG, "expected type T_LONG");
1252       evprorq(dst, src, shift, vector_len);
1253     }
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1258                                      XMMRegister shift, int vector_len) {
1259   if (opcode == Op_RotateLeftV) {
1260     if (etype == T_INT) {
1261       evprolvd(dst, src, shift, vector_len);
1262     } else {
1263       assert(etype == T_LONG, "expected type T_LONG");
1264       evprolvq(dst, src, shift, vector_len);
1265     }
1266   } else {
1267     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1268     if (etype == T_INT) {
1269       evprorvd(dst, src, shift, vector_len);
1270     } else {
1271       assert(etype == T_LONG, "expected type T_LONG");
1272       evprorvq(dst, src, shift, vector_len);
1273     }
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1278   if (opcode == Op_RShiftVI) {
1279     psrad(dst, shift);
1280   } else if (opcode == Op_LShiftVI) {
1281     pslld(dst, shift);
1282   } else {
1283     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1284     psrld(dst, shift);
1285   }
1286 }
1287 
1288 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1289   switch (opcode) {
1290     case Op_RShiftVI:  psrad(dst, shift); break;
1291     case Op_LShiftVI:  pslld(dst, shift); break;
1292     case Op_URShiftVI: psrld(dst, shift); break;
1293 
1294     default: assert(false, "%s", NodeClassNames[opcode]);
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1299   if (opcode == Op_RShiftVI) {
1300     vpsrad(dst, nds, shift, vector_len);
1301   } else if (opcode == Op_LShiftVI) {
1302     vpslld(dst, nds, shift, vector_len);
1303   } else {
1304     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1305     vpsrld(dst, nds, shift, vector_len);
1306   }
1307 }
1308 
1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1310   switch (opcode) {
1311     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1312     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1313     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1314 
1315     default: assert(false, "%s", NodeClassNames[opcode]);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1320   switch (opcode) {
1321     case Op_RShiftVB:  // fall-through
1322     case Op_RShiftVS:  psraw(dst, shift); break;
1323 
1324     case Op_LShiftVB:  // fall-through
1325     case Op_LShiftVS:  psllw(dst, shift);   break;
1326 
1327     case Op_URShiftVS: // fall-through
1328     case Op_URShiftVB: psrlw(dst, shift);  break;
1329 
1330     default: assert(false, "%s", NodeClassNames[opcode]);
1331   }
1332 }
1333 
1334 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1335   switch (opcode) {
1336     case Op_RShiftVB:  // fall-through
1337     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1338 
1339     case Op_LShiftVB:  // fall-through
1340     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1341 
1342     case Op_URShiftVS: // fall-through
1343     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1344 
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
1349 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1350   switch (opcode) {
1351     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1352     case Op_LShiftVL:  psllq(dst, shift); break;
1353     case Op_URShiftVL: psrlq(dst, shift); break;
1354 
1355     default: assert(false, "%s", NodeClassNames[opcode]);
1356   }
1357 }
1358 
1359 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1360   if (opcode == Op_RShiftVL) {
1361     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1362   } else if (opcode == Op_LShiftVL) {
1363     psllq(dst, shift);
1364   } else {
1365     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1366     psrlq(dst, shift);
1367   }
1368 }
1369 
1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1371   switch (opcode) {
1372     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1373     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1374     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1375 
1376     default: assert(false, "%s", NodeClassNames[opcode]);
1377   }
1378 }
1379 
1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1381   if (opcode == Op_RShiftVL) {
1382     evpsraq(dst, nds, shift, vector_len);
1383   } else if (opcode == Op_LShiftVL) {
1384     vpsllq(dst, nds, shift, vector_len);
1385   } else {
1386     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1387     vpsrlq(dst, nds, shift, vector_len);
1388   }
1389 }
1390 
1391 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1392   switch (opcode) {
1393     case Op_RShiftVB:  // fall-through
1394     case Op_RShiftVS:  // fall-through
1395     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1396 
1397     case Op_LShiftVB:  // fall-through
1398     case Op_LShiftVS:  // fall-through
1399     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1400 
1401     case Op_URShiftVB: // fall-through
1402     case Op_URShiftVS: // fall-through
1403     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1404 
1405     default: assert(false, "%s", NodeClassNames[opcode]);
1406   }
1407 }
1408 
1409 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1410   switch (opcode) {
1411     case Op_RShiftVB:  // fall-through
1412     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1413 
1414     case Op_LShiftVB:  // fall-through
1415     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1416 
1417     case Op_URShiftVB: // fall-through
1418     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1419 
1420     default: assert(false, "%s", NodeClassNames[opcode]);
1421   }
1422 }
1423 
1424 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1425   assert(UseAVX >= 2, "required");
1426   switch (opcode) {
1427     case Op_RShiftVL: {
1428       if (UseAVX > 2) {
1429         assert(tmp == xnoreg, "not used");
1430         if (!VM_Version::supports_avx512vl()) {
1431           vlen_enc = Assembler::AVX_512bit;
1432         }
1433         evpsravq(dst, src, shift, vlen_enc);
1434       } else {
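             // AVX2 has no variable arithmetic right shift for 64-bit lanes. Emulate it
             // with logical shifts and the sign mask M = 0x8000000000000000:
             //   sra(x, s) == (srl(x, s) ^ srl(M, s)) - srl(M, s)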
1435         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1436         vpsrlvq(dst, src, shift, vlen_enc);
1437         vpsrlvq(tmp, tmp, shift, vlen_enc);
1438         vpxor(dst, dst, tmp, vlen_enc);
1439         vpsubq(dst, dst, tmp, vlen_enc);
1440       }
1441       break;
1442     }
1443     case Op_LShiftVL: {
1444       assert(tmp == xnoreg, "not used");
1445       vpsllvq(dst, src, shift, vlen_enc);
1446       break;
1447     }
1448     case Op_URShiftVL: {
1449       assert(tmp == xnoreg, "not used");
1450       vpsrlvq(dst, src, shift, vlen_enc);
1451       break;
1452     }
1453     default: assert(false, "%s", NodeClassNames[opcode]);
1454   }
1455 }
1456 
1457 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1458 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1459   assert(opcode == Op_LShiftVB ||
1460          opcode == Op_RShiftVB ||
1461          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1462   bool sign = (opcode != Op_URShiftVB);
1463   assert(vector_len == 0, "required");
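       // Widen the 8 byte elements and their shift counts to dwords, perform the
       // variable dword shift, then mask each result to a byte and pack the dwords
       // back down to words.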
1464   vextendbd(sign, dst, src, 1);
1465   vpmovzxbd(vtmp, shift, 1);
1466   varshiftd(opcode, dst, dst, vtmp, 1);
1467   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1468   vextracti128_high(vtmp, dst);
1469   vpackusdw(dst, dst, vtmp, 0);
1470 }
1471 
1472 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1473 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1474   assert(opcode == Op_LShiftVB ||
1475          opcode == Op_RShiftVB ||
1476          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1477   bool sign = (opcode != Op_URShiftVB);
1478   int ext_vector_len = vector_len + 1;
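       // Widen the byte elements and their shift counts to words, perform the
       // variable word shift, then mask each result back into byte range and pack
       // the words down to bytes; for results wider than 128 bits a vpermq fixes
       // the cross-lane interleave left by vpackuswb.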
1479   vextendbw(sign, dst, src, ext_vector_len);
1480   vpmovzxbw(vtmp, shift, ext_vector_len);
1481   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1482   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1483   if (vector_len == 0) {
1484     vextracti128_high(vtmp, dst);
1485     vpackuswb(dst, dst, vtmp, vector_len);
1486   } else {
1487     vextracti64x4_high(vtmp, dst);
1488     vpackuswb(dst, dst, vtmp, vector_len);
1489     vpermq(dst, dst, 0xD8, vector_len);
1490   }
1491 }
1492 
1493 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1494   switch(typ) {
1495     case T_BYTE:
1496       pinsrb(dst, val, idx);
1497       break;
1498     case T_SHORT:
1499       pinsrw(dst, val, idx);
1500       break;
1501     case T_INT:
1502       pinsrd(dst, val, idx);
1503       break;
1504     case T_LONG:
1505       pinsrq(dst, val, idx);
1506       break;
1507     default:
1508       assert(false,"Should not reach here.");
1509       break;
1510   }
1511 }
1512 
1513 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1514   switch(typ) {
1515     case T_BYTE:
1516       vpinsrb(dst, src, val, idx);
1517       break;
1518     case T_SHORT:
1519       vpinsrw(dst, src, val, idx);
1520       break;
1521     case T_INT:
1522       vpinsrd(dst, src, val, idx);
1523       break;
1524     case T_LONG:
1525       vpinsrq(dst, src, val, idx);
1526       break;
1527     default:
1528       assert(false,"Should not reach here.");
1529       break;
1530   }
1531 }
1532 
1533 #ifdef _LP64
1534 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1535                                                 XMMRegister dst, Register base,
1536                                                 Register idx_base,
1537                                                 Register offset, Register mask,
1538                                                 Register mask_idx, Register rtmp,
1539                                                 int vlen_enc) {
1540   vpxor(dst, dst, dst, vlen_enc);
1541   if (elem_bt == T_SHORT) {
1542     for (int i = 0; i < 4; i++) {
1543       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1544       Label skip_load;
1545       btq(mask, mask_idx);
1546       jccb(Assembler::carryClear, skip_load);
1547       movl(rtmp, Address(idx_base, i * 4));
1548       if (offset != noreg) {
1549         addl(rtmp, offset);
1550       }
1551       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1552       bind(skip_load);
1553       incq(mask_idx);
1554     }
1555   } else {
1556     assert(elem_bt == T_BYTE, "");
1557     for (int i = 0; i < 8; i++) {
1558       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1559       Label skip_load;
1560       btq(mask, mask_idx);
1561       jccb(Assembler::carryClear, skip_load);
1562       movl(rtmp, Address(idx_base, i * 4));
1563       if (offset != noreg) {
1564         addl(rtmp, offset);
1565       }
1566       pinsrb(dst, Address(base, rtmp), i);
1567       bind(skip_load);
1568       incq(mask_idx);
1569     }
1570   }
1571 }
1572 #endif // _LP64
1573 
1574 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1575                                          Register base, Register idx_base,
1576                                          Register offset, Register rtmp,
1577                                          int vlen_enc) {
1578   vpxor(dst, dst, dst, vlen_enc);
1579   if (elem_bt == T_SHORT) {
1580     for (int i = 0; i < 4; i++) {
1581       // dst[i] = src[offset + idx_base[i]]
1582       movl(rtmp, Address(idx_base, i * 4));
1583       if (offset != noreg) {
1584         addl(rtmp, offset);
1585       }
1586       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1587     }
1588   } else {
1589     assert(elem_bt == T_BYTE, "");
1590     for (int i = 0; i < 8; i++) {
1591       // dst[i] = src[offset + idx_base[i]]
1592       movl(rtmp, Address(idx_base, i * 4));
1593       if (offset != noreg) {
1594         addl(rtmp, offset);
1595       }
1596       pinsrb(dst, Address(base, rtmp), i);
1597     }
1598   }
1599 }
1600 
1601 /*
1602  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1603  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1604  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1605  * permutation that places the slice into the appropriate vector lane
1606  * locations of the destination vector. The following pseudo code describes
1607  * the algorithm in detail:
1608  *
1609  * DST_VEC = ZERO_VEC
1610  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1611  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1612  * FOREACH_ITER:
1613  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1614  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1615  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1616  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1617  *
1618  * With each iteration, the doubleword permute indices (0, 1) corresponding
1619  * to the gathered quadword are shifted right by two lane positions.
1620  *
1621  */
1622 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1623                                         Register base, Register idx_base,
1624                                         Register offset, Register mask,
1625                                         XMMRegister xtmp1, XMMRegister xtmp2,
1626                                         XMMRegister temp_dst, Register rtmp,
1627                                         Register mask_idx, Register length,
1628                                         int vector_len, int vlen_enc) {
1629   Label GATHER8_LOOP;
1630   assert(is_subword_type(elem_ty), "");
1631   movl(length, vector_len);
1632   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1633   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1634   vallones(xtmp2, vlen_enc);
1635   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1636   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1637   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1638 
1639   bind(GATHER8_LOOP);
1640     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1641     if (mask == noreg) {
1642       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1643     } else {
1644       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1645     }
1646     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1647     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1648     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1649     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1650     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1651     vpor(dst, dst, temp_dst, vlen_enc);
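         // Advance to the next group of gather indices (8 for bytes, 4 for shorts)
         // and decrement the remaining element count accordingly.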
1652     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1653     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1654     jcc(Assembler::notEqual, GATHER8_LOOP);
1655 }
1656 
1657 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1658   switch(typ) {
1659     case T_INT:
1660       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1661       break;
1662     case T_FLOAT:
1663       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1664       break;
1665     case T_LONG:
1666       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1667       break;
1668     case T_DOUBLE:
1669       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1670       break;
1671     default:
1672       assert(false,"Should not reach here.");
1673       break;
1674   }
1675 }
1676 
1677 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1678   switch(typ) {
1679     case T_INT:
1680       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1681       break;
1682     case T_FLOAT:
1683       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1684       break;
1685     case T_LONG:
1686       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1687       break;
1688     case T_DOUBLE:
1689       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1690       break;
1691     default:
1692       assert(false,"Should not reach here.");
1693       break;
1694   }
1695 }
1696 
1697 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1698   switch(typ) {
1699     case T_INT:
1700       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1701       break;
1702     case T_FLOAT:
1703       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1704       break;
1705     case T_LONG:
1706       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1707       break;
1708     case T_DOUBLE:
1709       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1710       break;
1711     default:
1712       assert(false,"Should not reach here.");
1713       break;
1714   }
1715 }
1716 
1717 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
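       // The boolean mask holds 0 or 1 per byte. Negating it (0 - x) turns each 1
       // into -1 (all bits set), which is then sign-extended to the element width.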
1718   if (vlen_in_bytes <= 16) {
1719     pxor (dst, dst);
1720     psubb(dst, src);
1721     switch (elem_bt) {
1722       case T_BYTE:   /* nothing to do */ break;
1723       case T_SHORT:  pmovsxbw(dst, dst); break;
1724       case T_INT:    pmovsxbd(dst, dst); break;
1725       case T_FLOAT:  pmovsxbd(dst, dst); break;
1726       case T_LONG:   pmovsxbq(dst, dst); break;
1727       case T_DOUBLE: pmovsxbq(dst, dst); break;
1728 
1729       default: assert(false, "%s", type2name(elem_bt));
1730     }
1731   } else {
1732     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1733     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1734 
1735     vpxor (dst, dst, dst, vlen_enc);
1736     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1737 
1738     switch (elem_bt) {
1739       case T_BYTE:   /* nothing to do */            break;
1740       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1741       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1742       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1743       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1744       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1745 
1746       default: assert(false, "%s", type2name(elem_bt));
1747     }
1748   }
1749 }
1750 
1751 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
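       // Produce an opmask register from a byte-wise boolean vector. Without the
       // required AVX512VL/BW support (novlbwdq), widen the booleans to dwords and
       // compare them against the stub's reference bit pattern; otherwise negate
       // them to all-ones bytes and extract the mask with evpmovb2m.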
1752   if (novlbwdq) {
1753     vpmovsxbd(xtmp, src, vlen_enc);
1754     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1755             Assembler::eq, true, vlen_enc, noreg);
1756   } else {
1757     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1758     vpsubb(xtmp, xtmp, src, vlen_enc);
1759     evpmovb2m(dst, xtmp, vlen_enc);
1760   }
1761 }
1762 
1763 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1764   switch (vlen_in_bytes) {
1765     case 4:  movdl(dst, src);   break;
1766     case 8:  movq(dst, src);    break;
1767     case 16: movdqu(dst, src);  break;
1768     case 32: vmovdqu(dst, src); break;
1769     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1770     default: ShouldNotReachHere();
1771   }
1772 }
1773 
1774 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1775   assert(rscratch != noreg || always_reachable(src), "missing");
1776 
1777   if (reachable(src)) {
1778     load_vector(dst, as_Address(src), vlen_in_bytes);
1779   } else {
1780     lea(rscratch, src);
1781     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1782   }
1783 }
1784 
1785 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1786   int vlen_enc = vector_length_encoding(vlen);
1787   if (VM_Version::supports_avx()) {
1788     if (bt == T_LONG) {
1789       if (VM_Version::supports_avx2()) {
1790         vpbroadcastq(dst, src, vlen_enc);
1791       } else {
1792         vmovddup(dst, src, vlen_enc);
1793       }
1794     } else if (bt == T_DOUBLE) {
1795       if (vlen_enc != Assembler::AVX_128bit) {
1796         vbroadcastsd(dst, src, vlen_enc, noreg);
1797       } else {
1798         vmovddup(dst, src, vlen_enc);
1799       }
1800     } else {
1801       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1802         vpbroadcastd(dst, src, vlen_enc);
1803       } else {
1804         vbroadcastss(dst, src, vlen_enc);
1805       }
1806     }
1807   } else if (VM_Version::supports_sse3()) {
1808     movddup(dst, src);
1809   } else {
1810     movq(dst, src);
1811     if (vlen == 16) {
1812       punpcklqdq(dst, dst);
1813     }
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1818   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1819   int offset = exact_log2(type2aelembytes(bt)) << 6;
1820   if (is_floating_point_type(bt)) {
1821     offset += 128;
1822   }
1823   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1824   load_vector(dst, addr, vlen_in_bytes);
1825 }
1826 
1827 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1828 
1829 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1830   int vector_len = Assembler::AVX_128bit;
1831 
1832   switch (opcode) {
1833     case Op_AndReductionV:  pand(dst, src); break;
1834     case Op_OrReductionV:   por (dst, src); break;
1835     case Op_XorReductionV:  pxor(dst, src); break;
1836     case Op_MinReductionV:
1837       switch (typ) {
1838         case T_BYTE:        pminsb(dst, src); break;
1839         case T_SHORT:       pminsw(dst, src); break;
1840         case T_INT:         pminsd(dst, src); break;
1841         case T_LONG:        assert(UseAVX > 2, "required");
1842                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1843         default:            assert(false, "wrong type");
1844       }
1845       break;
1846     case Op_MaxReductionV:
1847       switch (typ) {
1848         case T_BYTE:        pmaxsb(dst, src); break;
1849         case T_SHORT:       pmaxsw(dst, src); break;
1850         case T_INT:         pmaxsd(dst, src); break;
1851         case T_LONG:        assert(UseAVX > 2, "required");
1852                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1853         default:            assert(false, "wrong type");
1854       }
1855       break;
1856     case Op_AddReductionVF: addss(dst, src); break;
1857     case Op_AddReductionVD: addsd(dst, src); break;
1858     case Op_AddReductionVI:
1859       switch (typ) {
1860         case T_BYTE:        paddb(dst, src); break;
1861         case T_SHORT:       paddw(dst, src); break;
1862         case T_INT:         paddd(dst, src); break;
1863         default:            assert(false, "wrong type");
1864       }
1865       break;
1866     case Op_AddReductionVL: paddq(dst, src); break;
1867     case Op_MulReductionVF: mulss(dst, src); break;
1868     case Op_MulReductionVD: mulsd(dst, src); break;
1869     case Op_MulReductionVI:
1870       switch (typ) {
1871         case T_SHORT:       pmullw(dst, src); break;
1872         case T_INT:         pmulld(dst, src); break;
1873         default:            assert(false, "wrong type");
1874       }
1875       break;
1876     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1877                             evpmullq(dst, dst, src, vector_len); break;
1878     default:                assert(false, "wrong opcode");
1879   }
1880 }
1881 
1882 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1883   switch (opcode) {
1884     case Op_AddReductionVF: addps(dst, src); break;
1885     case Op_AddReductionVD: addpd(dst, src); break;
1886     case Op_MulReductionVF: mulps(dst, src); break;
1887     case Op_MulReductionVD: mulpd(dst, src); break;
1888     default:                assert(false, "%s", NodeClassNames[opcode]);
1889   }
1890 }
1891 
1892 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1893   int vector_len = Assembler::AVX_256bit;
1894 
1895   switch (opcode) {
1896     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1897     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1898     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1899     case Op_MinReductionV:
1900       switch (typ) {
1901         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1902         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1903         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1904         case T_LONG:        assert(UseAVX > 2, "required");
1905                             vpminsq(dst, src1, src2, vector_len); break;
1906         default:            assert(false, "wrong type");
1907       }
1908       break;
1909     case Op_MaxReductionV:
1910       switch (typ) {
1911         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1912         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1913         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1914         case T_LONG:        assert(UseAVX > 2, "required");
1915                             vpmaxsq(dst, src1, src2, vector_len); break;
1916         default:            assert(false, "wrong type");
1917       }
1918       break;
1919     case Op_AddReductionVI:
1920       switch (typ) {
1921         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1922         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1923         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1924         default:            assert(false, "wrong type");
1925       }
1926       break;
1927     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1928     case Op_MulReductionVI:
1929       switch (typ) {
1930         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1931         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1932         default:            assert(false, "wrong type");
1933       }
1934       break;
1935     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1936     default:                assert(false, "wrong opcode");
1937   }
1938 }
1939 
1940 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1941   int vector_len = Assembler::AVX_256bit;
1942 
1943   switch (opcode) {
1944     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1945     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1946     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1947     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1948     default:                assert(false, "%s", NodeClassNames[opcode]);
1949   }
1950 }
1951 
1952 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1953                                   XMMRegister dst, XMMRegister src,
1954                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1955   switch (opcode) {
1956     case Op_AddReductionVF:
1957     case Op_MulReductionVF:
1958       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1959       break;
1960 
1961     case Op_AddReductionVD:
1962     case Op_MulReductionVD:
1963       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1964       break;
1965 
1966     default: assert(false, "wrong opcode");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1971                                             XMMRegister dst, XMMRegister src,
1972                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1973   switch (opcode) {
1974     case Op_AddReductionVF:
1975     case Op_MulReductionVF:
1976       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1977       break;
1978 
1979     case Op_AddReductionVD:
1980     case Op_MulReductionVD:
1981       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1982       break;
1983 
1984     default: assert(false, "%s", NodeClassNames[opcode]);
1985   }
1986 }
1987 
1988 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1989                              Register dst, Register src1, XMMRegister src2,
1990                              XMMRegister vtmp1, XMMRegister vtmp2) {
1991   switch (vlen) {
1992     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1993     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1994     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1995     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1996 
1997     default: assert(false, "wrong vector length");
1998   }
1999 }
2000 
2001 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2002                              Register dst, Register src1, XMMRegister src2,
2003                              XMMRegister vtmp1, XMMRegister vtmp2) {
2004   switch (vlen) {
2005     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2006     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2007     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2008     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2009 
2010     default: assert(false, "wrong vector length");
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2015                              Register dst, Register src1, XMMRegister src2,
2016                              XMMRegister vtmp1, XMMRegister vtmp2) {
2017   switch (vlen) {
2018     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2019     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2020     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2021     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2022 
2023     default: assert(false, "wrong vector length");
2024   }
2025 }
2026 
2027 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2028                              Register dst, Register src1, XMMRegister src2,
2029                              XMMRegister vtmp1, XMMRegister vtmp2) {
2030   switch (vlen) {
2031     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2032     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2033     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2034     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2035 
2036     default: assert(false, "wrong vector length");
2037   }
2038 }
2039 
2040 #ifdef _LP64
2041 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2042                              Register dst, Register src1, XMMRegister src2,
2043                              XMMRegister vtmp1, XMMRegister vtmp2) {
2044   switch (vlen) {
2045     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2046     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2047     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2048 
2049     default: assert(false, "wrong vector length");
2050   }
2051 }
2052 #endif // _LP64
2053 
2054 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2055   switch (vlen) {
2056     case 2:
2057       assert(vtmp2 == xnoreg, "");
2058       reduce2F(opcode, dst, src, vtmp1);
2059       break;
2060     case 4:
2061       assert(vtmp2 == xnoreg, "");
2062       reduce4F(opcode, dst, src, vtmp1);
2063       break;
2064     case 8:
2065       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2066       break;
2067     case 16:
2068       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2069       break;
2070     default: assert(false, "wrong vector length");
2071   }
2072 }
2073 
2074 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2075   switch (vlen) {
2076     case 2:
2077       assert(vtmp2 == xnoreg, "");
2078       reduce2D(opcode, dst, src, vtmp1);
2079       break;
2080     case 4:
2081       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2082       break;
2083     case 8:
2084       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2085       break;
2086     default: assert(false, "wrong vector length");
2087   }
2088 }
2089 
2090 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   switch (vlen) {
2092     case 2:
2093       assert(vtmp1 == xnoreg, "");
2094       assert(vtmp2 == xnoreg, "");
2095       unorderedReduce2F(opcode, dst, src);
2096       break;
2097     case 4:
2098       assert(vtmp2 == xnoreg, "");
2099       unorderedReduce4F(opcode, dst, src, vtmp1);
2100       break;
2101     case 8:
2102       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2103       break;
2104     case 16:
2105       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2106       break;
2107     default: assert(false, "wrong vector length");
2108   }
2109 }
2110 
2111 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2112   switch (vlen) {
2113     case 2:
2114       assert(vtmp1 == xnoreg, "");
2115       assert(vtmp2 == xnoreg, "");
2116       unorderedReduce2D(opcode, dst, src);
2117       break;
2118     case 4:
2119       assert(vtmp2 == xnoreg, "");
2120       unorderedReduce4D(opcode, dst, src, vtmp1);
2121       break;
2122     case 8:
2123       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2124       break;
2125     default: assert(false, "wrong vector length");
2126   }
2127 }
2128 
2129 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130   if (opcode == Op_AddReductionVI) {
2131     if (vtmp1 != src2) {
2132       movdqu(vtmp1, src2);
2133     }
2134     phaddd(vtmp1, vtmp1);
2135   } else {
2136     pshufd(vtmp1, src2, 0x1);
2137     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2138   }
2139   movdl(vtmp2, src1);
2140   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2141   movdl(dst, vtmp1);
2142 }
2143 
2144 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2145   if (opcode == Op_AddReductionVI) {
2146     if (vtmp1 != src2) {
2147       movdqu(vtmp1, src2);
2148     }
2149     phaddd(vtmp1, src2);
2150     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151   } else {
2152     pshufd(vtmp2, src2, 0xE);
2153     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2154     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2155   }
2156 }
2157 
2158 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2159   if (opcode == Op_AddReductionVI) {
2160     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2161     vextracti128_high(vtmp2, vtmp1);
2162     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2163     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2164   } else {
2165     vextracti128_high(vtmp1, src2);
2166     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2167     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2168   }
2169 }
2170 
2171 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172   vextracti64x4_high(vtmp2, src2);
2173   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2174   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2175 }
2176 
2177 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
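       // Fold the 8 byte lanes pairwise (4+4, 2+2, 1+1), then combine the result
       // with the scalar src1 and sign-extend the final byte into dst.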
2178   pshufd(vtmp2, src2, 0x1);
2179   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2180   movdqu(vtmp1, vtmp2);
2181   psrldq(vtmp1, 2);
2182   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2183   movdqu(vtmp2, vtmp1);
2184   psrldq(vtmp2, 1);
2185   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2186   movdl(vtmp2, src1);
2187   pmovsxbd(vtmp1, vtmp1);
2188   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2189   pextrb(dst, vtmp1, 0x0);
2190   movsbl(dst, dst);
2191 }
2192 
2193 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2194   pshufd(vtmp1, src2, 0xE);
2195   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2196   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2197 }
2198 
2199 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2200   vextracti128_high(vtmp2, src2);
2201   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2202   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2203 }
2204 
2205 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2206   vextracti64x4_high(vtmp1, src2);
2207   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2208   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2209 }
2210 
2211 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2212   pmovsxbw(vtmp2, src2);
2213   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2214 }
2215 
2216 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2217   if (UseAVX > 1) {
2218     int vector_len = Assembler::AVX_256bit;
2219     vpmovsxbw(vtmp1, src2, vector_len);
2220     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2221   } else {
2222     pmovsxbw(vtmp2, src2);
2223     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2224     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes into the low qword
2225     pmovsxbw(vtmp2, vtmp2);
2226     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2227   }
2228 }
2229 
2230 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2231   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2232     int vector_len = Assembler::AVX_512bit;
2233     vpmovsxbw(vtmp1, src2, vector_len);
2234     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2235   } else {
2236     assert(UseAVX >= 2,"Should not reach here.");
2237     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2238     vextracti128_high(vtmp2, src2);
2239     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2240   }
2241 }
2242 
2243 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2244   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2245   vextracti64x4_high(vtmp2, src2);
2246   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2247 }
2248 
2249 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
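       // For additions, two horizontal adds collapse the four word lanes; otherwise
       // fold them pairwise. The partial result is then combined with the scalar
       // src1 and sign-extended into dst.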
2250   if (opcode == Op_AddReductionVI) {
2251     if (vtmp1 != src2) {
2252       movdqu(vtmp1, src2);
2253     }
2254     phaddw(vtmp1, vtmp1);
2255     phaddw(vtmp1, vtmp1);
2256   } else {
2257     pshufd(vtmp2, src2, 0x1);
2258     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2259     movdqu(vtmp1, vtmp2);
2260     psrldq(vtmp1, 2);
2261     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2262   }
2263   movdl(vtmp2, src1);
2264   pmovsxwd(vtmp1, vtmp1);
2265   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2266   pextrw(dst, vtmp1, 0x0);
2267   movswl(dst, dst);
2268 }
2269 
2270 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2271   if (opcode == Op_AddReductionVI) {
2272     if (vtmp1 != src2) {
2273       movdqu(vtmp1, src2);
2274     }
2275     phaddw(vtmp1, src2);
2276   } else {
2277     pshufd(vtmp1, src2, 0xE);
2278     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2279   }
2280   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2281 }
2282 
2283 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2284   if (opcode == Op_AddReductionVI) {
2285     int vector_len = Assembler::AVX_256bit;
2286     vphaddw(vtmp2, src2, src2, vector_len);
2287     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2288   } else {
2289     vextracti128_high(vtmp2, src2);
2290     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2291   }
2292   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2293 }
2294 
2295 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2296   int vector_len = Assembler::AVX_256bit;
2297   vextracti64x4_high(vtmp1, src2);
2298   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2299   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2300 }
2301 
2302 #ifdef _LP64
2303 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2304   pshufd(vtmp2, src2, 0xE);
2305   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2306   movdq(vtmp1, src1);
2307   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2308   movdq(dst, vtmp1);
2309 }
2310 
2311 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2312   vextracti128_high(vtmp1, src2);
2313   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2314   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2315 }
2316 
2317 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2318   vextracti64x4_high(vtmp2, src2);
2319   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2320   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2321 }
2322 
2323 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
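       // Materialize a mask with the low 'len' bits set: BZHI zeroes all bits of -1
       // at positions >= len.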
2324   mov64(temp, -1L);
2325   bzhiq(temp, temp, len);
2326   kmovql(dst, temp);
2327 }
2328 #endif // _LP64
2329 
2330 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2331   reduce_operation_128(T_FLOAT, opcode, dst, src);
2332   pshufd(vtmp, src, 0x1);
2333   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2334 }
2335 
2336 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2337   reduce2F(opcode, dst, src, vtmp);
2338   pshufd(vtmp, src, 0x2);
2339   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2340   pshufd(vtmp, src, 0x3);
2341   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2342 }
2343 
2344 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2345   reduce4F(opcode, dst, src, vtmp2);
2346   vextractf128_high(vtmp2, src);
2347   reduce4F(opcode, dst, vtmp2, vtmp1);
2348 }
2349 
2350 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2351   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2352   vextracti64x4_high(vtmp1, src);
2353   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2354 }
2355 
2356 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2357   pshufd(dst, src, 0x1);
2358   reduce_operation_128(T_FLOAT, opcode, dst, src);
2359 }
2360 
2361 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2362   pshufd(vtmp, src, 0xE);
2363   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2364   unorderedReduce2F(opcode, dst, vtmp);
2365 }
2366 
2367 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2368   vextractf128_high(vtmp1, src);
2369   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2370   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2371 }
2372 
2373 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2374   vextractf64x4_high(vtmp2, src);
2375   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2376   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2377 }
2378 
2379 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2380   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2381   pshufd(vtmp, src, 0xE);
2382   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2383 }
2384 
2385 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2386   reduce2D(opcode, dst, src, vtmp2);
2387   vextractf128_high(vtmp2, src);
2388   reduce2D(opcode, dst, vtmp2, vtmp1);
2389 }
2390 
2391 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2392   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2393   vextracti64x4_high(vtmp1, src);
2394   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2395 }
2396 
2397 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2398   pshufd(dst, src, 0xE);
2399   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2400 }
2401 
2402 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2403   vextractf128_high(vtmp, src);
2404   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2405   unorderedReduce2D(opcode, dst, vtmp);
2406 }
2407 
2408 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2409   vextractf64x4_high(vtmp2, src);
2410   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2411   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2412 }
2413 
2414 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2415   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2416 }
2417 
2418 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2419   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2420 }
2421 
2422 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2423   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2424 }
2425 
2426 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2427                                  int vec_enc) {
2428   switch(elem_bt) {
2429     case T_INT:
2430     case T_FLOAT:
2431       vmaskmovps(dst, src, mask, vec_enc);
2432       break;
2433     case T_LONG:
2434     case T_DOUBLE:
2435       vmaskmovpd(dst, src, mask, vec_enc);
2436       break;
2437     default:
2438       fatal("Unsupported type %s", type2name(elem_bt));
2439       break;
2440   }
2441 }
2442 
2443 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2444                                  int vec_enc) {
2445   switch(elem_bt) {
2446     case T_INT:
2447     case T_FLOAT:
2448       vmaskmovps(dst, src, mask, vec_enc);
2449       break;
2450     case T_LONG:
2451     case T_DOUBLE:
2452       vmaskmovpd(dst, src, mask, vec_enc);
2453       break;
2454     default:
2455       fatal("Unsupported type %s", type2name(elem_bt));
2456       break;
2457   }
2458 }
2459 
2460 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2461                                           XMMRegister dst, XMMRegister src,
2462                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2463                                           XMMRegister xmm_0, XMMRegister xmm_1) {
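       // Fold the vector in halves: extract the upper 256-bit/128-bit half (or
       // permute within a 128-bit lane) and combine it with the running value via
       // vminmax_fp until a single float remains; finally fold in dst if it holds
       // a valid partial result.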
2464   const int permconst[] = {1, 14};
2465   XMMRegister wsrc = src;
2466   XMMRegister wdst = xmm_0;
2467   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2468 
2469   int vlen_enc = Assembler::AVX_128bit;
2470   if (vlen == 16) {
2471     vlen_enc = Assembler::AVX_256bit;
2472   }
2473 
2474   for (int i = log2(vlen) - 1; i >=0; i--) {
2475     if (i == 0 && !is_dst_valid) {
2476       wdst = dst;
2477     }
2478     if (i == 3) {
2479       vextracti64x4_high(wtmp, wsrc);
2480     } else if (i == 2) {
2481       vextracti128_high(wtmp, wsrc);
2482     } else { // i = [0,1]
2483       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2484     }
2485     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2486     wsrc = wdst;
2487     vlen_enc = Assembler::AVX_128bit;
2488   }
2489   if (is_dst_valid) {
2490     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2491   }
2492 }
2493 
2494 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2495                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2496                                         XMMRegister xmm_0, XMMRegister xmm_1) {
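       // Same halving strategy as reduceFloatMinMax above, applied to double lanes.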
2497   XMMRegister wsrc = src;
2498   XMMRegister wdst = xmm_0;
2499   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2500   int vlen_enc = Assembler::AVX_128bit;
2501   if (vlen == 8) {
2502     vlen_enc = Assembler::AVX_256bit;
2503   }
2504   for (int i = log2(vlen) - 1; i >=0; i--) {
2505     if (i == 0 && !is_dst_valid) {
2506       wdst = dst;
2507     }
2508     if (i == 1) {
2509       vextracti128_high(wtmp, wsrc);
2510     } else if (i == 2) {
2511       vextracti64x4_high(wtmp, wsrc);
2512     } else {
2513       assert(i == 0, "%d", i);
2514       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2515     }
2516     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2517     wsrc = wdst;
2518     vlen_enc = Assembler::AVX_128bit;
2519   }
2520   if (is_dst_valid) {
2521     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2522   }
2523 }
2524 
2525 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2526   switch (bt) {
2527     case T_BYTE:  pextrb(dst, src, idx); break;
2528     case T_SHORT: pextrw(dst, src, idx); break;
2529     case T_INT:   pextrd(dst, src, idx); break;
2530     case T_LONG:  pextrq(dst, src, idx); break;
2531 
2532     default:
2533       assert(false,"Should not reach here.");
2534       break;
2535   }
2536 }
2537 
2538 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2539   int esize =  type2aelembytes(typ);
2540   int elem_per_lane = 16/esize;
2541   int lane = elemindex / elem_per_lane;
2542   int eindex = elemindex % elem_per_lane;
2543 
2544   if (lane >= 2) {
2545     assert(UseAVX > 2, "required");
2546     vextractf32x4(dst, src, lane & 3);
2547     return dst;
2548   } else if (lane > 0) {
2549     assert(UseAVX > 0, "required");
2550     vextractf128(dst, src, lane);
2551     return dst;
2552   } else {
2553     return src;
2554   }
2555 }
2556 
2557 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2558   if (typ == T_BYTE) {
2559     movsbl(dst, dst);
2560   } else if (typ == T_SHORT) {
2561     movswl(dst, dst);
2562   }
2563 }
2564 
2565 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2566   int esize =  type2aelembytes(typ);
2567   int elem_per_lane = 16/esize;
2568   int eindex = elemindex % elem_per_lane;
2569   assert(is_integral_type(typ),"required");
2570 
2571   if (eindex == 0) {
2572     if (typ == T_LONG) {
2573       movq(dst, src);
2574     } else {
2575       movdl(dst, src);
2576       movsxl(typ, dst);
2577     }
2578   } else {
2579     extract(typ, dst, src, eindex);
2580     movsxl(typ, dst);
2581   }
2582 }
2583 
2584 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2585   int esize =  type2aelembytes(typ);
2586   int elem_per_lane = 16/esize;
2587   int eindex = elemindex % elem_per_lane;
2588   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2589 
2590   if (eindex == 0) {
2591     movq(dst, src);
2592   } else {
2593     if (typ == T_FLOAT) {
2594       if (UseAVX == 0) {
2595         movdqu(dst, src);
2596         shufps(dst, dst, eindex);
2597       } else {
2598         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2599       }
2600     } else {
2601       if (UseAVX == 0) {
2602         movdqu(dst, src);
2603         psrldq(dst, eindex*esize);
2604       } else {
2605         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2606       }
2607       movq(dst, dst);
2608     }
2609   }
2610   // Zero upper bits
2611   if (typ == T_FLOAT) {
2612     if (UseAVX == 0) {
2613       assert(vtmp != xnoreg, "required.");
2614       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2615       pand(dst, vtmp);
2616     } else {
2617       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2618     }
2619   }
2620 }
2621 
2622 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2623   switch(typ) {
2624     case T_BYTE:
2625     case T_BOOLEAN:
2626       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2627       break;
2628     case T_SHORT:
2629     case T_CHAR:
2630       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2631       break;
2632     case T_INT:
2633     case T_FLOAT:
2634       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2635       break;
2636     case T_LONG:
2637     case T_DOUBLE:
2638       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2639       break;
2640     default:
2641       assert(false,"Should not reach here.");
2642       break;
2643   }
2644 }
2645 
2646 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2647   assert(rscratch != noreg || always_reachable(src2), "missing");
2648 
2649   switch(typ) {
2650     case T_BOOLEAN:
2651     case T_BYTE:
2652       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2653       break;
2654     case T_CHAR:
2655     case T_SHORT:
2656       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2657       break;
2658     case T_INT:
2659     case T_FLOAT:
2660       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2661       break;
2662     case T_LONG:
2663     case T_DOUBLE:
2664       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2665       break;
2666     default:
2667       assert(false,"Should not reach here.");
2668       break;
2669   }
2670 }
2671 
2672 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2673   switch(typ) {
2674     case T_BYTE:
2675       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2676       break;
2677     case T_SHORT:
2678       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2679       break;
2680     case T_INT:
2681     case T_FLOAT:
2682       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2683       break;
2684     case T_LONG:
2685     case T_DOUBLE:
2686       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2687       break;
2688     default:
2689       assert(false,"Should not reach here.");
2690       break;
2691   }
2692 }
2693 
2694 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2695   assert(vlen_in_bytes <= 32, "");
2696   int esize = type2aelembytes(bt);
2697   if (vlen_in_bytes == 32) {
2698     assert(vtmp == xnoreg, "required.");
2699     if (esize >= 4) {
2700       vtestps(src1, src2, AVX_256bit);
2701     } else {
2702       vptest(src1, src2, AVX_256bit);
2703     }
2704     return;
2705   }
2706   if (vlen_in_bytes < 16) {
2707     // Duplicate the lower part to fill the whole register;
2708     // there is no need to do so for src2.
2709     assert(vtmp != xnoreg, "required");
2710     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2711     pshufd(vtmp, src1, shuffle_imm);
2712   } else {
2713     assert(vtmp == xnoreg, "required");
2714     vtmp = src1;
2715   }
2716   if (esize >= 4 && VM_Version::supports_avx()) {
2717     vtestps(vtmp, src2, AVX_128bit);
2718   } else {
2719     ptest(vtmp, src2);
2720   }
2721 }
2722 
2723 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2724 #ifdef ASSERT
2725   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2726   bool is_bw_supported = VM_Version::supports_avx512bw();
2727   if (is_bw && !is_bw_supported) {
2728     assert(vlen_enc != Assembler::AVX_512bit, "required");
2729     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2730            "XMM register should be 0-15");
2731   }
2732 #endif // ASSERT
2733   switch (elem_bt) {
2734     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2735     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2736     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2737     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2738     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2739     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2740     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2741   }
2742 }
2743 
2744 #ifdef _LP64
2745 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2746   assert(UseAVX >= 2, "required");
2747   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2748   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2749   if ((UseAVX > 2) &&
2750       (!is_bw || VM_Version::supports_avx512bw()) &&
2751       (!is_vl || VM_Version::supports_avx512vl())) {
2752     switch (elem_bt) {
2753       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2754       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2755       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2756       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2757       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2758     }
2759   } else {
2760     assert(vlen_enc != Assembler::AVX_512bit, "required");
2761     assert((dst->encoding() < 16),"XMM register should be 0-15");
2762     switch (elem_bt) {
2763       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2764       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2765       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2766       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2767       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2768       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2769       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2770     }
2771   }
2772 }
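// Editorial note: the else branch above is the fallback for targets without
// AVX-512BW/VL. There is no GPR-to-vector broadcast form there, so the scalar
// is first moved into the XMM register with movdl/movdq and then broadcast from
// it, which also restricts dst to registers 0-15 encodable without EVEX.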
2773 #endif
2774 
2775 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2776   switch (to_elem_bt) {
2777     case T_SHORT:
2778       vpmovsxbw(dst, src, vlen_enc);
2779       break;
2780     case T_INT:
2781       vpmovsxbd(dst, src, vlen_enc);
2782       break;
2783     case T_FLOAT:
2784       vpmovsxbd(dst, src, vlen_enc);
2785       vcvtdq2ps(dst, dst, vlen_enc);
2786       break;
2787     case T_LONG:
2788       vpmovsxbq(dst, src, vlen_enc);
2789       break;
2790     case T_DOUBLE: {
2791       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2792       vpmovsxbd(dst, src, mid_vlen_enc);
2793       vcvtdq2pd(dst, dst, vlen_enc);
2794       break;
2795     }
2796     default:
2797       fatal("Unsupported type %s", type2name(to_elem_bt));
2798       break;
2799   }
2800 }
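// Editorial note: the T_DOUBLE case above uses a narrower intermediate vector
// because vcvtdq2pd reads a source half the width of its destination; the bytes
// are first sign-extended to ints at half of vlen_enc and then widened to
// doubles at the full vector length.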
2801 
2802 //-------------------------------------------------------------------------------------------
2803 
2804 // IndexOf for constant substrings with size >= 8 chars
2805 // which don't need to be loaded through the stack.
2806 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2807                                          Register cnt1, Register cnt2,
2808                                          int int_cnt2,  Register result,
2809                                          XMMRegister vec, Register tmp,
2810                                          int ae) {
2811   ShortBranchVerifier sbv(this);
2812   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2813   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2814 
2815   // This method uses the pcmpestri instruction with bound registers
2816   //   inputs:
2817   //     xmm - substring
2818   //     rax - substring length (elements count)
2819   //     mem - scanned string
2820   //     rdx - string length (elements count)
2821   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2822   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2823   //   outputs:
2824   //     rcx - matched index in string
2825   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2826   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2827   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2828   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2829   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2830 
2831   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2832         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2833         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2834 
2835   // Note, inline_string_indexOf() generates checks:
2836   // if (substr.count > string.count) return -1;
2837   // if (substr.count == 0) return 0;
2838   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2839 
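  // Editorial sketch of the flow below (assumption, for readability only):
  //
  //   while (cnt1 >= cnt2) {
  //     idx = pcmpestri(substr, string);           // scan 16 bytes for a head match
  //     if (no candidate)       { string += 16; continue; }
  //     if (int_cnt2 == stride) { return candidate; }  // substring fit in one vector
  //     verify the remaining substring chunks with pcmpestri;
  //     on a mismatch, restart the scan one element past the candidate;
  //   }
  //   return -1;
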
2840   // Load substring.
2841   if (ae == StrIntrinsicNode::UL) {
2842     pmovzxbw(vec, Address(str2, 0));
2843   } else {
2844     movdqu(vec, Address(str2, 0));
2845   }
2846   movl(cnt2, int_cnt2);
2847   movptr(result, str1); // string addr
2848 
2849   if (int_cnt2 > stride) {
2850     jmpb(SCAN_TO_SUBSTR);
2851 
2852     // Reload substr for rescan; this code
2853     // is executed only for large substrings (> 8 chars)
2854     bind(RELOAD_SUBSTR);
2855     if (ae == StrIntrinsicNode::UL) {
2856       pmovzxbw(vec, Address(str2, 0));
2857     } else {
2858       movdqu(vec, Address(str2, 0));
2859     }
2860     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2861 
2862     bind(RELOAD_STR);
2863     // We came here after the beginning of the substring was
2864     // matched but the rest of it was not, so we need to search
2865     // again. Start from the next element after the previous match.
2866 
2867     // cnt2 is the number of remaining substring elements and
2868     // cnt1 is the number of remaining string elements when the compare failed.
2869     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2870     subl(cnt1, cnt2);
2871     addl(cnt1, int_cnt2);
2872     movl(cnt2, int_cnt2); // Now restore cnt2
2873 
2874     decrementl(cnt1);     // Shift to next element
2875     cmpl(cnt1, cnt2);
2876     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2877 
2878     addptr(result, (1<<scale1));
2879 
2880   } // (int_cnt2 > 8)
2881 
2882   // Scan string for start of substr in 16-byte vectors
2883   bind(SCAN_TO_SUBSTR);
2884   pcmpestri(vec, Address(result, 0), mode);
2885   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2886   subl(cnt1, stride);
2887   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2888   cmpl(cnt1, cnt2);
2889   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2890   addptr(result, 16);
2891   jmpb(SCAN_TO_SUBSTR);
2892 
2893   // Found a potential substr
2894   bind(FOUND_CANDIDATE);
2895   // Matched whole vector if first element matched (tmp(rcx) == 0).
2896   if (int_cnt2 == stride) {
2897     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2898   } else { // int_cnt2 > 8
2899     jccb(Assembler::overflow, FOUND_SUBSTR);
2900   }
2901   // After pcmpestri tmp(rcx) contains matched element index
2902   // Compute start addr of substr
2903   lea(result, Address(result, tmp, scale1));
2904 
2905   // Make sure string is still long enough
2906   subl(cnt1, tmp);
2907   cmpl(cnt1, cnt2);
2908   if (int_cnt2 == stride) {
2909     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2910   } else { // int_cnt2 > 8
2911     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2912   }
2913   // Left less than substring.
2914 
2915   bind(RET_NOT_FOUND);
2916   movl(result, -1);
2917   jmp(EXIT);
2918 
2919   if (int_cnt2 > stride) {
2920     // This code is optimized for the case when whole substring
2921     // is matched if its head is matched.
2922     bind(MATCH_SUBSTR_HEAD);
2923     pcmpestri(vec, Address(result, 0), mode);
2924     // Reload only the string if it does not match
2925     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2926 
2927     Label CONT_SCAN_SUBSTR;
2928     // Compare the rest of substring (> 8 chars).
2929     bind(FOUND_SUBSTR);
2930     // First 8 chars are already matched.
2931     negptr(cnt2);
2932     addptr(cnt2, stride);
2933 
2934     bind(SCAN_SUBSTR);
2935     subl(cnt1, stride);
2936     cmpl(cnt2, -stride); // Do not read beyond substring
2937     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2938     // Back-up strings to avoid reading beyond substring:
2939     // cnt1 = cnt1 - cnt2 + 8
2940     addl(cnt1, cnt2); // cnt2 is negative
2941     addl(cnt1, stride);
2942     movl(cnt2, stride); negptr(cnt2);
2943     bind(CONT_SCAN_SUBSTR);
2944     if (int_cnt2 < (int)G) {
2945       int tail_off1 = int_cnt2<<scale1;
2946       int tail_off2 = int_cnt2<<scale2;
2947       if (ae == StrIntrinsicNode::UL) {
2948         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2949       } else {
2950         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2951       }
2952       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2953     } else {
2954       // calculate index in register to avoid integer overflow (int_cnt2*2)
2955       movl(tmp, int_cnt2);
2956       addptr(tmp, cnt2);
2957       if (ae == StrIntrinsicNode::UL) {
2958         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2959       } else {
2960         movdqu(vec, Address(str2, tmp, scale2, 0));
2961       }
2962       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2963     }
2964     // Need to reload the string pointers if the whole vector did not match
2965     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2966     addptr(cnt2, stride);
2967     jcc(Assembler::negative, SCAN_SUBSTR);
2968     // Fall through if found full substring
2969 
2970   } // (int_cnt2 > 8)
2971 
2972   bind(RET_FOUND);
2973   // Found result if we matched full small substring.
2974   // Compute substr offset
2975   subptr(result, str1);
2976   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2977     shrl(result, 1); // index
2978   }
2979   bind(EXIT);
2980 
2981 } // string_indexofC8
2982 
2983 // Small strings are loaded through the stack if they cross a page boundary.
2984 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2985                                        Register cnt1, Register cnt2,
2986                                        int int_cnt2,  Register result,
2987                                        XMMRegister vec, Register tmp,
2988                                        int ae) {
2989   ShortBranchVerifier sbv(this);
2990   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2991   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2992 
2993   //
2994   // int_cnt2 is length of small (< 8 chars) constant substring
2995   // or (-1) for non constant substring in which case its length
2996   // is in cnt2 register.
2997   //
2998   // Note, inline_string_indexOf() generates checks:
2999   // if (substr.count > string.count) return -1;
3000   // if (substr.count == 0) return 0;
3001   //
3002   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3003   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3004   // This method uses the pcmpestri instruction with bound registers
3005   //   inputs:
3006   //     xmm - substring
3007   //     rax - substring length (elements count)
3008   //     mem - scanned string
3009   //     rdx - string length (elements count)
3010   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3011   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3012   //   outputs:
3013   //     rcx - matched index in string
3014   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3015   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3016   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3017   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3018 
3019   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3020         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3021         FOUND_CANDIDATE;
3022 
3023   { //========================================================
3024     // We don't know where these strings are located
3025     // and we can't read beyond them. Load them through the stack.
3026     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3027 
3028     movptr(tmp, rsp); // save old SP
3029 
3030     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3031       if (int_cnt2 == (1>>scale2)) { // One byte
3032         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3033         load_unsigned_byte(result, Address(str2, 0));
3034         movdl(vec, result); // move 32 bits
3035       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3036         // Not enough header space in 32-bit VM: 12+3 = 15.
3037         movl(result, Address(str2, -1));
3038         shrl(result, 8);
3039         movdl(vec, result); // move 32 bits
3040       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3041         load_unsigned_short(result, Address(str2, 0));
3042         movdl(vec, result); // move 32 bits
3043       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3044         movdl(vec, Address(str2, 0)); // move 32 bits
3045       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3046         movq(vec, Address(str2, 0));  // move 64 bits
3047       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3048         // Array header size is 12 bytes in 32-bit VM
3049         // + 6 bytes for 3 chars == 18 bytes,
3050         // enough space to load vec and shift.
3051         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3052         if (ae == StrIntrinsicNode::UL) {
3053           int tail_off = int_cnt2-8;
3054           pmovzxbw(vec, Address(str2, tail_off));
3055           psrldq(vec, -2*tail_off);
3056         }
3057         else {
3058           int tail_off = int_cnt2*(1<<scale2);
3059           movdqu(vec, Address(str2, tail_off-16));
3060           psrldq(vec, 16-tail_off);
3061         }
3062       }
3063     } else { // not constant substring
3064       cmpl(cnt2, stride);
3065       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3066 
3067       // We can read beyond the string if str2+16 does not cross a page boundary
3068       // since heaps are aligned and mapped by pages.
3069       assert(os::vm_page_size() < (int)G, "default page should be small");
3070       movl(result, str2); // We need only low 32 bits
3071       andl(result, ((int)os::vm_page_size()-1));
3072       cmpl(result, ((int)os::vm_page_size()-16));
3073       jccb(Assembler::belowEqual, CHECK_STR);
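      // Editorial note: the check above is equivalent to
      //   (str2 & (page_size - 1)) <= page_size - 16
      // i.e. the 16-byte load starting at str2 stays within its page, so it is
      // safe to read past the logical end of the string.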
3074 
3075       // Move small strings to the stack to allow loading 16 bytes into vec.
3076       subptr(rsp, 16);
3077       int stk_offset = wordSize-(1<<scale2);
3078       push(cnt2);
3079 
3080       bind(COPY_SUBSTR);
3081       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3082         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3083         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3084       } else if (ae == StrIntrinsicNode::UU) {
3085         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3086         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3087       }
3088       decrement(cnt2);
3089       jccb(Assembler::notZero, COPY_SUBSTR);
3090 
3091       pop(cnt2);
3092       movptr(str2, rsp);  // New substring address
3093     } // non constant
3094 
3095     bind(CHECK_STR);
3096     cmpl(cnt1, stride);
3097     jccb(Assembler::aboveEqual, BIG_STRINGS);
3098 
3099     // Check cross page boundary.
3100     movl(result, str1); // We need only low 32 bits
3101     andl(result, ((int)os::vm_page_size()-1));
3102     cmpl(result, ((int)os::vm_page_size()-16));
3103     jccb(Assembler::belowEqual, BIG_STRINGS);
3104 
3105     subptr(rsp, 16);
3106     int stk_offset = -(1<<scale1);
3107     if (int_cnt2 < 0) { // not constant
3108       push(cnt2);
3109       stk_offset += wordSize;
3110     }
3111     movl(cnt2, cnt1);
3112 
3113     bind(COPY_STR);
3114     if (ae == StrIntrinsicNode::LL) {
3115       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3116       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3117     } else {
3118       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3119       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3120     }
3121     decrement(cnt2);
3122     jccb(Assembler::notZero, COPY_STR);
3123 
3124     if (int_cnt2 < 0) { // not constant
3125       pop(cnt2);
3126     }
3127     movptr(str1, rsp);  // New string address
3128 
3129     bind(BIG_STRINGS);
3130     // Load substring.
3131     if (int_cnt2 < 0) { // -1
3132       if (ae == StrIntrinsicNode::UL) {
3133         pmovzxbw(vec, Address(str2, 0));
3134       } else {
3135         movdqu(vec, Address(str2, 0));
3136       }
3137       push(cnt2);       // substr count
3138       push(str2);       // substr addr
3139       push(str1);       // string addr
3140     } else {
3141       // Small (< 8 chars) constant substrings are loaded already.
3142       movl(cnt2, int_cnt2);
3143     }
3144     push(tmp);  // original SP
3145 
3146   } // Finished loading
3147 
3148   //========================================================
3149   // Start search
3150   //
3151 
3152   movptr(result, str1); // string addr
3153 
3154   if (int_cnt2  < 0) {  // Only for non constant substring
3155     jmpb(SCAN_TO_SUBSTR);
3156 
3157     // SP saved at sp+0
3158     // String saved at sp+1*wordSize
3159     // Substr saved at sp+2*wordSize
3160     // Substr count saved at sp+3*wordSize
3161 
3162     // Reload substr for rescan; this code
3163     // is executed only for large substrings (> 8 chars)
3164     bind(RELOAD_SUBSTR);
3165     movptr(str2, Address(rsp, 2*wordSize));
3166     movl(cnt2, Address(rsp, 3*wordSize));
3167     if (ae == StrIntrinsicNode::UL) {
3168       pmovzxbw(vec, Address(str2, 0));
3169     } else {
3170       movdqu(vec, Address(str2, 0));
3171     }
3172     // We came here after the beginning of the substring was
3173     // matched but the rest of it was not, so we need to search
3174     // again. Start from the next element after the previous match.
3175     subptr(str1, result); // Restore counter
3176     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3177       shrl(str1, 1);
3178     }
3179     addl(cnt1, str1);
3180     decrementl(cnt1);   // Shift to next element
3181     cmpl(cnt1, cnt2);
3182     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3183 
3184     addptr(result, (1<<scale1));
3185   } // non constant
3186 
3187   // Scan string for start of substr in 16-byte vectors
3188   bind(SCAN_TO_SUBSTR);
3189   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3190   pcmpestri(vec, Address(result, 0), mode);
3191   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3192   subl(cnt1, stride);
3193   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3194   cmpl(cnt1, cnt2);
3195   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3196   addptr(result, 16);
3197 
3198   bind(ADJUST_STR);
3199   cmpl(cnt1, stride); // Do not read beyond string
3200   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3201   // Back-up string to avoid reading beyond string.
3202   lea(result, Address(result, cnt1, scale1, -16));
3203   movl(cnt1, stride);
3204   jmpb(SCAN_TO_SUBSTR);
3205 
3206   // Found a potential substr
3207   bind(FOUND_CANDIDATE);
3208   // After pcmpestri tmp(rcx) contains matched element index
3209 
3210   // Make sure string is still long enough
3211   subl(cnt1, tmp);
3212   cmpl(cnt1, cnt2);
3213   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3214   // Left less than substring.
3215 
3216   bind(RET_NOT_FOUND);
3217   movl(result, -1);
3218   jmp(CLEANUP);
3219 
3220   bind(FOUND_SUBSTR);
3221   // Compute start addr of substr
3222   lea(result, Address(result, tmp, scale1));
3223   if (int_cnt2 > 0) { // Constant substring
3224     // Repeat search for small substring (< 8 chars)
3225     // from new point without reloading substring.
3226     // Have to check that we don't read beyond string.
3227     cmpl(tmp, stride-int_cnt2);
3228     jccb(Assembler::greater, ADJUST_STR);
3229     // Fall through if matched whole substring.
3230   } else { // non constant
3231     assert(int_cnt2 == -1, "should be != 0");
3232 
3233     addl(tmp, cnt2);
3234     // Found result if we matched whole substring.
3235     cmpl(tmp, stride);
3236     jcc(Assembler::lessEqual, RET_FOUND);
3237 
3238     // Repeat search for small substring (<= 8 chars)
3239     // from new point 'str1' without reloading substring.
3240     cmpl(cnt2, stride);
3241     // Have to check that we don't read beyond string.
3242     jccb(Assembler::lessEqual, ADJUST_STR);
3243 
3244     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3245     // Compare the rest of substring (> 8 chars).
3246     movptr(str1, result);
3247 
3248     cmpl(tmp, cnt2);
3249     // First 8 chars are already matched.
3250     jccb(Assembler::equal, CHECK_NEXT);
3251 
3252     bind(SCAN_SUBSTR);
3253     pcmpestri(vec, Address(str1, 0), mode);
3254     // Need to reload the string pointers if the whole vector did not match
3255     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3256 
3257     bind(CHECK_NEXT);
3258     subl(cnt2, stride);
3259     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3260     addptr(str1, 16);
3261     if (ae == StrIntrinsicNode::UL) {
3262       addptr(str2, 8);
3263     } else {
3264       addptr(str2, 16);
3265     }
3266     subl(cnt1, stride);
3267     cmpl(cnt2, stride); // Do not read beyond substring
3268     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3269     // Back-up strings to avoid reading beyond substring.
3270 
3271     if (ae == StrIntrinsicNode::UL) {
3272       lea(str2, Address(str2, cnt2, scale2, -8));
3273       lea(str1, Address(str1, cnt2, scale1, -16));
3274     } else {
3275       lea(str2, Address(str2, cnt2, scale2, -16));
3276       lea(str1, Address(str1, cnt2, scale1, -16));
3277     }
3278     subl(cnt1, cnt2);
3279     movl(cnt2, stride);
3280     addl(cnt1, stride);
3281     bind(CONT_SCAN_SUBSTR);
3282     if (ae == StrIntrinsicNode::UL) {
3283       pmovzxbw(vec, Address(str2, 0));
3284     } else {
3285       movdqu(vec, Address(str2, 0));
3286     }
3287     jmp(SCAN_SUBSTR);
3288 
3289     bind(RET_FOUND_LONG);
3290     movptr(str1, Address(rsp, wordSize));
3291   } // non constant
3292 
3293   bind(RET_FOUND);
3294   // Compute substr offset
3295   subptr(result, str1);
3296   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3297     shrl(result, 1); // index
3298   }
3299   bind(CLEANUP);
3300   pop(rsp); // restore SP
3301 
3302 } // string_indexof
3303 
3304 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3305                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3306   ShortBranchVerifier sbv(this);
3307   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3308 
3309   int stride = 8;
3310 
3311   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3312         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3313         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3314         FOUND_SEQ_CHAR, DONE_LABEL;
3315 
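  // Editorial sketch (assumption): the search degrades with the remaining length:
  //   16-char AVX2 loop (vpbroadcastw + vpcmpeqw + vptest)
  //     -> 8-char SSE loop (pcmpeqw + ptest)
  //       -> scalar tail loop (load_unsigned_short + cmpl).
  // FOUND_CHAR turns the vector compare result into a byte offset via
  // [v]pmovmskb + bsf; FOUND_SEQ_CHAR already has the exact address in result.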
3316   movptr(result, str1);
3317   if (UseAVX >= 2) {
3318     cmpl(cnt1, stride);
3319     jcc(Assembler::less, SCAN_TO_CHAR);
3320     cmpl(cnt1, 2*stride);
3321     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3322     movdl(vec1, ch);
3323     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3324     vpxor(vec2, vec2);
3325     movl(tmp, cnt1);
3326     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3327     andl(cnt1,0x0000000F);  //tail count (in chars)
3328 
3329     bind(SCAN_TO_16_CHAR_LOOP);
3330     vmovdqu(vec3, Address(result, 0));
3331     vpcmpeqw(vec3, vec3, vec1, 1);
3332     vptest(vec2, vec3);
3333     jcc(Assembler::carryClear, FOUND_CHAR);
3334     addptr(result, 32);
3335     subl(tmp, 2*stride);
3336     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3337     jmp(SCAN_TO_8_CHAR);
3338     bind(SCAN_TO_8_CHAR_INIT);
3339     movdl(vec1, ch);
3340     pshuflw(vec1, vec1, 0x00);
3341     pshufd(vec1, vec1, 0);
3342     pxor(vec2, vec2);
3343   }
3344   bind(SCAN_TO_8_CHAR);
3345   cmpl(cnt1, stride);
3346   jcc(Assembler::less, SCAN_TO_CHAR);
3347   if (UseAVX < 2) {
3348     movdl(vec1, ch);
3349     pshuflw(vec1, vec1, 0x00);
3350     pshufd(vec1, vec1, 0);
3351     pxor(vec2, vec2);
3352   }
3353   movl(tmp, cnt1);
3354   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3355   andl(cnt1,0x00000007);  //tail count (in chars)
3356 
3357   bind(SCAN_TO_8_CHAR_LOOP);
3358   movdqu(vec3, Address(result, 0));
3359   pcmpeqw(vec3, vec1);
3360   ptest(vec2, vec3);
3361   jcc(Assembler::carryClear, FOUND_CHAR);
3362   addptr(result, 16);
3363   subl(tmp, stride);
3364   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3365   bind(SCAN_TO_CHAR);
3366   testl(cnt1, cnt1);
3367   jcc(Assembler::zero, RET_NOT_FOUND);
3368   bind(SCAN_TO_CHAR_LOOP);
3369   load_unsigned_short(tmp, Address(result, 0));
3370   cmpl(ch, tmp);
3371   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3372   addptr(result, 2);
3373   subl(cnt1, 1);
3374   jccb(Assembler::zero, RET_NOT_FOUND);
3375   jmp(SCAN_TO_CHAR_LOOP);
3376 
3377   bind(RET_NOT_FOUND);
3378   movl(result, -1);
3379   jmpb(DONE_LABEL);
3380 
3381   bind(FOUND_CHAR);
3382   if (UseAVX >= 2) {
3383     vpmovmskb(tmp, vec3);
3384   } else {
3385     pmovmskb(tmp, vec3);
3386   }
3387   bsfl(ch, tmp);
3388   addptr(result, ch);
3389 
3390   bind(FOUND_SEQ_CHAR);
3391   subptr(result, str1);
3392   shrl(result, 1);
3393 
3394   bind(DONE_LABEL);
3395 } // string_indexof_char
3396 
3397 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3398                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3399   ShortBranchVerifier sbv(this);
3400   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3401 
3402   int stride = 16;
3403 
3404   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3405         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3406         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3407         FOUND_SEQ_CHAR, DONE_LABEL;
3408 
3409   movptr(result, str1);
3410   if (UseAVX >= 2) {
3411     cmpl(cnt1, stride);
3412     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3413     cmpl(cnt1, stride*2);
3414     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3415     movdl(vec1, ch);
3416     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3417     vpxor(vec2, vec2);
3418     movl(tmp, cnt1);
3419     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3420     andl(cnt1,0x0000001F);  //tail count (in chars)
3421 
3422     bind(SCAN_TO_32_CHAR_LOOP);
3423     vmovdqu(vec3, Address(result, 0));
3424     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3425     vptest(vec2, vec3);
3426     jcc(Assembler::carryClear, FOUND_CHAR);
3427     addptr(result, 32);
3428     subl(tmp, stride*2);
3429     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3430     jmp(SCAN_TO_16_CHAR);
3431 
3432     bind(SCAN_TO_16_CHAR_INIT);
3433     movdl(vec1, ch);
3434     pxor(vec2, vec2);
3435     pshufb(vec1, vec2);
3436   }
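  // Editorial note: pshufb with an all-zero index vector (vec2) replicates the
  // low byte of vec1 into every lane, i.e. a byte broadcast without vpbroadcastb.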
3437 
3438   bind(SCAN_TO_16_CHAR);
3439   cmpl(cnt1, stride);
3440   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3441   if (UseAVX < 2) {
3442     movdl(vec1, ch);
3443     pxor(vec2, vec2);
3444     pshufb(vec1, vec2);
3445   }
3446   movl(tmp, cnt1);
3447   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3448   andl(cnt1,0x0000000F);  //tail count (in bytes)
3449 
3450   bind(SCAN_TO_16_CHAR_LOOP);
3451   movdqu(vec3, Address(result, 0));
3452   pcmpeqb(vec3, vec1);
3453   ptest(vec2, vec3);
3454   jcc(Assembler::carryClear, FOUND_CHAR);
3455   addptr(result, 16);
3456   subl(tmp, stride);
3457   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3458 
3459   bind(SCAN_TO_CHAR_INIT);
3460   testl(cnt1, cnt1);
3461   jcc(Assembler::zero, RET_NOT_FOUND);
3462   bind(SCAN_TO_CHAR_LOOP);
3463   load_unsigned_byte(tmp, Address(result, 0));
3464   cmpl(ch, tmp);
3465   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3466   addptr(result, 1);
3467   subl(cnt1, 1);
3468   jccb(Assembler::zero, RET_NOT_FOUND);
3469   jmp(SCAN_TO_CHAR_LOOP);
3470 
3471   bind(RET_NOT_FOUND);
3472   movl(result, -1);
3473   jmpb(DONE_LABEL);
3474 
3475   bind(FOUND_CHAR);
3476   if (UseAVX >= 2) {
3477     vpmovmskb(tmp, vec3);
3478   } else {
3479     pmovmskb(tmp, vec3);
3480   }
3481   bsfl(ch, tmp);
3482   addptr(result, ch);
3483 
3484   bind(FOUND_SEQ_CHAR);
3485   subptr(result, str1);
3486 
3487   bind(DONE_LABEL);
3488 } // stringL_indexof_char
3489 
3490 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3491   switch (eltype) {
3492   case T_BOOLEAN: return sizeof(jboolean);
3493   case T_BYTE:  return sizeof(jbyte);
3494   case T_SHORT: return sizeof(jshort);
3495   case T_CHAR:  return sizeof(jchar);
3496   case T_INT:   return sizeof(jint);
3497   default:
3498     ShouldNotReachHere();
3499     return -1;
3500   }
3501 }
3502 
3503 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3504   switch (eltype) {
3505   // T_BOOLEAN is used as a surrogate for unsigned byte
3506   case T_BOOLEAN: movzbl(dst, src);   break;
3507   case T_BYTE:    movsbl(dst, src);   break;
3508   case T_SHORT:   movswl(dst, src);   break;
3509   case T_CHAR:    movzwl(dst, src);   break;
3510   case T_INT:     movl(dst, src);     break;
3511   default:
3512     ShouldNotReachHere();
3513   }
3514 }
3515 
3516 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3517   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3518 }
3519 
3520 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3521   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3522 }
3523 
3524 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3525   const int vlen = Assembler::AVX_256bit;
3526   switch (eltype) {
3527   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3528   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3529   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3530   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3531   case T_INT:
3532     // do nothing
3533     break;
3534   default:
3535     ShouldNotReachHere();
3536   }
3537 }
3538 
3539 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3540                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3541                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3542                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3543                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3544                                         BasicType eltype) {
3545   ShortBranchVerifier sbv(this);
3546   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3547   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3548   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3549 
3550   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3551         SHORT_UNROLLED_LOOP_EXIT,
3552         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3553         UNROLLED_VECTOR_LOOP_BEGIN,
3554         END;
3555   switch (eltype) {
3556   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3557   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3558   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3559   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3560   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3561   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3562   }
3563 
3564   // For "renaming" for readibility of the code
3565   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3566                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3567                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3568 
3569   const int elsize = arrays_hashcode_elsize(eltype);
3570 
3571   /*
3572     if (cnt1 >= 2) {
3573       if (cnt1 >= 32) {
3574         UNROLLED VECTOR LOOP
3575       }
3576       UNROLLED SCALAR LOOP
3577     }
3578     SINGLE SCALAR
3579    */
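  // Editorial note (assumption, mirroring the Java hashCode semantics): the
  // value being computed is
  //
  //   result = result*31^cnt1 + a[0]*31^(cnt1-1) + ... + a[cnt1-1]
  //
  // The vector loop consumes 32 elements per iteration: result is multiplied by
  // 31^32 (the first entry of the backwards powers-of-31 table), each of the 32
  // lanes accumulates "lane = lane*31^32 + element", and after the loop every
  // lane is scaled by its matching power of 31 (vcoef) and folded into result
  // with an add reduction.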
3580 
3581   cmpl(cnt1, 32);
3582   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3583 
3584   // cnt1 >= 32 && generate_vectorized_loop
3585   xorl(index, index);
3586 
3587   // vresult = IntVector.zero(I256);
3588   for (int idx = 0; idx < 4; idx++) {
3589     vpxor(vresult[idx], vresult[idx]);
3590   }
3591   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3592   Register bound = tmp2;
3593   Register next = tmp3;
3594   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3595   movl(next, Address(tmp2, 0));
3596   movdl(vnext, next);
3597   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3598 
3599   // index = 0;
3600   // bound = cnt1 & ~(32 - 1);
3601   movl(bound, cnt1);
3602   andl(bound, ~(32 - 1));
3603   // for (; index < bound; index += 32) {
3604   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3605   // result *= next;
3606   imull(result, next);
3607   // Loop fission to front-load the cost of fetching from memory; out-of-order
3608   // execution can then hopefully do a better job of prefetching
3609   for (int idx = 0; idx < 4; idx++) {
3610     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3611   }
3612   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3613   for (int idx = 0; idx < 4; idx++) {
3614     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3615     arrays_hashcode_elvcast(vtmp[idx], eltype);
3616     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3617   }
3618   // index += 32;
3619   addl(index, 32);
3620   // index < bound;
3621   cmpl(index, bound);
3622   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3623   // }
3624 
3625   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3626   subl(cnt1, bound);
3627   // release bound
3628 
3629   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3630   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3631   for (int idx = 0; idx < 4; idx++) {
3632     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT);
3633     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3634   }
3635   // result += vresult.reduceLanes(ADD);
3636   for (int idx = 0; idx < 4; idx++) {
3637     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3638   }
3639 
3640   // } else if (cnt1 < 32) {
3641 
3642   bind(SHORT_UNROLLED_BEGIN);
3643   // int i = 1;
3644   movl(index, 1);
3645   cmpl(index, cnt1);
3646   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3647 
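  // Editorial note: 961 == 31*31 and (x << 5) - x == 31*x, so each unrolled
  // iteration computes result = 31*31*result + 31*a[i-1] + a[i].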
3648   // for (; i < cnt1 ; i += 2) {
3649   bind(SHORT_UNROLLED_LOOP_BEGIN);
3650   movl(tmp3, 961);
3651   imull(result, tmp3);
3652   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3653   movl(tmp3, tmp2);
3654   shll(tmp3, 5);
3655   subl(tmp3, tmp2);
3656   addl(result, tmp3);
3657   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3658   addl(result, tmp3);
3659   addl(index, 2);
3660   cmpl(index, cnt1);
3661   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3662 
3663   // }
3664   // if (i >= cnt1) {
3665   bind(SHORT_UNROLLED_LOOP_EXIT);
3666   jccb(Assembler::greater, END);
3667   movl(tmp2, result);
3668   shll(result, 5);
3669   subl(result, tmp2);
3670   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3671   addl(result, tmp3);
3672   // }
3673   bind(END);
3674 
3675   BLOCK_COMMENT("} // arrays_hashcode");
3676 
3677 } // arrays_hashcode
3678 
3679 // helper function for string_compare
3680 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3681                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3682                                            Address::ScaleFactor scale2, Register index, int ae) {
3683   if (ae == StrIntrinsicNode::LL) {
3684     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3685     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3686   } else if (ae == StrIntrinsicNode::UU) {
3687     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3688     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3689   } else {
3690     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3691     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3692   }
3693 }
3694 
3695 // Compare strings, used for char[] and byte[].
3696 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3697                                        Register cnt1, Register cnt2, Register result,
3698                                        XMMRegister vec1, int ae, KRegister mask) {
3699   ShortBranchVerifier sbv(this);
3700   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3701   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3702   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3703   int stride2x2 = 0x40;
3704   Address::ScaleFactor scale = Address::no_scale;
3705   Address::ScaleFactor scale1 = Address::no_scale;
3706   Address::ScaleFactor scale2 = Address::no_scale;
3707 
3708   if (ae != StrIntrinsicNode::LL) {
3709     stride2x2 = 0x20;
3710   }
3711 
3712   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3713     shrl(cnt2, 1);
3714   }
3715   // Compute the minimum of the string lengths and the
3716   // difference of the string lengths (stack).
3717   // Select the minimum with a conditional move.
3718   movl(result, cnt1);
3719   subl(cnt1, cnt2);
3720   push(cnt1);
3721   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3722 
3723   // Is the minimum length zero?
3724   testl(cnt2, cnt2);
3725   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3726   if (ae == StrIntrinsicNode::LL) {
3727     // Load first bytes
3728     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3729     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3730   } else if (ae == StrIntrinsicNode::UU) {
3731     // Load first characters
3732     load_unsigned_short(result, Address(str1, 0));
3733     load_unsigned_short(cnt1, Address(str2, 0));
3734   } else {
3735     load_unsigned_byte(result, Address(str1, 0));
3736     load_unsigned_short(cnt1, Address(str2, 0));
3737   }
3738   subl(result, cnt1);
3739   jcc(Assembler::notZero,  POP_LABEL);
3740 
3741   if (ae == StrIntrinsicNode::UU) {
3742     // Divide length by 2 to get number of chars
3743     shrl(cnt2, 1);
3744   }
3745   cmpl(cnt2, 1);
3746   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3747 
3748   // Check if the strings start at the same location and set up scale and stride
3749   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3750     cmpptr(str1, str2);
3751     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3752     if (ae == StrIntrinsicNode::LL) {
3753       scale = Address::times_1;
3754       stride = 16;
3755     } else {
3756       scale = Address::times_2;
3757       stride = 8;
3758     }
3759   } else {
3760     scale1 = Address::times_1;
3761     scale2 = Address::times_2;
3762     // scale not used
3763     stride = 8;
3764   }
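  // Editorial sketch (assumption) of the overall comparison:
  //   1. quick outs: zero minimum length, differing first element, and (for the
  //      same-encoding cases) identical array pointers;
  //   2. a wide-vector loop: 64-byte AVX-512, 32-byte AVX2, or 16-byte SSE4.2;
  //   3. a scalar loop (WHILE_HEAD_LABEL) for the remaining tail;
  //   4. on a mismatch return element1 - element2, otherwise the length difference.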
3765 
3766   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3767     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3768     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3769     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3770     Label COMPARE_TAIL_LONG;
3771     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3772 
3773     int pcmpmask = 0x19;
3774     if (ae == StrIntrinsicNode::LL) {
3775       pcmpmask &= ~0x01;
3776     }
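    // Editorial note: 0x19 == 0b011001 selects the "equal each" (string compare)
    // aggregation with negated polarity over unsigned words; clearing bit 0
    // switches the source data format to unsigned bytes for the LL case.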
3777 
3778     // Set up to compare 16-char (32-byte) vectors,
3779     // start from first character again because it has aligned address.
3780     if (ae == StrIntrinsicNode::LL) {
3781       stride2 = 32;
3782     } else {
3783       stride2 = 16;
3784     }
3785     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3786       adr_stride = stride << scale;
3787     } else {
3788       adr_stride1 = 8;  //stride << scale1;
3789       adr_stride2 = 16; //stride << scale2;
3790     }
3791 
3792     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3793     // rax and rdx are used by pcmpestri as elements counters
3794     movl(result, cnt2);
3795     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3796     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3797 
3798     // Fast path: compare the first two 8-char vectors.
3799     bind(COMPARE_16_CHARS);
3800     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3801       movdqu(vec1, Address(str1, 0));
3802     } else {
3803       pmovzxbw(vec1, Address(str1, 0));
3804     }
3805     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3806     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3807 
3808     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3809       movdqu(vec1, Address(str1, adr_stride));
3810       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3811     } else {
3812       pmovzxbw(vec1, Address(str1, adr_stride1));
3813       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3814     }
3815     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3816     addl(cnt1, stride);
3817 
3818     // Compare the characters at index in cnt1
3819     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3820     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3821     subl(result, cnt2);
3822     jmp(POP_LABEL);
3823 
3824     // Setup the registers to start vector comparison loop
3825     bind(COMPARE_WIDE_VECTORS);
3826     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3827       lea(str1, Address(str1, result, scale));
3828       lea(str2, Address(str2, result, scale));
3829     } else {
3830       lea(str1, Address(str1, result, scale1));
3831       lea(str2, Address(str2, result, scale2));
3832     }
3833     subl(result, stride2);
3834     subl(cnt2, stride2);
3835     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3836     negptr(result);
3837 
3838     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3839     bind(COMPARE_WIDE_VECTORS_LOOP);
3840 
3841 #ifdef _LP64
3842     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3843       cmpl(cnt2, stride2x2);
3844       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3845       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3846       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3847 
3848       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3849       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3850         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3851         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
3852       } else {
3853         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3854         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
3855       }
3856       kortestql(mask, mask);
3857       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3858       addptr(result, stride2x2);  // update since we already compared at this addr
3859       subl(cnt2, stride2x2);      // and sub the size too
3860       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3861 
3862       vpxor(vec1, vec1);
3863       jmpb(COMPARE_WIDE_TAIL);
3864     }//if (VM_Version::supports_avx512vlbw())
3865 #endif // _LP64
3866 
3867 
3868     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3869     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870       vmovdqu(vec1, Address(str1, result, scale));
3871       vpxor(vec1, Address(str2, result, scale));
3872     } else {
3873       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3874       vpxor(vec1, Address(str2, result, scale2));
3875     }
3876     vptest(vec1, vec1);
3877     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3878     addptr(result, stride2);
3879     subl(cnt2, stride2);
3880     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3881     // clean upper bits of YMM registers
3882     vpxor(vec1, vec1);
3883 
3884     // compare wide vectors tail
3885     bind(COMPARE_WIDE_TAIL);
3886     testptr(result, result);
3887     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3888 
3889     movl(result, stride2);
3890     movl(cnt2, result);
3891     negptr(result);
3892     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3893 
3894     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3895     bind(VECTOR_NOT_EQUAL);
3896     // clean upper bits of YMM registers
3897     vpxor(vec1, vec1);
3898     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3899       lea(str1, Address(str1, result, scale));
3900       lea(str2, Address(str2, result, scale));
3901     } else {
3902       lea(str1, Address(str1, result, scale1));
3903       lea(str2, Address(str2, result, scale2));
3904     }
3905     jmp(COMPARE_16_CHARS);
3906 
3907     // Compare tail chars, length between 1 and 15 chars
3908     bind(COMPARE_TAIL_LONG);
3909     movl(cnt2, result);
3910     cmpl(cnt2, stride);
3911     jcc(Assembler::less, COMPARE_SMALL_STR);
3912 
3913     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3914       movdqu(vec1, Address(str1, 0));
3915     } else {
3916       pmovzxbw(vec1, Address(str1, 0));
3917     }
3918     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3919     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3920     subptr(cnt2, stride);
3921     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3922     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3923       lea(str1, Address(str1, result, scale));
3924       lea(str2, Address(str2, result, scale));
3925     } else {
3926       lea(str1, Address(str1, result, scale1));
3927       lea(str2, Address(str2, result, scale2));
3928     }
3929     negptr(cnt2);
3930     jmpb(WHILE_HEAD_LABEL);
3931 
3932     bind(COMPARE_SMALL_STR);
3933   } else if (UseSSE42Intrinsics) {
3934     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3935     int pcmpmask = 0x19;
3936     // Set up to compare 8-char (16-byte) vectors,
3937     // start from first character again because it has aligned address.
3938     movl(result, cnt2);
3939     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3940     if (ae == StrIntrinsicNode::LL) {
3941       pcmpmask &= ~0x01;
3942     }
3943     jcc(Assembler::zero, COMPARE_TAIL);
3944     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3945       lea(str1, Address(str1, result, scale));
3946       lea(str2, Address(str2, result, scale));
3947     } else {
3948       lea(str1, Address(str1, result, scale1));
3949       lea(str2, Address(str2, result, scale2));
3950     }
3951     negptr(result);
3952 
3953     // pcmpestri
3954     //   inputs:
3955     //     vec1- substring
3956     //     rax - negative string length (elements count)
3957     //     mem - scanned string
3958     //     rdx - string length (elements count)
3959     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3960     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3961     //   outputs:
3962     //     rcx - first mismatched element index
3963     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3964 
3965     bind(COMPARE_WIDE_VECTORS);
3966     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3967       movdqu(vec1, Address(str1, result, scale));
3968       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3969     } else {
3970       pmovzxbw(vec1, Address(str1, result, scale1));
3971       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3972     }
3973     // After pcmpestri cnt1(rcx) contains mismatched element index
3974 
3975     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3976     addptr(result, stride);
3977     subptr(cnt2, stride);
3978     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3979 
3980     // compare wide vectors tail
3981     testptr(result, result);
3982     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3983 
3984     movl(cnt2, stride);
3985     movl(result, stride);
3986     negptr(result);
3987     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3988       movdqu(vec1, Address(str1, result, scale));
3989       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3990     } else {
3991       pmovzxbw(vec1, Address(str1, result, scale1));
3992       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3993     }
3994     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3995 
3996     // Mismatched characters in the vectors
3997     bind(VECTOR_NOT_EQUAL);
3998     addptr(cnt1, result);
3999     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4000     subl(result, cnt2);
4001     jmpb(POP_LABEL);
4002 
4003     bind(COMPARE_TAIL); // limit is zero
4004     movl(cnt2, result);
4005     // Fallthru to tail compare
4006   }
4007   // Shift str2 and str1 to the end of the arrays, negate min
4008   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4009     lea(str1, Address(str1, cnt2, scale));
4010     lea(str2, Address(str2, cnt2, scale));
4011   } else {
4012     lea(str1, Address(str1, cnt2, scale1));
4013     lea(str2, Address(str2, cnt2, scale2));
4014   }
4015   decrementl(cnt2);  // first character was compared already
4016   negptr(cnt2);
4017 
4018   // Compare the rest of the elements
4019   bind(WHILE_HEAD_LABEL);
4020   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4021   subl(result, cnt1);
4022   jccb(Assembler::notZero, POP_LABEL);
4023   increment(cnt2);
4024   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4025 
4026   // Strings are equal up to min length.  Return the length difference.
4027   bind(LENGTH_DIFF_LABEL);
4028   pop(result);
4029   if (ae == StrIntrinsicNode::UU) {
4030     // Divide diff by 2 to get number of chars
4031     sarl(result, 1);
4032   }
4033   jmpb(DONE_LABEL);
4034 
4035 #ifdef _LP64
4036   if (VM_Version::supports_avx512vlbw()) {
4037 
4038     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4039 
4040     kmovql(cnt1, mask);
4041     notq(cnt1);
4042     bsfq(cnt2, cnt1);
4043     if (ae != StrIntrinsicNode::LL) {
4044       // Divide diff by 2 to get number of chars
4045       sarl(cnt2, 1);
4046     }
4047     addq(result, cnt2);
4048     if (ae == StrIntrinsicNode::LL) {
4049       load_unsigned_byte(cnt1, Address(str2, result));
4050       load_unsigned_byte(result, Address(str1, result));
4051     } else if (ae == StrIntrinsicNode::UU) {
4052       load_unsigned_short(cnt1, Address(str2, result, scale));
4053       load_unsigned_short(result, Address(str1, result, scale));
4054     } else {
4055       load_unsigned_short(cnt1, Address(str2, result, scale2));
4056       load_unsigned_byte(result, Address(str1, result, scale1));
4057     }
4058     subl(result, cnt1);
4059     jmpb(POP_LABEL);
4060   }//if (VM_Version::supports_avx512vlbw())
4061 #endif // _LP64
4062 
4063   // Discard the stored length difference
4064   bind(POP_LABEL);
4065   pop(cnt1);
4066 
4067   // That's it
4068   bind(DONE_LABEL);
4069   if(ae == StrIntrinsicNode::UL) {
4070     negl(result);
4071   }
4072 
4073 }
4074 
4075 // Search for a non-ASCII character (negative byte value) in a byte array and
4076 // return the index of the first such character; otherwise return the length
4077 // of the array segment searched.
4078 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4079 //   @IntrinsicCandidate
4080 //   public static int countPositives(byte[] ba, int off, int len) {
4081 //     for (int i = off; i < off + len; i++) {
4082 //       if (ba[i] < 0) {
4083 //         return i - off;
4084 //       }
4085 //     }
4086 //     return len;
4087 //   }
4088 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4089   Register result, Register tmp1,
4090   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4091   // rsi: byte array
4092   // rcx: len
4093   // rax: result
4094   ShortBranchVerifier sbv(this);
4095   assert_different_registers(ary1, len, result, tmp1);
4096   assert_different_registers(vec1, vec2);
4097   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4098 
4099   movl(result, len); // copy
4100   // len == 0
4101   testl(len, len);
4102   jcc(Assembler::zero, DONE);
4103 
4104   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4105     VM_Version::supports_avx512vlbw() &&
4106     VM_Version::supports_bmi2()) {
4107 
4108     Label test_64_loop, test_tail, BREAK_LOOP;
4109     movl(tmp1, len);
4110     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4111 
4112     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4113     andl(len,  0xffffffc0); // vector count (in chars)
4114     jccb(Assembler::zero, test_tail);
4115 
4116     lea(ary1, Address(ary1, len, Address::times_1));
4117     negptr(len);
4118 
4119     bind(test_64_loop);
4120     // Check whether our 64 byte-sized elements contain negatives
4121     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4122     kortestql(mask1, mask1);
4123     jcc(Assembler::notZero, BREAK_LOOP);
4124 
4125     addptr(len, 64);
4126     jccb(Assembler::notZero, test_64_loop);
4127 
4128     bind(test_tail);
4129     // bail out when there is nothing to be done
4130     testl(tmp1, -1);
4131     jcc(Assembler::zero, DONE);
4132 
4133 
4134     // check the tail for the absence of negatives
4135     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4136 #ifdef _LP64
4137     {
4138       Register tmp3_aliased = len;
4139       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4140       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4141       notq(tmp3_aliased);
4142       kmovql(mask2, tmp3_aliased);
4143     }
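    // Editorial note: the 64-bit path above builds a k-mask with the low tmp1
    // bits set, ~(~0 << tmp1), so the masked evpcmpgtb below only inspects the
    // tail bytes that actually belong to the array.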
4144 #else
4145     Label k_init;
4146     jmp(k_init);
4147 
4148     // We cannot read 64 bits from a general purpose register, so we move the
4149     // data required to compose 64 1's into the instruction stream.
4150     // We emit a 64-byte-wide series of elements 0..63 which is later used as
4151     // the compare target against the tail count held in the tmp1 register.
4152     // The result is a k register with tmp1 consecutive 1 bits, counting from
4153     // the least significant bit.
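         // Illustration (hypothetical value, assumes tmp1 == 5): broadcasting 5
         // and comparing it greater-than against the byte series {0, 1, ..., 63}
         // below sets exactly the five least significant mask bits, i.e. 0x1F,
         // which is the same value the 64-bit path computes as ~(~0 << 5).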
4154     address tmp = pc();
4155     emit_int64(0x0706050403020100);
4156     emit_int64(0x0F0E0D0C0B0A0908);
4157     emit_int64(0x1716151413121110);
4158     emit_int64(0x1F1E1D1C1B1A1918);
4159     emit_int64(0x2726252423222120);
4160     emit_int64(0x2F2E2D2C2B2A2928);
4161     emit_int64(0x3736353433323130);
4162     emit_int64(0x3F3E3D3C3B3A3938);
4163 
4164     bind(k_init);
4165     lea(len, InternalAddress(tmp));
4166     // create mask to test for negative byte inside a vector
4167     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4168     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4169 
4170 #endif
4171     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4172     ktestq(mask1, mask2);
4173     jcc(Assembler::zero, DONE);
4174 
4175     // do a full check for negative registers in the tail
4176     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4177                      // ary1 already pointing to the right place
4178     jmpb(TAIL_START);
4179 
4180     bind(BREAK_LOOP);
4181     // At least one byte in the last 64 byte block was negative.
4182     // Set up to look at the last 64 bytes as if they were a tail
4183     lea(ary1, Address(ary1, len, Address::times_1));
4184     addptr(result, len);
4185     // Ignore the very last byte: if all others are positive,
4186     // it must be negative, so we can skip right to the 2+1 byte
4187     // end comparison at this point
4188     orl(result, 63);
4189     movl(len, 63);
4190     // Fallthru to tail compare
4191   } else {
4192 
4193     if (UseAVX >= 2 && UseSSE >= 2) {
4194       // With AVX2, use 32-byte vector compare
4195       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4196 
4197       // Compare 32-byte vectors
4198       testl(len, 0xffffffe0);   // vector count (in bytes)
4199       jccb(Assembler::zero, TAIL_START);
4200 
4201       andl(len, 0xffffffe0);
4202       lea(ary1, Address(ary1, len, Address::times_1));
4203       negptr(len);
4204 
4205       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4206       movdl(vec2, tmp1);
4207       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4208 
4209       bind(COMPARE_WIDE_VECTORS);
4210       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4211       vptest(vec1, vec2);
4212       jccb(Assembler::notZero, BREAK_LOOP);
4213       addptr(len, 32);
4214       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4215 
4216       testl(result, 0x0000001f);   // any bytes remaining?
4217       jcc(Assembler::zero, DONE);
4218 
4219       // Quick test using the already prepared vector mask
4220       movl(len, result);
4221       andl(len, 0x0000001f);
4222       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4223       vptest(vec1, vec2);
4224       jcc(Assembler::zero, DONE);
4225       // There are zeros, jump to the tail to determine exactly where
4226       jmpb(TAIL_START);
4227 
4228       bind(BREAK_LOOP);
4229       // At least one byte in the last 32-byte vector is negative.
4230       // Set up to look at the last 32 bytes as if they were a tail
4231       lea(ary1, Address(ary1, len, Address::times_1));
4232       addptr(result, len);
4233       // Ignore the very last byte: if all others are positive,
4234       // it must be negative, so we can skip right to the 2+1 byte
4235       // end comparison at this point
4236       orl(result, 31);
4237       movl(len, 31);
4238       // Fallthru to tail compare
4239     } else if (UseSSE42Intrinsics) {
4240       // With SSE4.2, use double quad vector compare
4241       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4242 
4243       // Compare 16-byte vectors
4244       testl(len, 0xfffffff0);   // vector count (in bytes)
4245       jcc(Assembler::zero, TAIL_START);
4246 
4247       andl(len, 0xfffffff0);
4248       lea(ary1, Address(ary1, len, Address::times_1));
4249       negptr(len);
4250 
4251       movl(tmp1, 0x80808080);
4252       movdl(vec2, tmp1);
4253       pshufd(vec2, vec2, 0);
4254 
4255       bind(COMPARE_WIDE_VECTORS);
4256       movdqu(vec1, Address(ary1, len, Address::times_1));
4257       ptest(vec1, vec2);
4258       jccb(Assembler::notZero, BREAK_LOOP);
4259       addptr(len, 16);
4260       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4261 
4262       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4263       jcc(Assembler::zero, DONE);
4264 
4265       // Quick test using the already prepared vector mask
4266       movl(len, result);
4267       andl(len, 0x0000000f);   // tail count (in bytes)
4268       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4269       ptest(vec1, vec2);
4270       jcc(Assembler::zero, DONE);
4271       jmpb(TAIL_START);
4272 
4273       bind(BREAK_LOOP);
4274       // At least one byte in the last 16-byte vector is negative.
4275       // Set up and look at the last 16 bytes as if they were a tail
4276       lea(ary1, Address(ary1, len, Address::times_1));
4277       addptr(result, len);
4278       // Ignore the very last byte: if all others are positive,
4279       // it must be negative, so we can skip right to the 2+1 byte
4280       // end comparison at this point
4281       orl(result, 15);
4282       movl(len, 15);
4283       // Fallthru to tail compare
4284     }
4285   }
4286 
4287   bind(TAIL_START);
4288   // Compare 4-byte vectors
4289   andl(len, 0xfffffffc); // vector count (in bytes)
4290   jccb(Assembler::zero, COMPARE_CHAR);
4291 
4292   lea(ary1, Address(ary1, len, Address::times_1));
4293   negptr(len);
4294 
4295   bind(COMPARE_VECTORS);
4296   movl(tmp1, Address(ary1, len, Address::times_1));
4297   andl(tmp1, 0x80808080);
4298   jccb(Assembler::notZero, TAIL_ADJUST);
4299   addptr(len, 4);
4300   jccb(Assembler::notZero, COMPARE_VECTORS);
4301 
4302   // Compare trailing char (final 2-3 bytes), if any
4303   bind(COMPARE_CHAR);
4304 
4305   testl(result, 0x2);   // tail  char
4306   jccb(Assembler::zero, COMPARE_BYTE);
4307   load_unsigned_short(tmp1, Address(ary1, 0));
4308   andl(tmp1, 0x00008080);
4309   jccb(Assembler::notZero, CHAR_ADJUST);
4310   lea(ary1, Address(ary1, 2));
4311 
4312   bind(COMPARE_BYTE);
4313   testl(result, 0x1);   // tail  byte
4314   jccb(Assembler::zero, DONE);
4315   load_unsigned_byte(tmp1, Address(ary1, 0));
4316   testl(tmp1, 0x00000080);
4317   jccb(Assembler::zero, DONE);
4318   subptr(result, 1);
4319   jmpb(DONE);
4320 
4321   bind(TAIL_ADJUST);
4322   // There are negative bytes in the last 4-byte block.
4323   // Adjust result and check the next three bytes
4324   addptr(result, len);
4325   orl(result, 3);
4326   lea(ary1, Address(ary1, len, Address::times_1));
4327   jmpb(COMPARE_CHAR);
4328 
4329   bind(CHAR_ADJUST);
4330   // We are looking at a char + optional byte tail, and found that one
4331   // of the bytes in the char is negative. Adjust the result, check the
4332   // first byte and readjust if needed.
4333   andl(result, 0xfffffffc);
4334   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4335   jccb(Assembler::notZero, DONE);
4336   addptr(result, 1);
4337 
4338   // That's it
4339   bind(DONE);
4340   if (UseAVX >= 2 && UseSSE >= 2) {
4341     // clean upper bits of YMM registers
4342     vpxor(vec1, vec1);
4343     vpxor(vec2, vec2);
4344   }
4345 }
4346 
4347 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
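     // Roughly equivalent Java reference for the is_array_equ case (illustration
     // only; the intrinsic below also covers the substring and expand_ary2 variants):
     //   public static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null || a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }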
4348 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4349                                       Register limit, Register result, Register chr,
4350                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4351                                       KRegister mask, bool expand_ary2) {
4352   // for expand_ary2, limit is the (smaller) size of the second array.
4353   ShortBranchVerifier sbv(this);
4354   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4355 
4356   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4357          "Expansion only implemented for AVX2");
4358 
4359   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4360   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4361 
4362   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4363   int scaleIncr = expand_ary2 ? 8 : 16;
4364 
4365   if (is_array_equ) {
4366     // Check the input args
4367     cmpoop(ary1, ary2);
4368     jcc(Assembler::equal, TRUE_LABEL);
4369 
4370     // Need additional checks for arrays_equals.
4371     testptr(ary1, ary1);
4372     jcc(Assembler::zero, FALSE_LABEL);
4373     testptr(ary2, ary2);
4374     jcc(Assembler::zero, FALSE_LABEL);
4375 
4376     // Check the lengths
4377     movl(limit, Address(ary1, length_offset));
4378     cmpl(limit, Address(ary2, length_offset));
4379     jcc(Assembler::notEqual, FALSE_LABEL);
4380   }
4381 
4382   // count == 0
4383   testl(limit, limit);
4384   jcc(Assembler::zero, TRUE_LABEL);
4385 
4386   if (is_array_equ) {
4387     // Load array address
4388     lea(ary1, Address(ary1, base_offset));
4389     lea(ary2, Address(ary2, base_offset));
4390   }
4391 
4392   if (is_array_equ && is_char) {
4393     // arrays_equals when used for char[].
4394     shll(limit, 1);      // convert char count to byte count (still != 0)
4395   }
4396   movl(result, limit); // copy
4397 
4398   if (UseAVX >= 2) {
4399     // With AVX2, use 32-byte vector compare
4400     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4401 
4402     // Compare 32-byte vectors
4403     if (expand_ary2) {
4404       andl(result, 0x0000000f);  //   tail count (in bytes)
4405       andl(limit, 0xfffffff0);   // vector count (in bytes)
4406       jcc(Assembler::zero, COMPARE_TAIL);
4407     } else {
4408       andl(result, 0x0000001f);  //   tail count (in bytes)
4409       andl(limit, 0xffffffe0);   // vector count (in bytes)
4410       jcc(Assembler::zero, COMPARE_TAIL_16);
4411     }
4412 
4413     lea(ary1, Address(ary1, limit, scaleFactor));
4414     lea(ary2, Address(ary2, limit, Address::times_1));
4415     negptr(limit);
4416 
4417 #ifdef _LP64
4418     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4419       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4420 
4421       cmpl(limit, -64);
4422       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4423 
4424       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4425 
4426       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4427       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4428       kortestql(mask, mask);
4429       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4430       addptr(limit, 64);  // update since we already compared at this addr
4431       cmpl(limit, -64);
4432       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4433 
4434       // At this point we may still need to compare -limit+result bytes.
4435       // We could execute the next two instructions and just continue via the non-wide path:
4436       //  cmpl(limit, 0);
4437       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4438       // But since we stopped at the points ary{1,2}+limit which are
4439       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4440       // (|limit| <= 32 and result < 32),
4441       // we may just compare the last 64 bytes.
4442       //
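           // Scalar analogue of this overlapping-tail trick (illustration only):
           //   for (i = n - 64; i < n; i++) { check a1[i] == a2[i]; }
           // Bytes already verified by the main loop simply compare equal again,
           // so no separate byte-wise tail loop is needed on this path.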
4443       addptr(result, -64);   // it is safe, because we just came from this area
4444       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4445       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4446       kortestql(mask, mask);
4447       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4448 
4449       jmp(TRUE_LABEL);
4450 
4451       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4452 
4453     }//if (VM_Version::supports_avx512vlbw())
4454 #endif //_LP64
4455     bind(COMPARE_WIDE_VECTORS);
4456     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4457     if (expand_ary2) {
4458       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4459     } else {
4460       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4461     }
4462     vpxor(vec1, vec2);
4463 
4464     vptest(vec1, vec1);
4465     jcc(Assembler::notZero, FALSE_LABEL);
4466     addptr(limit, scaleIncr * 2);
4467     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4468 
4469     testl(result, result);
4470     jcc(Assembler::zero, TRUE_LABEL);
4471 
4472     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4473     if (expand_ary2) {
4474       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4475     } else {
4476       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4477     }
4478     vpxor(vec1, vec2);
4479 
4480     vptest(vec1, vec1);
4481     jcc(Assembler::notZero, FALSE_LABEL);
4482     jmp(TRUE_LABEL);
4483 
4484     bind(COMPARE_TAIL_16); // limit is zero
4485     movl(limit, result);
4486 
4487     // Compare 16-byte chunks
4488     andl(result, 0x0000000f);  //   tail count (in bytes)
4489     andl(limit, 0xfffffff0);   // vector count (in bytes)
4490     jcc(Assembler::zero, COMPARE_TAIL);
4491 
4492     lea(ary1, Address(ary1, limit, scaleFactor));
4493     lea(ary2, Address(ary2, limit, Address::times_1));
4494     negptr(limit);
4495 
4496     bind(COMPARE_WIDE_VECTORS_16);
4497     movdqu(vec1, Address(ary1, limit, scaleFactor));
4498     if (expand_ary2) {
4499       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4500     } else {
4501       movdqu(vec2, Address(ary2, limit, Address::times_1));
4502     }
4503     pxor(vec1, vec2);
4504 
4505     ptest(vec1, vec1);
4506     jcc(Assembler::notZero, FALSE_LABEL);
4507     addptr(limit, scaleIncr);
4508     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4509 
4510     bind(COMPARE_TAIL); // limit is zero
4511     movl(limit, result);
4512     // Fallthru to tail compare
4513   } else if (UseSSE42Intrinsics) {
4514     // With SSE4.2, use double quad vector compare
4515     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4516 
4517     // Compare 16-byte vectors
4518     andl(result, 0x0000000f);  //   tail count (in bytes)
4519     andl(limit, 0xfffffff0);   // vector count (in bytes)
4520     jcc(Assembler::zero, COMPARE_TAIL);
4521 
4522     lea(ary1, Address(ary1, limit, Address::times_1));
4523     lea(ary2, Address(ary2, limit, Address::times_1));
4524     negptr(limit);
4525 
4526     bind(COMPARE_WIDE_VECTORS);
4527     movdqu(vec1, Address(ary1, limit, Address::times_1));
4528     movdqu(vec2, Address(ary2, limit, Address::times_1));
4529     pxor(vec1, vec2);
4530 
4531     ptest(vec1, vec1);
4532     jcc(Assembler::notZero, FALSE_LABEL);
4533     addptr(limit, 16);
4534     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4535 
4536     testl(result, result);
4537     jcc(Assembler::zero, TRUE_LABEL);
4538 
4539     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4540     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4541     pxor(vec1, vec2);
4542 
4543     ptest(vec1, vec1);
4544     jccb(Assembler::notZero, FALSE_LABEL);
4545     jmpb(TRUE_LABEL);
4546 
4547     bind(COMPARE_TAIL); // limit is zero
4548     movl(limit, result);
4549     // Fallthru to tail compare
4550   }
4551 
4552   // Compare 4-byte vectors
4553   if (expand_ary2) {
4554     testl(result, result);
4555     jccb(Assembler::zero, TRUE_LABEL);
4556   } else {
4557     andl(limit, 0xfffffffc); // vector count (in bytes)
4558     jccb(Assembler::zero, COMPARE_CHAR);
4559   }
4560 
4561   lea(ary1, Address(ary1, limit, scaleFactor));
4562   lea(ary2, Address(ary2, limit, Address::times_1));
4563   negptr(limit);
4564 
4565   bind(COMPARE_VECTORS);
4566   if (expand_ary2) {
4567     // There are no "vector" operations for bytes to shorts
4568     movzbl(chr, Address(ary2, limit, Address::times_1));
4569     cmpw(Address(ary1, limit, Address::times_2), chr);
4570     jccb(Assembler::notEqual, FALSE_LABEL);
4571     addptr(limit, 1);
4572     jcc(Assembler::notZero, COMPARE_VECTORS);
4573     jmp(TRUE_LABEL);
4574   } else {
4575     movl(chr, Address(ary1, limit, Address::times_1));
4576     cmpl(chr, Address(ary2, limit, Address::times_1));
4577     jccb(Assembler::notEqual, FALSE_LABEL);
4578     addptr(limit, 4);
4579     jcc(Assembler::notZero, COMPARE_VECTORS);
4580   }
4581 
4582   // Compare trailing char (final 2 bytes), if any
4583   bind(COMPARE_CHAR);
4584   testl(result, 0x2);   // tail  char
4585   jccb(Assembler::zero, COMPARE_BYTE);
4586   load_unsigned_short(chr, Address(ary1, 0));
4587   load_unsigned_short(limit, Address(ary2, 0));
4588   cmpl(chr, limit);
4589   jccb(Assembler::notEqual, FALSE_LABEL);
4590 
4591   if (is_array_equ && is_char) {
4592     bind(COMPARE_BYTE);
4593   } else {
4594     lea(ary1, Address(ary1, 2));
4595     lea(ary2, Address(ary2, 2));
4596 
4597     bind(COMPARE_BYTE);
4598     testl(result, 0x1);   // tail  byte
4599     jccb(Assembler::zero, TRUE_LABEL);
4600     load_unsigned_byte(chr, Address(ary1, 0));
4601     load_unsigned_byte(limit, Address(ary2, 0));
4602     cmpl(chr, limit);
4603     jccb(Assembler::notEqual, FALSE_LABEL);
4604   }
4605   bind(TRUE_LABEL);
4606   movl(result, 1);   // return true
4607   jmpb(DONE);
4608 
4609   bind(FALSE_LABEL);
4610   xorl(result, result); // return false
4611 
4612   // That's it
4613   bind(DONE);
4614   if (UseAVX >= 2) {
4615     // clean upper bits of YMM registers
4616     vpxor(vec1, vec1);
4617     vpxor(vec2, vec2);
4618   }
4619 }
4620 
4621 #ifdef _LP64
4622 
4623 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4624 #define __ masm.
4625   Register dst = stub.data<0>();
4626   XMMRegister src = stub.data<1>();
4627   address target = stub.data<2>();
4628   __ bind(stub.entry());
4629   __ subptr(rsp, 8);
4630   __ movdbl(Address(rsp), src);
4631   __ call(RuntimeAddress(target));
4632   __ pop(dst);
4633   __ jmp(stub.continuation());
4634 #undef __
4635 }
4636 
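     // Note: cvttss2si/cvttsd2si return the x86 "integer indefinite" value
     // (0x80000000 for 32-bit and 0x8000000000000000 for 64-bit results) when the
     // source is NaN or out of range, so the compare below routes exactly those
     // cases to the fixup stub. Illustration only:
     //   cvtt(1.5f)      -> 1            // fast path
     //   cvtt(Float.NaN) -> 0x80000000   // slow path via f2i_fixup()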
4637 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4638   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4639   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4640 
4641   address slowpath_target;
4642   if (dst_bt == T_INT) {
4643     if (src_bt == T_FLOAT) {
4644       cvttss2sil(dst, src);
4645       cmpl(dst, 0x80000000);
4646       slowpath_target = StubRoutines::x86::f2i_fixup();
4647     } else {
4648       cvttsd2sil(dst, src);
4649       cmpl(dst, 0x80000000);
4650       slowpath_target = StubRoutines::x86::d2i_fixup();
4651     }
4652   } else {
4653     if (src_bt == T_FLOAT) {
4654       cvttss2siq(dst, src);
4655       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4656       slowpath_target = StubRoutines::x86::f2l_fixup();
4657     } else {
4658       cvttsd2siq(dst, src);
4659       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4660       slowpath_target = StubRoutines::x86::d2l_fixup();
4661     }
4662   }
4663 
4664   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4665   jcc(Assembler::equal, stub->entry());
4666   bind(stub->continuation());
4667 }
4668 
4669 #endif // _LP64
4670 
4671 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4672                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4673   switch(ideal_opc) {
4674     case Op_LShiftVS:
4675       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4676     case Op_LShiftVI:
4677       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4678     case Op_LShiftVL:
4679       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4680     case Op_RShiftVS:
4681       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4682     case Op_RShiftVI:
4683       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4684     case Op_RShiftVL:
4685       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4686     case Op_URShiftVS:
4687       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4688     case Op_URShiftVI:
4689       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4690     case Op_URShiftVL:
4691       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4692     case Op_RotateRightV:
4693       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4694     case Op_RotateLeftV:
4695       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4696     default:
4697       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4698       break;
4699   }
4700 }
4701 
4702 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4703                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4704   if (is_unsigned) {
4705     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4706   } else {
4707     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4708   }
4709 }
4710 
4711 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4712                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4713   switch (elem_bt) {
4714     case T_BYTE:
4715       if (ideal_opc == Op_SaturatingAddV) {
4716         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4717       } else {
4718         assert(ideal_opc == Op_SaturatingSubV, "");
4719         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4720       }
4721       break;
4722     case T_SHORT:
4723       if (ideal_opc == Op_SaturatingAddV) {
4724         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4725       } else {
4726         assert(ideal_opc == Op_SaturatingSubV, "");
4727         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4728       }
4729       break;
4730     default:
4731       fatal("Unsupported type %s", type2name(elem_bt));
4732       break;
4733   }
4734 }
4735 
4736 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4737                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4738   switch (elem_bt) {
4739     case T_BYTE:
4740       if (ideal_opc == Op_SaturatingAddV) {
4741         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4742       } else {
4743         assert(ideal_opc == Op_SaturatingSubV, "");
4744         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4745       }
4746       break;
4747     case T_SHORT:
4748       if (ideal_opc == Op_SaturatingAddV) {
4749         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4750       } else {
4751         assert(ideal_opc == Op_SaturatingSubV, "");
4752         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4753       }
4754       break;
4755     default:
4756       fatal("Unsupported type %s", type2name(elem_bt));
4757       break;
4758   }
4759 }
4760 
4761 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4762                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4763   if (is_unsigned) {
4764     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4765   } else {
4766     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4767   }
4768 }
4769 
4770 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4771                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4772   switch (elem_bt) {
4773     case T_BYTE:
4774       if (ideal_opc == Op_SaturatingAddV) {
4775         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4776       } else {
4777         assert(ideal_opc == Op_SaturatingSubV, "");
4778         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4779       }
4780       break;
4781     case T_SHORT:
4782       if (ideal_opc == Op_SaturatingAddV) {
4783         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4784       } else {
4785         assert(ideal_opc == Op_SaturatingSubV, "");
4786         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4787       }
4788       break;
4789     default:
4790       fatal("Unsupported type %s", type2name(elem_bt));
4791       break;
4792   }
4793 }
4794 
4795 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4796                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4797   switch (elem_bt) {
4798     case T_BYTE:
4799       if (ideal_opc == Op_SaturatingAddV) {
4800         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4801       } else {
4802         assert(ideal_opc == Op_SaturatingSubV, "");
4803         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4804       }
4805       break;
4806     case T_SHORT:
4807       if (ideal_opc == Op_SaturatingAddV) {
4808         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4809       } else {
4810         assert(ideal_opc == Op_SaturatingSubV, "");
4811         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4812       }
4813       break;
4814     default:
4815       fatal("Unsupported type %s", type2name(elem_bt));
4816       break;
4817   }
4818 }
4819 
4820 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4821                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4822                                     bool is_varshift) {
4823   switch (ideal_opc) {
4824     case Op_AddVB:
4825       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4826     case Op_AddVS:
4827       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4828     case Op_AddVI:
4829       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4830     case Op_AddVL:
4831       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4832     case Op_AddVF:
4833       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4834     case Op_AddVD:
4835       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4836     case Op_SubVB:
4837       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4838     case Op_SubVS:
4839       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_SubVI:
4841       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_SubVL:
4843       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_SubVF:
4845       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_SubVD:
4847       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_MulVS:
4849       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_MulVI:
4851       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_MulVL:
4853       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_MulVF:
4855       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_MulVD:
4857       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_DivVF:
4859       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_DivVD:
4861       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_SqrtVF:
4863       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_SqrtVD:
4865       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_AbsVB:
4867       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4868     case Op_AbsVS:
4869       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4870     case Op_AbsVI:
4871       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4872     case Op_AbsVL:
4873       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4874     case Op_FmaVF:
4875       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_FmaVD:
4877       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_VectorRearrange:
4879       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4880     case Op_LShiftVS:
4881       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4882     case Op_LShiftVI:
4883       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4884     case Op_LShiftVL:
4885       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4886     case Op_RShiftVS:
4887       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4888     case Op_RShiftVI:
4889       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4890     case Op_RShiftVL:
4891       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4892     case Op_URShiftVS:
4893       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4894     case Op_URShiftVI:
4895       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4896     case Op_URShiftVL:
4897       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4898     case Op_RotateLeftV:
4899       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_RotateRightV:
4901       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4902     case Op_MaxV:
4903       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4904     case Op_MinV:
4905       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4906     case Op_UMinV:
4907       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4908     case Op_UMaxV:
4909       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_XorV:
4911       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_OrV:
4913       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_AndV:
4915       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4916     default:
4917       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4918       break;
4919   }
4920 }
4921 
4922 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4923                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4924   switch (ideal_opc) {
4925     case Op_AddVB:
4926       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4927     case Op_AddVS:
4928       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4929     case Op_AddVI:
4930       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4931     case Op_AddVL:
4932       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4933     case Op_AddVF:
4934       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4935     case Op_AddVD:
4936       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4937     case Op_SubVB:
4938       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4939     case Op_SubVS:
4940       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4941     case Op_SubVI:
4942       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4943     case Op_SubVL:
4944       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4945     case Op_SubVF:
4946       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4947     case Op_SubVD:
4948       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4949     case Op_MulVS:
4950       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4951     case Op_MulVI:
4952       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4953     case Op_MulVL:
4954       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4955     case Op_MulVF:
4956       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4957     case Op_MulVD:
4958       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4959     case Op_DivVF:
4960       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4961     case Op_DivVD:
4962       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4963     case Op_FmaVF:
4964       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4965     case Op_FmaVD:
4966       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4967     case Op_MaxV:
4968       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4969     case Op_MinV:
4970       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4971     case Op_UMaxV:
4972       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4973     case Op_UMinV:
4974       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4975     case Op_XorV:
4976       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4977     case Op_OrV:
4978       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4979     case Op_AndV:
4980       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4981     default:
4982       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4983       break;
4984   }
4985 }
4986 
4987 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4988                                   KRegister src1, KRegister src2) {
4989   BasicType etype = T_ILLEGAL;
4990   switch(mask_len) {
4991     case 2:
4992     case 4:
4993     case 8:  etype = T_BYTE; break;
4994     case 16: etype = T_SHORT; break;
4995     case 32: etype = T_INT; break;
4996     case 64: etype = T_LONG; break;
4997     default: fatal("Unsupported type"); break;
4998   }
4999   assert(etype != T_ILLEGAL, "");
5000   switch(ideal_opc) {
5001     case Op_AndVMask:
5002       kand(etype, dst, src1, src2); break;
5003     case Op_OrVMask:
5004       kor(etype, dst, src1, src2); break;
5005     case Op_XorVMask:
5006       kxor(etype, dst, src1, src2); break;
5007     default:
5008       fatal("Unsupported masked operation"); break;
5009   }
5010 }
5011 
5012 /*
5013  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5014  * If src is NaN, the result is 0.
5015  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5016  * the result is equal to the value of Integer.MIN_VALUE.
5017  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5018  * the result is equal to the value of Integer.MAX_VALUE.
5019  */
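     // Scalar reference with the same semantics as described above (illustration only):
     //   int castF2I(float f) {
     //     if (Float.isNaN(f))         return 0;
     //     if (f <= Integer.MIN_VALUE) return Integer.MIN_VALUE;
     //     if (f >= Integer.MAX_VALUE) return Integer.MAX_VALUE;
     //     return (int) f;
     //   }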
5020 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5021                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5022                                                                    Register rscratch, AddressLiteral float_sign_flip,
5023                                                                    int vec_enc) {
5024   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5025   Label done;
5026   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5027   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5028   vptest(xtmp2, xtmp2, vec_enc);
5029   jccb(Assembler::equal, done);
5030 
5031   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5032   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5033 
5034   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5035   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5036   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5037 
5038   // Recompute the mask for remaining special value.
5039   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5040   // Extract SRC values corresponding to TRUE mask lanes.
5041   vpand(xtmp4, xtmp2, src, vec_enc);
5042   // Flip the mask bits so that the MSB of the MASK lanes corresponding to +ve
5043   // special values is set.
5044   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5045 
5046   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5047   bind(done);
5048 }
5049 
5050 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5051                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5052                                                                     Register rscratch, AddressLiteral float_sign_flip,
5053                                                                     int vec_enc) {
5054   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5055   Label done;
5056   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5057   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5058   kortestwl(ktmp1, ktmp1);
5059   jccb(Assembler::equal, done);
5060 
5061   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5062   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5063   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5064 
5065   kxorwl(ktmp1, ktmp1, ktmp2);
5066   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5067   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5068   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5069   bind(done);
5070 }
5071 
5072 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5073                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5074                                                                      Register rscratch, AddressLiteral double_sign_flip,
5075                                                                      int vec_enc) {
5076   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5077 
5078   Label done;
5079   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5080   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5081   kortestwl(ktmp1, ktmp1);
5082   jccb(Assembler::equal, done);
5083 
5084   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5085   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5086   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5087 
5088   kxorwl(ktmp1, ktmp1, ktmp2);
5089   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5090   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5091   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5092   bind(done);
5093 }
5094 
5095 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5096                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5097                                                                      Register rscratch, AddressLiteral float_sign_flip,
5098                                                                      int vec_enc) {
5099   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5100   Label done;
5101   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5102   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5103   kortestwl(ktmp1, ktmp1);
5104   jccb(Assembler::equal, done);
5105 
5106   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5107   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5108   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5109 
5110   kxorwl(ktmp1, ktmp1, ktmp2);
5111   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5112   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5113   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5114   bind(done);
5115 }
5116 
5117 /*
5118  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5119  * If src is NaN, the result is 0.
5120  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5121  * the result is equal to the value of Long.MIN_VALUE.
5122  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5123  * the result is equal to the value of Long.MAX_VALUE.
5124  */
5125 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5126                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5127                                                                       Register rscratch, AddressLiteral double_sign_flip,
5128                                                                       int vec_enc) {
5129   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5130 
5131   Label done;
5132   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5133   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5134   kortestwl(ktmp1, ktmp1);
5135   jccb(Assembler::equal, done);
5136 
5137   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5138   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5139   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5140 
5141   kxorwl(ktmp1, ktmp1, ktmp2);
5142   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5143   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5144   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5145   bind(done);
5146 }
5147 
5148 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5149                                                              XMMRegister xtmp, int index, int vec_enc) {
5150   assert(vec_enc < Assembler::AVX_512bit, "");
5151   if (vec_enc == Assembler::AVX_256bit) {
5152     vextractf128_high(xtmp, src);
5153     vshufps(dst, src, xtmp, index, vec_enc);
5154   } else {
5155     vshufps(dst, src, zero, index, vec_enc);
5156   }
5157 }
5158 
5159 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5160                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5161                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5162   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5163 
5164   Label done;
5165   // Compare the destination lanes with float_sign_flip
5166   // value to get mask for all special values.
5167   movdqu(xtmp1, float_sign_flip, rscratch);
5168   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5169   ptest(xtmp2, xtmp2);
5170   jccb(Assembler::equal, done);
5171 
5172   // Flip float_sign_flip to get max integer value.
5173   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5174   pxor(xtmp1, xtmp4);
5175 
5176   // Set destination lanes corresponding to unordered source lanes to zero.
5177   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5178   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5179 
5180   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5181   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5182   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5183 
5184   // Recompute the mask for remaining special value.
5185   pxor(xtmp2, xtmp3);
5186   // Extract mask corresponding to non-negative source lanes.
5187   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5188 
5189   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5190   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5191   pand(xtmp3, xtmp2);
5192 
5193   // Replace destination lanes holding the special value (0x80000000) with max int
5194   // if corresponding source lane holds a +ve value.
5195   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5196   bind(done);
5197 }
5198 
5199 
5200 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5201                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5202   switch(to_elem_bt) {
5203     case T_SHORT:
5204       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5205       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5206       vpackusdw(dst, dst, zero, vec_enc);
5207       if (vec_enc == Assembler::AVX_256bit) {
5208         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5209       }
5210       break;
5211     case  T_BYTE:
5212       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5213       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5214       vpackusdw(dst, dst, zero, vec_enc);
5215       if (vec_enc == Assembler::AVX_256bit) {
5216         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5217       }
5218       vpackuswb(dst, dst, zero, vec_enc);
5219       break;
5220     default: assert(false, "%s", type2name(to_elem_bt));
5221   }
5222 }
5223 
5224 /*
5225  * Algorithm for vector D2L and F2I conversions:-
5226  * a) Perform vector D2L/F2I cast.
5227  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
5228  *    That value signifies that the source value could be any of the special floating point
5229  *    values (NaN, -Inf, Inf, Max, -Min).
5230  * c) Set the destination lane to zero if the source lane is NaN.
5231  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5232  */
5233 
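     // Pseudocode sketch of the steps above (illustration only, F2I flavour):
     //   dst = vcvttps2dq(src);                                    // (a)
     //   if (no lane of dst equals 0x80000000) return;             // (b) fast path
     //   dst = blend(dst, 0,      isNaN(src));                     // (c)
     //   dst = blend(dst, MaxInt, dst == 0x80000000 && src >= 0);  // (d)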
5234 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5235                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5236                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5237   int to_elem_sz = type2aelembytes(to_elem_bt);
5238   assert(to_elem_sz <= 4, "");
5239   vcvttps2dq(dst, src, vec_enc);
5240   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5241   if (to_elem_sz < 4) {
5242     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5243     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5244   }
5245 }
5246 
5247 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5248                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5249                                             Register rscratch, int vec_enc) {
5250   int to_elem_sz = type2aelembytes(to_elem_bt);
5251   assert(to_elem_sz <= 4, "");
5252   vcvttps2dq(dst, src, vec_enc);
5253   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5254   switch(to_elem_bt) {
5255     case T_INT:
5256       break;
5257     case T_SHORT:
5258       evpmovdw(dst, dst, vec_enc);
5259       break;
5260     case T_BYTE:
5261       evpmovdb(dst, dst, vec_enc);
5262       break;
5263     default: assert(false, "%s", type2name(to_elem_bt));
5264   }
5265 }
5266 
5267 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5268                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5269                                             Register rscratch, int vec_enc) {
5270   evcvttps2qq(dst, src, vec_enc);
5271   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5272 }
5273 
5274 // Handling for downcasting from double to integer or sub-word types on AVX2.
5275 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5276                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5277                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5278   int to_elem_sz = type2aelembytes(to_elem_bt);
5279   assert(to_elem_sz < 8, "");
5280   vcvttpd2dq(dst, src, vec_enc);
5281   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5282                                               float_sign_flip, vec_enc);
5283   if (to_elem_sz < 4) {
5284     // xtmp4 holds all zero lanes.
5285     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5286   }
5287 }
5288 
5289 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5290                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5291                                             KRegister ktmp2, AddressLiteral sign_flip,
5292                                             Register rscratch, int vec_enc) {
5293   if (VM_Version::supports_avx512dq()) {
5294     evcvttpd2qq(dst, src, vec_enc);
5295     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5296     switch(to_elem_bt) {
5297       case T_LONG:
5298         break;
5299       case T_INT:
5300         evpmovsqd(dst, dst, vec_enc);
5301         break;
5302       case T_SHORT:
5303         evpmovsqd(dst, dst, vec_enc);
5304         evpmovdw(dst, dst, vec_enc);
5305         break;
5306       case T_BYTE:
5307         evpmovsqd(dst, dst, vec_enc);
5308         evpmovdb(dst, dst, vec_enc);
5309         break;
5310       default: assert(false, "%s", type2name(to_elem_bt));
5311     }
5312   } else {
5313     assert(type2aelembytes(to_elem_bt) <= 4, "");
5314     vcvttpd2dq(dst, src, vec_enc);
5315     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5316     switch(to_elem_bt) {
5317       case T_INT:
5318         break;
5319       case T_SHORT:
5320         evpmovdw(dst, dst, vec_enc);
5321         break;
5322       case T_BYTE:
5323         evpmovdb(dst, dst, vec_enc);
5324         break;
5325       default: assert(false, "%s", type2name(to_elem_bt));
5326     }
5327   }
5328 }
5329 
5330 #ifdef _LP64
5331 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5332                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5333                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5334   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5335   // and restore the original MXCSR.RC mode afterwards.
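       // Example (illustration only): with RC = round towards -inf,
       //   2.5 + 0.5 = 3.0  -> floor = 3,   -2.5 + 0.5 = -2.0 -> floor = -2,
       // matching Math.round's round-half-up behaviour.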
5336   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5337 
5338   mov64(tmp, julong_cast(0.5L));
5339   evpbroadcastq(xtmp1, tmp, vec_enc);
5340   vaddpd(xtmp1, src , xtmp1, vec_enc);
5341   evcvtpd2qq(dst, xtmp1, vec_enc);
5342   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5343                                                 double_sign_flip, vec_enc);
5344 
5345   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5346 }
5347 
5348 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5349                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5350                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5351   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5352   // and restore the original MXCSR.RC mode afterwards.
5353   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5354 
5355   movl(tmp, jint_cast(0.5));
5356   movq(xtmp1, tmp);
5357   vbroadcastss(xtmp1, xtmp1, vec_enc);
5358   vaddps(xtmp1, src , xtmp1, vec_enc);
5359   vcvtps2dq(dst, xtmp1, vec_enc);
5360   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5361                                               float_sign_flip, vec_enc);
5362 
5363   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5364 }
5365 
5366 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5367                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5368                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5369   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5370   // and restore the original MXCSR.RC mode afterwards.
5371   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5372 
5373   movl(tmp, jint_cast(0.5));
5374   movq(xtmp1, tmp);
5375   vbroadcastss(xtmp1, xtmp1, vec_enc);
5376   vaddps(xtmp1, src , xtmp1, vec_enc);
5377   vcvtps2dq(dst, xtmp1, vec_enc);
5378   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5379 
5380   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5381 }
5382 #endif // _LP64
5383 
5384 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5385                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5386   switch (from_elem_bt) {
5387     case T_BYTE:
5388       switch (to_elem_bt) {
5389         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5390         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5391         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5392         default: ShouldNotReachHere();
5393       }
5394       break;
5395     case T_SHORT:
5396       switch (to_elem_bt) {
5397         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5398         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5399         default: ShouldNotReachHere();
5400       }
5401       break;
5402     case T_INT:
5403       assert(to_elem_bt == T_LONG, "");
5404       vpmovzxdq(dst, src, vlen_enc);
5405       break;
5406     default:
5407       ShouldNotReachHere();
5408   }
5409 }
5410 
5411 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5412                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5413   switch (from_elem_bt) {
5414     case T_BYTE:
5415       switch (to_elem_bt) {
5416         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5417         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5418         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5419         default: ShouldNotReachHere();
5420       }
5421       break;
5422     case T_SHORT:
5423       switch (to_elem_bt) {
5424         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5425         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5426         default: ShouldNotReachHere();
5427       }
5428       break;
5429     case T_INT:
5430       assert(to_elem_bt == T_LONG, "");
5431       vpmovsxdq(dst, src, vlen_enc);
5432       break;
5433     default:
5434       ShouldNotReachHere();
5435   }
5436 }
5437 
5438 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5439                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5440   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5441   assert(vlen_enc != AVX_512bit, "");
5442 
5443   int dst_bt_size = type2aelembytes(dst_bt);
5444   int src_bt_size = type2aelembytes(src_bt);
5445   if (dst_bt_size > src_bt_size) {
5446     switch (dst_bt_size / src_bt_size) {
5447       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5448       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5449       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5450       default: ShouldNotReachHere();
5451     }
5452   } else {
5453     assert(dst_bt_size < src_bt_size, "");
5454     switch (src_bt_size / dst_bt_size) {
5455       case 2: {
5456         if (vlen_enc == AVX_128bit) {
5457           vpacksswb(dst, src, src, vlen_enc);
5458         } else {
5459           vpacksswb(dst, src, src, vlen_enc);
5460           vpermq(dst, dst, 0x08, vlen_enc);
5461         }
5462         break;
5463       }
5464       case 4: {
5465         if (vlen_enc == AVX_128bit) {
5466           vpackssdw(dst, src, src, vlen_enc);
5467           vpacksswb(dst, dst, dst, vlen_enc);
5468         } else {
5469           vpackssdw(dst, src, src, vlen_enc);
5470           vpermq(dst, dst, 0x08, vlen_enc);
5471           vpacksswb(dst, dst, dst, AVX_128bit);
5472         }
5473         break;
5474       }
5475       case 8: {
5476         if (vlen_enc == AVX_128bit) {
5477           vpshufd(dst, src, 0x08, vlen_enc);
5478           vpackssdw(dst, dst, dst, vlen_enc);
5479           vpacksswb(dst, dst, dst, vlen_enc);
5480         } else {
5481           vpshufd(dst, src, 0x08, vlen_enc);
5482           vpermq(dst, dst, 0x08, vlen_enc);
5483           vpackssdw(dst, dst, dst, AVX_128bit);
5484           vpacksswb(dst, dst, dst, AVX_128bit);
5485         }
5486         break;
5487       }
5488       default: ShouldNotReachHere();
5489     }
5490   }
5491 }
5492 
5493 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5494                                    bool merge, BasicType bt, int vlen_enc) {
5495   if (bt == T_INT) {
5496     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5497   } else {
5498     assert(bt == T_LONG, "");
5499     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5500   }
5501 }
5502 
5503 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5504                                    bool merge, BasicType bt, int vlen_enc) {
5505   if (bt == T_INT) {
5506     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5507   } else {
5508     assert(bt == T_LONG, "");
5509     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5510   }
5511 }
5512 
5513 #ifdef _LP64
5514 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5515                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5516                                                int vec_enc) {
5517   int index = 0;
5518   int vindex = 0;
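  // pdepq scatters the low mask bits of src into the byte positions selected by
  // 0x0101010101010101, producing one 0x00/0x01 byte per mask bit. For example, a mask
  // whose low bits are 0b00000101 becomes the quadword 0x0000000000010001.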
5519   mov64(rtmp1, 0x0101010101010101L);
5520   pdepq(rtmp1, src, rtmp1);
5521   if (mask_len > 8) {
5522     movq(rtmp2, src);
5523     vpxor(xtmp, xtmp, xtmp, vec_enc);
5524     movq(xtmp, rtmp1);
5525   }
5526   movq(dst, rtmp1);
5527 
5528   mask_len -= 8;
5529   while (mask_len > 0) {
5530     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5531     index++;
5532     if ((index % 2) == 0) {
5533       pxor(xtmp, xtmp);
5534     }
5535     mov64(rtmp1, 0x0101010101010101L);
5536     shrq(rtmp2, 8);
5537     pdepq(rtmp1, rtmp2, rtmp1);
5538     pinsrq(xtmp, rtmp1, index % 2);
5539     vindex = index / 2;
5540     if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5543       if (index % 2) {
5544         vinsertf128(dst, dst, xtmp, vindex);
5545       }
5546     } else {
5547       vmovdqu(dst, xtmp);
5548     }
5549     mask_len -= 8;
5550   }
5551 }
5552 
5553 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5554   switch(opc) {
5555     case Op_VectorMaskTrueCount:
5556       popcntq(dst, tmp);
5557       break;
5558     case Op_VectorMaskLastTrue:
5559       if (VM_Version::supports_lzcnt()) {
5560         lzcntq(tmp, tmp);
5561         movl(dst, 63);
5562         subl(dst, tmp);
5563       } else {
5564         movl(dst, -1);
5565         bsrq(tmp, tmp);
5566         cmov32(Assembler::notZero, dst, tmp);
5567       }
5568       break;
5569     case Op_VectorMaskFirstTrue:
5570       if (VM_Version::supports_bmi1()) {
5571         if (masklen < 32) {
5572           orl(tmp, 1 << masklen);
5573           tzcntl(dst, tmp);
5574         } else if (masklen == 32) {
5575           tzcntl(dst, tmp);
5576         } else {
5577           assert(masklen == 64, "");
5578           tzcntq(dst, tmp);
5579         }
5580       } else {
5581         if (masklen < 32) {
5582           orl(tmp, 1 << masklen);
5583           bsfl(dst, tmp);
5584         } else {
5585           assert(masklen == 32 || masklen == 64, "");
5586           movl(dst, masklen);
5587           if (masklen == 32)  {
5588             bsfl(tmp, tmp);
5589           } else {
5590             bsfq(tmp, tmp);
5591           }
5592           cmov32(Assembler::notZero, dst, tmp);
5593         }
5594       }
5595       break;
5596     case Op_VectorMaskToLong:
5597       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5598       break;
5599     default: assert(false, "Unhandled mask operation");
5600   }
5601 }
5602 
5603 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5604                                               int masklen, int masksize, int vec_enc) {
5605   assert(VM_Version::supports_popcnt(), "");
5606 
5607   if(VM_Version::supports_avx512bw()) {
5608     kmovql(tmp, mask);
5609   } else {
5610     assert(masklen <= 16, "");
5611     kmovwl(tmp, mask);
5612   }
5613 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5616   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5617     andq(tmp, (1 << masklen) - 1);
5618   }
5619 
5620   vector_mask_operation_helper(opc, dst, tmp, masklen);
5621 }
5622 
5623 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5624                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5625   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5626          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5627   assert(VM_Version::supports_popcnt(), "");
5628 
5629   bool need_clip = false;
5630   switch(bt) {
5631     case T_BOOLEAN:
      // While masks of other types contain 0 or -1 lane values, boolean masks contain lane values of 0 or 1.
5633       vpxor(xtmp, xtmp, xtmp, vec_enc);
5634       vpsubb(xtmp, xtmp, mask, vec_enc);
5635       vpmovmskb(tmp, xtmp, vec_enc);
5636       need_clip = masklen < 16;
5637       break;
5638     case T_BYTE:
5639       vpmovmskb(tmp, mask, vec_enc);
5640       need_clip = masklen < 16;
5641       break;
5642     case T_SHORT:
5643       vpacksswb(xtmp, mask, mask, vec_enc);
5644       if (masklen >= 16) {
5645         vpermpd(xtmp, xtmp, 8, vec_enc);
5646       }
5647       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5648       need_clip = masklen < 16;
5649       break;
5650     case T_INT:
5651     case T_FLOAT:
5652       vmovmskps(tmp, mask, vec_enc);
5653       need_clip = masklen < 4;
5654       break;
5655     case T_LONG:
5656     case T_DOUBLE:
5657       vmovmskpd(tmp, mask, vec_enc);
5658       need_clip = masklen < 2;
5659       break;
5660     default: assert(false, "Unhandled type, %s", type2name(bt));
5661   }
5662 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5665   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5666     // need_clip implies masklen < 32
5667     andq(tmp, (1 << masklen) - 1);
5668   }
5669 
5670   vector_mask_operation_helper(opc, dst, tmp, masklen);
5671 }
5672 
5673 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5674                                              Register rtmp2, int mask_len) {
5675   kmov(rtmp1, src);
5676   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5677   mov64(rtmp2, -1L);
5678   pextq(rtmp2, rtmp2, rtmp1);
5679   kmov(dst, rtmp2);
5680 }
5681 
5682 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5683                                                     XMMRegister mask, Register rtmp, Register rscratch,
5684                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5685                                                     int vec_enc) {
5686   assert(type2aelembytes(bt) >= 4, "");
5687   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5688   address compress_perm_table = nullptr;
5689   address expand_perm_table = nullptr;
5690   if (type2aelembytes(bt) == 8) {
5691     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5692     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5693     vmovmskpd(rtmp, mask, vec_enc);
5694   } else {
5695     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5696     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5697     vmovmskps(rtmp, mask, vec_enc);
5698   }
5699   shlq(rtmp, 5); // for 32 byte permute row.
5700   if (opcode == Op_CompressV) {
5701     lea(rscratch, ExternalAddress(compress_perm_table));
5702   } else {
5703     lea(rscratch, ExternalAddress(expand_perm_table));
5704   }
5705   addptr(rtmp, rscratch);
5706   vmovdqu(permv, Address(rtmp));
5707   vpermps(dst, permv, src, Assembler::AVX_256bit);
5708   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default)
  // value, so the same row can also be used as a blending mask after
  // compressing/expanding the source vector lanes.
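  // For illustration: an 8-lane compress with mask 0b00000101 would use a permute row of
  // [0, 2, -1, -1, -1, -1, -1, -1]; lanes 0 and 2 of src are gathered to the front, and the
  // -1 entries (sign bit set) make vblendvps select the zero vector for the remaining lanes.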
5713   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5714 }
5715 
5716 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5717                                                bool merge, BasicType bt, int vec_enc) {
5718   if (opcode == Op_CompressV) {
5719     switch(bt) {
5720     case T_BYTE:
5721       evpcompressb(dst, mask, src, merge, vec_enc);
5722       break;
5723     case T_CHAR:
5724     case T_SHORT:
5725       evpcompressw(dst, mask, src, merge, vec_enc);
5726       break;
5727     case T_INT:
5728       evpcompressd(dst, mask, src, merge, vec_enc);
5729       break;
5730     case T_FLOAT:
5731       evcompressps(dst, mask, src, merge, vec_enc);
5732       break;
5733     case T_LONG:
5734       evpcompressq(dst, mask, src, merge, vec_enc);
5735       break;
5736     case T_DOUBLE:
5737       evcompresspd(dst, mask, src, merge, vec_enc);
5738       break;
5739     default:
5740       fatal("Unsupported type %s", type2name(bt));
5741       break;
5742     }
5743   } else {
5744     assert(opcode == Op_ExpandV, "");
5745     switch(bt) {
5746     case T_BYTE:
5747       evpexpandb(dst, mask, src, merge, vec_enc);
5748       break;
5749     case T_CHAR:
5750     case T_SHORT:
5751       evpexpandw(dst, mask, src, merge, vec_enc);
5752       break;
5753     case T_INT:
5754       evpexpandd(dst, mask, src, merge, vec_enc);
5755       break;
5756     case T_FLOAT:
5757       evexpandps(dst, mask, src, merge, vec_enc);
5758       break;
5759     case T_LONG:
5760       evpexpandq(dst, mask, src, merge, vec_enc);
5761       break;
5762     case T_DOUBLE:
5763       evexpandpd(dst, mask, src, merge, vec_enc);
5764       break;
5765     default:
5766       fatal("Unsupported type %s", type2name(bt));
5767       break;
5768     }
5769   }
5770 }
5771 #endif
5772 
5773 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5774                                            KRegister ktmp1, int vec_enc) {
5775   if (opcode == Op_SignumVD) {
5776     vsubpd(dst, zero, one, vec_enc);
5777     // if src < 0 ? -1 : 1
5778     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5779     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5780     // if src == NaN, -0.0 or 0.0 return src.
5781     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5782     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5783   } else {
5784     assert(opcode == Op_SignumVF, "");
5785     vsubps(dst, zero, one, vec_enc);
5786     // if src < 0 ? -1 : 1
5787     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5788     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5789     // if src == NaN, -0.0 or 0.0 return src.
5790     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5791     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5792   }
5793 }
5794 
5795 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5796                                           XMMRegister xtmp1, int vec_enc) {
5797   if (opcode == Op_SignumVD) {
5798     vsubpd(dst, zero, one, vec_enc);
5799     // if src < 0 ? -1 : 1
5800     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5801     // if src == NaN, -0.0 or 0.0 return src.
5802     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5803     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5804   } else {
5805     assert(opcode == Op_SignumVF, "");
5806     vsubps(dst, zero, one, vec_enc);
5807     // if src < 0 ? -1 : 1
5808     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5809     // if src == NaN, -0.0 or 0.0 return src.
5810     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5811     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5812   }
5813 }
5814 
5815 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5816   if (VM_Version::supports_avx512bw()) {
5817     if (mask_len > 32) {
5818       kmovql(dst, src);
5819     } else {
5820       kmovdl(dst, src);
5821       if (mask_len != 32) {
5822         kshiftrdl(dst, dst, 32 - mask_len);
5823       }
5824     }
5825   } else {
5826     assert(mask_len <= 16, "");
5827     kmovwl(dst, src);
5828     if (mask_len != 16) {
5829       kshiftrwl(dst, dst, 16 - mask_len);
5830     }
5831   }
5832 }
5833 
5834 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5835   int lane_size = type2aelembytes(bt);
5836   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5837   if ((is_LP64 || lane_size < 8) &&
5838       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5839        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5840     movptr(rtmp, imm32);
5841     switch(lane_size) {
5842       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5843       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5844       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5845       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5848     }
5849   } else {
5850     movptr(rtmp, imm32);
5851     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5852     switch(lane_size) {
5853       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5854       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5855       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5856       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5859     }
5860   }
5861 }
5862 
5863 //
// Following is the lookup table based popcount computation algorithm:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5881 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5882 //     shuffle indices for lookup table access.
5883 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5885 //     shuffle indices for lookup table access.
5886 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5887 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5888 //     count of all the bytes of a quadword.
5889 //  f. Perform step e. for upper 128bit vector lane.
5890 //  g. Pack the bitset count of quadwords back to double word.
5891 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
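//
// Worked example (single byte, illustrative): src byte = 0xB5 (1011 0101).
//   Lower nibble 0101 -> LUT gives 2, upper nibble 1011 -> LUT gives 3,
//   so the per-byte count is 2 + 3 = 5, which equals popcount(0xB5).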
5892 
5893 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5894                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5895   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5896   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5897   vpsrlw(dst, src, 4, vec_enc);
5898   vpand(dst, dst, xtmp1, vec_enc);
5899   vpand(xtmp1, src, xtmp1, vec_enc);
5900   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5901   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5902   vpshufb(dst, xtmp2, dst, vec_enc);
5903   vpaddb(dst, dst, xtmp1, vec_enc);
5904 }
5905 
5906 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5907                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5908   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5909   // Following code is as per steps e,f,g and h of above algorithm.
5910   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5911   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5912   vpsadbw(dst, dst, xtmp2, vec_enc);
5913   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5914   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5915   vpackuswb(dst, xtmp1, dst, vec_enc);
5916 }
5917 
5918 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5919                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5920   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5921   // Add the popcount of upper and lower bytes of word.
5922   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5923   vpsrlw(dst, xtmp1, 8, vec_enc);
5924   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5925   vpaddw(dst, dst, xtmp1, vec_enc);
5926 }
5927 
5928 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5929                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5930   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5931   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5932   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5933 }
5934 
5935 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5936                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5937   switch(bt) {
5938     case T_LONG:
5939       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5940       break;
5941     case T_INT:
5942       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5943       break;
5944     case T_CHAR:
5945     case T_SHORT:
5946       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5947       break;
5948     case T_BYTE:
5949     case T_BOOLEAN:
5950       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5951       break;
5952     default:
5953       fatal("Unsupported type %s", type2name(bt));
5954       break;
5955   }
5956 }
5957 
5958 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5959                                                       KRegister mask, bool merge, int vec_enc) {
5960   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5961   switch(bt) {
5962     case T_LONG:
5963       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5964       evpopcntq(dst, mask, src, merge, vec_enc);
5965       break;
5966     case T_INT:
5967       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5968       evpopcntd(dst, mask, src, merge, vec_enc);
5969       break;
5970     case T_CHAR:
5971     case T_SHORT:
5972       assert(VM_Version::supports_avx512_bitalg(), "");
5973       evpopcntw(dst, mask, src, merge, vec_enc);
5974       break;
5975     case T_BYTE:
5976     case T_BOOLEAN:
5977       assert(VM_Version::supports_avx512_bitalg(), "");
5978       evpopcntb(dst, mask, src, merge, vec_enc);
5979       break;
5980     default:
5981       fatal("Unsupported type %s", type2name(bt));
5982       break;
5983   }
5984 }
5985 
5986 #ifndef _LP64
5987 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5988   assert(VM_Version::supports_avx512bw(), "");
5989   kmovdl(tmp, src);
5990   kunpckdql(dst, tmp, tmp);
5991 }
5992 #endif
5993 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of the upper and lower
// nibbles of that byte.
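// Worked example (single byte, illustrative): src byte = 0xD2 (1101 0010).
//   LUT(lower nibble 0010) = 0100, shifted into the upper nibble position;
//   LUT(upper nibble 1101) = 1011, shifted into the lower nibble position;
//   OR-ing both gives 0100 1011 = 0x4B, the bit-reversed value of 0xD2.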
6000 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6001                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6002   if (VM_Version::supports_avx512vlbw()) {
6003 
6004     // Get the reverse bit sequence of lower nibble of each byte.
6005     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6006     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6007     evpandq(dst, xtmp2, src, vec_enc);
6008     vpshufb(dst, xtmp1, dst, vec_enc);
6009     vpsllq(dst, dst, 4, vec_enc);
6010 
6011     // Get the reverse bit sequence of upper nibble of each byte.
6012     vpandn(xtmp2, xtmp2, src, vec_enc);
6013     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6014     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6015 
6016     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6017     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6018     evporq(xtmp2, dst, xtmp2, vec_enc);
6019     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6020 
6021   } else if(vec_enc == Assembler::AVX_512bit) {
6022     // Shift based bit reversal.
6023     assert(bt == T_LONG || bt == T_INT, "");
6024 
6025     // Swap lower and upper nibble of each byte.
6026     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6027 
6028     // Swap two least and most significant bits of each nibble.
6029     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6030 
6031     // Swap adjacent pair of bits.
6032     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6033     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6034 
6035     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6036     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6037   } else {
6038     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6039     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6040 
6041     // Get the reverse bit sequence of lower nibble of each byte.
6042     vpand(dst, xtmp2, src, vec_enc);
6043     vpshufb(dst, xtmp1, dst, vec_enc);
6044     vpsllq(dst, dst, 4, vec_enc);
6045 
6046     // Get the reverse bit sequence of upper nibble of each byte.
6047     vpandn(xtmp2, xtmp2, src, vec_enc);
6048     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6049     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6050 
6051     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6052     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6053     vpor(xtmp2, dst, xtmp2, vec_enc);
6054     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6055   }
6056 }
6057 
6058 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6059                                                 XMMRegister xtmp, Register rscratch) {
6060   assert(VM_Version::supports_gfni(), "");
6061   assert(rscratch != noreg || always_reachable(mask), "missing");
6062 
  // Galois field instruction based bit reversal, following the algorithm described at
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
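  // The broadcast mask is expected to hold the 8x8 bit matrix that reverses the bit order
  // within each byte (the constant 0x8040201008040201 used by reverseI/reverseL below), so a
  // single vgf2p8affineqb performs the per-byte bit reversal and only the byte-level reversal
  // remains for multi-byte element types.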
6065   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6066   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6067   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6068 }
6069 
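// Swaps adjacent nbits-wide bit groups selected by bitmask, within each 64 bit lane:
//   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
// e.g. nbits = 4 with bitmask 0x0F0F0F0F swaps the two nibbles of every byte.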
6070 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6071                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6072   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6073   evpandq(dst, xtmp1, src, vec_enc);
6074   vpsllq(dst, dst, nbits, vec_enc);
6075   vpandn(xtmp1, xtmp1, src, vec_enc);
6076   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6077   evporq(dst, dst, xtmp1, vec_enc);
6078 }
6079 
6080 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6081                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6082   // Shift based bit reversal.
6083   assert(VM_Version::supports_evex(), "");
6084   switch(bt) {
6085     case T_LONG:
6086       // Swap upper and lower double word of each quad word.
6087       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6088       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6089       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6090       break;
6091     case T_INT:
6092       // Swap upper and lower word of each double word.
6093       evprord(xtmp1, k0, src, 16, true, vec_enc);
6094       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6095       break;
6096     case T_CHAR:
6097     case T_SHORT:
6098       // Swap upper and lower byte of each word.
6099       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6100       break;
6101     case T_BYTE:
6102       evmovdquq(dst, k0, src, true, vec_enc);
6103       break;
6104     default:
6105       fatal("Unsupported type %s", type2name(bt));
6106       break;
6107   }
6108 }
6109 
6110 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6111   if (bt == T_BYTE) {
6112     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6113       evmovdquq(dst, k0, src, true, vec_enc);
6114     } else {
6115       vmovdqu(dst, src);
6116     }
6117     return;
6118   }
6119   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6120   // pre-computed shuffle indices.
6121   switch(bt) {
6122     case T_LONG:
6123       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6124       break;
6125     case T_INT:
6126       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6127       break;
6128     case T_CHAR:
6129     case T_SHORT:
6130       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6131       break;
6132     default:
6133       fatal("Unsupported type %s", type2name(bt));
6134       break;
6135   }
6136   vpshufb(dst, src, dst, vec_enc);
6137 }
6138 
6139 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6140                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6141                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6142   assert(is_integral_type(bt), "");
6143   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6144   assert(VM_Version::supports_avx512cd(), "");
6145   switch(bt) {
6146     case T_LONG:
6147       evplzcntq(dst, ktmp, src, merge, vec_enc);
6148       break;
6149     case T_INT:
6150       evplzcntd(dst, ktmp, src, merge, vec_enc);
6151       break;
6152     case T_SHORT:
6153       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6154       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6155       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6156       vpunpckhwd(dst, xtmp1, src, vec_enc);
6157       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6158       vpackusdw(dst, xtmp2, dst, vec_enc);
6159       break;
6160     case T_BYTE:
6161       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6162       // accessing the lookup table.
6163       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6164       // accessing the lookup table.
6165       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6166       assert(VM_Version::supports_avx512bw(), "");
6167       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6168       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6169       vpand(xtmp2, dst, src, vec_enc);
6170       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6171       vpsrlw(xtmp3, src, 4, vec_enc);
6172       vpand(xtmp3, dst, xtmp3, vec_enc);
6173       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6174       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6175       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6176       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6177       break;
6178     default:
6179       fatal("Unsupported type %s", type2name(bt));
6180       break;
6181   }
6182 }
6183 
6184 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6185                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6186   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6187   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6188   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6189   // accessing the lookup table.
6190   vpand(dst, xtmp2, src, vec_enc);
6191   vpshufb(dst, xtmp1, dst, vec_enc);
6192   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6193   // accessing the lookup table.
6194   vpsrlw(xtmp3, src, 4, vec_enc);
6195   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6196   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6197   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6198   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6199   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6200   vpaddb(dst, dst, xtmp2, vec_enc);
6201   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6202 }
6203 
6204 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6205                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6206   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6207   // Add zero counts of lower byte and upper byte of a word if
6208   // upper byte holds a zero value.
6209   vpsrlw(xtmp3, src, 8, vec_enc);
6210   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6211   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6212   vpsllw(xtmp2, dst, 8, vec_enc);
6213   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6214   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6215   vpsrlw(dst, dst, 8, vec_enc);
6216 }
6217 
6218 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6219                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
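  // Illustrative example: a source lane of 8 converts to 8.0f (0x41000000), biased_exp = 130,
  // so exponent = 130 - 127 + 1 = 4 and LZCNT = 32 - 4 = 28, which is lzcnt(8) for a 32 bit lane.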
6225 
6226   // Broadcast 0xFF
6227   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6228   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6229 
6230   // Extract biased exponent.
6231   vcvtdq2ps(dst, src, vec_enc);
6232   vpsrld(dst, dst, 23, vec_enc);
6233   vpand(dst, dst, xtmp1, vec_enc);
6234 
6235   // Broadcast 127.
6236   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6237   // Exponent = biased_exp - 127
6238   vpsubd(dst, dst, xtmp1, vec_enc);
6239 
6240   // Exponent = Exponent  + 1
6241   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6242   vpaddd(dst, dst, xtmp3, vec_enc);
6243 
6244   // Replace -ve exponent with zero, exponent is -ve when src
6245   // lane contains a zero value.
6246   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6247   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6248 
6249   // Rematerialize broadcast 32.
6250   vpslld(xtmp1, xtmp3, 5, vec_enc);
6251   // Exponent is 32 if corresponding source lane contains max_int value.
6252   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6253   // LZCNT = 32 - exponent
6254   vpsubd(dst, xtmp1, dst, vec_enc);
6255 
6256   // Replace LZCNT with a value 1 if corresponding source lane
6257   // contains max_int value.
6258   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6259 
6260   // Replace biased_exp with 0 if source lane value is less than zero.
6261   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6262   vblendvps(dst, dst, xtmp2, src, vec_enc);
6263 }
6264 
6265 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6266                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6267   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6268   // Add zero counts of lower word and upper word of a double word if
6269   // upper word holds a zero value.
6270   vpsrld(xtmp3, src, 16, vec_enc);
6271   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6272   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6273   vpslld(xtmp2, dst, 16, vec_enc);
6274   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6275   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6276   vpsrld(dst, dst, 16, vec_enc);
6277   // Add zero counts of lower doubleword and upper doubleword of a
6278   // quadword if upper doubleword holds a zero value.
6279   vpsrlq(xtmp3, src, 32, vec_enc);
6280   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6281   vpsllq(xtmp2, dst, 32, vec_enc);
6282   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6283   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6284   vpsrlq(dst, dst, 32, vec_enc);
6285 }
6286 
6287 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6288                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6289                                                        Register rtmp, int vec_enc) {
6290   assert(is_integral_type(bt), "unexpected type");
6291   assert(vec_enc < Assembler::AVX_512bit, "");
6292   switch(bt) {
6293     case T_LONG:
6294       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6295       break;
6296     case T_INT:
6297       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6298       break;
6299     case T_SHORT:
6300       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6301       break;
6302     case T_BYTE:
6303       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6304       break;
6305     default:
6306       fatal("Unsupported type %s", type2name(bt));
6307       break;
6308   }
6309 }
6310 
6311 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6312   switch(bt) {
6313     case T_BYTE:
6314       vpsubb(dst, src1, src2, vec_enc);
6315       break;
6316     case T_SHORT:
6317       vpsubw(dst, src1, src2, vec_enc);
6318       break;
6319     case T_INT:
6320       vpsubd(dst, src1, src2, vec_enc);
6321       break;
6322     case T_LONG:
6323       vpsubq(dst, src1, src2, vec_enc);
6324       break;
6325     default:
6326       fatal("Unsupported type %s", type2name(bt));
6327       break;
6328   }
6329 }
6330 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature which offers
// a direct vector instruction to compute leading zero counts.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
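// Illustrative example (32 bit lane): x = 8 (0b1000) gives (x - 1) & ~x = 0b0111,
// CLZ(0b0111) = 29 and CTZ = 32 - 29 = 3; for x = 0 the intermediate is all ones,
// CLZ = 0 and CTZ = 32 as required.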
6335 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6336                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6337                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6338   assert(is_integral_type(bt), "");
6339   // xtmp = -1
6340   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6341   // xtmp = xtmp + src
6342   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6343   // xtmp = xtmp & ~src
6344   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6345   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6346   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6347   vpsub(bt, dst, xtmp4, dst, vec_enc);
6348 }
6349 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
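// Illustrative example (32 bit lane): x = 8 gives x | -x = 0xFFFFFFF8, POPC = 29 and
// CTZ = 32 - 29 = 3; for x = 0 the expression is 0, POPC = 0 and CTZ = 32 as required.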
6352 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6353                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6354   assert(is_integral_type(bt), "");
6355   // xtmp = 0
6356   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6357   // xtmp = 0 - src
6358   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6359   // xtmp = xtmp | src
6360   vpor(xtmp3, xtmp3, src, vec_enc);
6361   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6362   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6363   vpsub(bt, dst, xtmp1, dst, vec_enc);
6364 }
6365 
6366 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6367   Label done;
6368   Label neg_divisor_fastpath;
6369   cmpl(divisor, 0);
6370   jccb(Assembler::less, neg_divisor_fastpath);
6371   xorl(rdx, rdx);
6372   divl(divisor);
6373   jmpb(done);
6374   bind(neg_divisor_fastpath);
6375   // Fastpath for divisor < 0:
6376   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6377   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
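  // Illustrative example: dividend = 0xFFFFFFFE, divisor = 0xFFFFFFFD (both "negative" as signed):
  // dividend - divisor = 1, dividend & ~1 = 0xFFFFFFFE, and >>> 31 yields quotient 1; with
  // dividend = 7 the intermediate is 7 & ~0xA = 5 and >>> 31 yields quotient 0, matching
  // unsigned division by a divisor whose sign bit is set.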
6378   movl(rdx, rax);
6379   subl(rdx, divisor);
6380   if (VM_Version::supports_bmi1()) {
6381     andnl(rax, rdx, rax);
6382   } else {
6383     notl(rdx);
6384     andl(rax, rdx);
6385   }
6386   shrl(rax, 31);
6387   bind(done);
6388 }
6389 
6390 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6391   Label done;
6392   Label neg_divisor_fastpath;
6393   cmpl(divisor, 0);
6394   jccb(Assembler::less, neg_divisor_fastpath);
6395   xorl(rdx, rdx);
6396   divl(divisor);
6397   jmpb(done);
6398   bind(neg_divisor_fastpath);
6399   // Fastpath when divisor < 0:
6400   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6401   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
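  // Illustrative example: dividend = 0xFFFFFFFE, divisor = 0xFFFFFFFD: the quotient bit is 1,
  // so remainder = 0xFFFFFFFE - 0xFFFFFFFD = 1; with dividend = 7 the quotient bit is 0 and
  // the remainder is the dividend itself, matching unsigned remainder semantics.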
6402   movl(rdx, rax);
6403   subl(rax, divisor);
6404   if (VM_Version::supports_bmi1()) {
6405     andnl(rax, rax, rdx);
6406   } else {
6407     notl(rax);
6408     andl(rax, rdx);
6409   }
6410   sarl(rax, 31);
6411   andl(rax, divisor);
6412   subl(rdx, rax);
6413   bind(done);
6414 }
6415 
6416 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6417   Label done;
6418   Label neg_divisor_fastpath;
6419 
6420   cmpl(divisor, 0);
6421   jccb(Assembler::less, neg_divisor_fastpath);
6422   xorl(rdx, rdx);
6423   divl(divisor);
6424   jmpb(done);
6425   bind(neg_divisor_fastpath);
6426   // Fastpath for divisor < 0:
6427   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6428   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6429   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6430   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6431   movl(rdx, rax);
6432   subl(rax, divisor);
6433   if (VM_Version::supports_bmi1()) {
6434     andnl(rax, rax, rdx);
6435   } else {
6436     notl(rax);
6437     andl(rax, rdx);
6438   }
6439   movl(tmp, rax);
6440   shrl(rax, 31); // quotient
6441   sarl(tmp, 31);
6442   andl(tmp, divisor);
6443   subl(rdx, tmp); // remainder
6444   bind(done);
6445 }
6446 
6447 #ifdef _LP64
6448 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6449                                  XMMRegister xtmp2, Register rtmp) {
6450   if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6453     mov64(rtmp, 0x8040201008040201L);
6454     movq(xtmp1, src);
6455     movq(xtmp2, rtmp);
6456     gf2p8affineqb(xtmp1, xtmp2, 0);
6457     movq(dst, xtmp1);
6458   } else {
6459     // Swap even and odd numbered bits.
6460     movl(rtmp, src);
6461     andl(rtmp, 0x55555555);
6462     shll(rtmp, 1);
6463     movl(dst, src);
6464     andl(dst, 0xAAAAAAAA);
6465     shrl(dst, 1);
6466     orl(dst, rtmp);
6467 
6468     // Swap LSB and MSB 2 bits of each nibble.
6469     movl(rtmp, dst);
6470     andl(rtmp, 0x33333333);
6471     shll(rtmp, 2);
6472     andl(dst, 0xCCCCCCCC);
6473     shrl(dst, 2);
6474     orl(dst, rtmp);
6475 
6476     // Swap LSB and MSB 4 bits of each byte.
6477     movl(rtmp, dst);
6478     andl(rtmp, 0x0F0F0F0F);
6479     shll(rtmp, 4);
6480     andl(dst, 0xF0F0F0F0);
6481     shrl(dst, 4);
6482     orl(dst, rtmp);
6483   }
6484   bswapl(dst);
6485 }
6486 
6487 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6488                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6489   if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6492     mov64(rtmp1, 0x8040201008040201L);
6493     movq(xtmp1, src);
6494     movq(xtmp2, rtmp1);
6495     gf2p8affineqb(xtmp1, xtmp2, 0);
6496     movq(dst, xtmp1);
6497   } else {
6498     // Swap even and odd numbered bits.
6499     movq(rtmp1, src);
6500     mov64(rtmp2, 0x5555555555555555L);
6501     andq(rtmp1, rtmp2);
6502     shlq(rtmp1, 1);
6503     movq(dst, src);
6504     notq(rtmp2);
6505     andq(dst, rtmp2);
6506     shrq(dst, 1);
6507     orq(dst, rtmp1);
6508 
6509     // Swap LSB and MSB 2 bits of each nibble.
6510     movq(rtmp1, dst);
6511     mov64(rtmp2, 0x3333333333333333L);
6512     andq(rtmp1, rtmp2);
6513     shlq(rtmp1, 2);
6514     notq(rtmp2);
6515     andq(dst, rtmp2);
6516     shrq(dst, 2);
6517     orq(dst, rtmp1);
6518 
6519     // Swap LSB and MSB 4 bits of each byte.
6520     movq(rtmp1, dst);
6521     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6522     andq(rtmp1, rtmp2);
6523     shlq(rtmp1, 4);
6524     notq(rtmp2);
6525     andq(dst, rtmp2);
6526     shrq(dst, 4);
6527     orq(dst, rtmp1);
6528   }
6529   bswapq(dst);
6530 }
6531 
6532 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6533   Label done;
6534   Label neg_divisor_fastpath;
6535   cmpq(divisor, 0);
6536   jccb(Assembler::less, neg_divisor_fastpath);
6537   xorl(rdx, rdx);
6538   divq(divisor);
6539   jmpb(done);
6540   bind(neg_divisor_fastpath);
6541   // Fastpath for divisor < 0:
6542   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6543   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6544   movq(rdx, rax);
6545   subq(rdx, divisor);
6546   if (VM_Version::supports_bmi1()) {
6547     andnq(rax, rdx, rax);
6548   } else {
6549     notq(rdx);
6550     andq(rax, rdx);
6551   }
6552   shrq(rax, 63);
6553   bind(done);
6554 }
6555 
6556 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6557   Label done;
6558   Label neg_divisor_fastpath;
6559   cmpq(divisor, 0);
6560   jccb(Assembler::less, neg_divisor_fastpath);
6561   xorq(rdx, rdx);
6562   divq(divisor);
6563   jmp(done);
6564   bind(neg_divisor_fastpath);
6565   // Fastpath when divisor < 0:
6566   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6567   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6568   movq(rdx, rax);
6569   subq(rax, divisor);
6570   if (VM_Version::supports_bmi1()) {
6571     andnq(rax, rax, rdx);
6572   } else {
6573     notq(rax);
6574     andq(rax, rdx);
6575   }
6576   sarq(rax, 63);
6577   andq(rax, divisor);
6578   subq(rdx, rax);
6579   bind(done);
6580 }
6581 
6582 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6583   Label done;
6584   Label neg_divisor_fastpath;
6585   cmpq(divisor, 0);
6586   jccb(Assembler::less, neg_divisor_fastpath);
6587   xorq(rdx, rdx);
6588   divq(divisor);
6589   jmp(done);
6590   bind(neg_divisor_fastpath);
6591   // Fastpath for divisor < 0:
6592   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6593   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6594   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6595   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6596   movq(rdx, rax);
6597   subq(rax, divisor);
6598   if (VM_Version::supports_bmi1()) {
6599     andnq(rax, rax, rdx);
6600   } else {
6601     notq(rax);
6602     andq(rax, rdx);
6603   }
6604   movq(tmp, rax);
6605   shrq(rax, 63); // quotient
6606   sarq(tmp, 63);
6607   andq(tmp, divisor);
6608   subq(rdx, tmp); // remainder
6609   bind(done);
6610 }
6611 #endif
6612 
6613 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6614                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6615                                         int vlen_enc) {
6616   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This ensures that indices which differ by
  // a multiple of 16 map to the same relative position within a 128 bit
  // lane, i.e. shuffle indices 16, 32 and 48 all select the 0th element of
  // their respective 128 bit source lanes.
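  // For example, shuffle index 37 (0x25) falls in the range [32, 48), so the third 128 bit
  // lane of src is broadcast and the normalized in-lane index 37 & 0xF = 5 selects source byte 37.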
6623   movl(rtmp, 16);
6624   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6625 
6626   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6627   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6628   // original shuffle indices and move the shuffled lanes corresponding to true
6629   // mask to destination vector.
6630   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6631   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6632   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6633 
6634   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6635   // and broadcasting second 128 bit lane.
6636   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6637   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6638   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6639   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6640   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6641 
6642   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6643   // and broadcasting third 128 bit lane.
6644   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6645   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6646   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6647   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6648   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6649 
6650   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6652   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6653   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6654   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6655   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6656   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6657 }
6658 
6659 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6660                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6661   if (vlen_enc == AVX_128bit) {
6662     vpermilps(dst, src, shuffle, vlen_enc);
6663   } else if (bt == T_INT) {
6664     vpermd(dst, shuffle, src, vlen_enc);
6665   } else {
6666     assert(bt == T_FLOAT, "");
6667     vpermps(dst, shuffle, src, vlen_enc);
6668   }
6669 }
6670 
6671 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6672   switch(elem_bt) {
6673     case T_BYTE:
6674       if (ideal_opc == Op_SaturatingAddV) {
6675         vpaddsb(dst, src1, src2, vlen_enc);
6676       } else {
6677         assert(ideal_opc == Op_SaturatingSubV, "");
6678         vpsubsb(dst, src1, src2, vlen_enc);
6679       }
6680       break;
6681     case T_SHORT:
6682       if (ideal_opc == Op_SaturatingAddV) {
6683         vpaddsw(dst, src1, src2, vlen_enc);
6684       } else {
6685         assert(ideal_opc == Op_SaturatingSubV, "");
6686         vpsubsw(dst, src1, src2, vlen_enc);
6687       }
6688       break;
6689     default:
6690       fatal("Unsupported type %s", type2name(elem_bt));
6691       break;
6692   }
6693 }
6694 
6695 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6696   switch(elem_bt) {
6697     case T_BYTE:
6698       if (ideal_opc == Op_SaturatingAddV) {
6699         vpaddusb(dst, src1, src2, vlen_enc);
6700       } else {
6701         assert(ideal_opc == Op_SaturatingSubV, "");
6702         vpsubusb(dst, src1, src2, vlen_enc);
6703       }
6704       break;
6705     case T_SHORT:
6706       if (ideal_opc == Op_SaturatingAddV) {
6707         vpaddusw(dst, src1, src2, vlen_enc);
6708       } else {
6709         assert(ideal_opc == Op_SaturatingSubV, "");
6710         vpsubusw(dst, src1, src2, vlen_enc);
6711       }
6712       break;
6713     default:
6714       fatal("Unsupported type %s", type2name(elem_bt));
6715       break;
6716   }
6717 }
6718 
6719 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6720                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6721   // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6722   // overflow_mask = Inp1 <u Inp2
6723   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6724   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6725   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6726 }
6727 
6728 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6729                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6730   // Emulate unsigned comparison using signed comparison
6731   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
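  // Illustrative example (32 bit lanes): 1 <u 0xFFFFFFFF; after biasing both sides by MIN_VALUE
  // this becomes 0x80000001 <s 0x7FFFFFFF, i.e. INT_MIN + 1 < INT_MAX, which is true as expected.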
6732   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6733   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6734   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6735 
6736   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6737 
6738   // Res = INP1 - INP2 (non-commutative and non-associative)
6739   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6740   // Res = Mask ? Zero : Res
6741   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6742   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6743 }
6744 
6745 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6746                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6748   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6749   // Res = Signed Add INP1, INP2
6750   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6751   // T1 = SRC1 | SRC2
6752   vpor(xtmp1, src1, src2, vlen_enc);
6753   // Max_Unsigned = -1
6754   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6755   // Unsigned compare:  Mask = Res <u T1
6756   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6757   // res  = Mask ? Max_Unsigned : Res
6758   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6759 }
6760 
6761 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for a saturating
// unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask =  (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6770 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6771 //
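// Illustrative example (32 bit lanes): a = 0xFFFFFFF0, b = 0x20: a + b wraps to 0x10 while
// a | b = 0xFFFFFFF0, and 0x10 <u 0xFFFFFFF0 flags the overflow, so the lane saturates to
// 0xFFFFFFFF; for a = 5, b = 9 the sum 14 is not below (a | b) = 13, so 14 is kept.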
6772 
6773 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6774                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6775   // Res = Signed Add INP1, INP2
6776   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6777   // Compute T1 = INP1 | INP2
6778   vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value (MIN_VALUE); xtmp1 is left holding all ones (the unsigned max).
6780   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6781   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6782   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6783   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6784   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6786   if (elem_bt == T_INT) {
6787     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6788   } else {
6789     assert(elem_bt == T_LONG, "");
6790     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6791   }
6792   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6793 }
6794 
6795 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6796                                       int vlen_enc, bool xtmp2_hold_M1) {
6797   if (VM_Version::supports_avx512dq()) {
6798     evpmovq2m(ktmp, src, vlen_enc);
6799   } else {
6800     assert(VM_Version::supports_evex(), "");
6801     if (!xtmp2_hold_M1) {
6802       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6803     }
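         // No AVX512DQ: replicate the sign bit across each 64-bit lane and compare
         // the result against the all-ones vector in xtmp2, setting the mask bit for
         // negative lanes.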
6804     evpsraq(xtmp1, src, 63, vlen_enc);
6805     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6806   }
6807 }
6808 
6809 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6810                                       int vlen_enc, bool xtmp2_hold_M1) {
6811   if (VM_Version::supports_avx512dq()) {
6812     evpmovd2m(ktmp, src, vlen_enc);
6813   } else {
6814     assert(VM_Version::supports_evex(), "");
6815     if (!xtmp2_hold_M1) {
6816       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6817     }
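         // Same emulation for 32-bit lanes: propagate the sign bit across each lane
         // and compare against the all-ones vector in xtmp2.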
6818     vpsrad(xtmp1, src, 31, vlen_enc);
6819     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6820   }
6821 }
6822 
6823 
6824 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6825   if (elem_bt == T_LONG) {
6826     if (VM_Version::supports_evex()) {
6827       evpsraq(dst, src, 63, vlen_enc);
6828     } else {
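           // Without a 64-bit arithmetic right shift (no EVEX), shift each dword by 31
           // and replicate the odd (upper) dwords into both halves of each quadword
           // (shuffle selector 0xF5 picks dwords 1,1,3,3).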
6829       vpsrad(dst, src, 31, vlen_enc);
6830       vpshufd(dst, dst, 0xF5, vlen_enc);
6831     }
6832   } else {
6833     assert(elem_bt == T_INT, "");
6834     vpsrad(dst, src, 31, vlen_enc);
6835   }
6836 }
6837 
6838 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6839   if (compute_allones) {
6840     if (vlen_enc == Assembler::AVX_512bit) {
6841       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6842     } else {
6843       vpcmpeqq(allones, allones, allones, vlen_enc);
6844     }
6845   }
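       // MAX_VALUE = all-ones logically shifted right by one (0x7FFF...).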
6846   if (elem_bt == T_LONG) {
6847     vpsrlq(dst, allones, 1, vlen_enc);
6848   } else {
6849     assert(elem_bt == T_INT, "");
6850     vpsrld(dst, allones, 1, vlen_enc);
6851   }
6852 }
6853 
6854 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6855   if (compute_allones) {
6856     if (vlen_enc == Assembler::AVX_512bit) {
6857       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6858     } else {
6859       vpcmpeqq(allones, allones, allones, vlen_enc);
6860     }
6861   }
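       // MIN_VALUE = all-ones shifted left so that only the sign bit remains (0x8000...).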
6862   if (elem_bt == T_LONG) {
6863     vpsllq(dst, allones, 63, vlen_enc);
6864   } else {
6865     assert(elem_bt == T_INT, "");
6866     vpslld(dst, allones, 31, vlen_enc);
6867   }
6868 }
6869 
6870 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6871                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6872   switch(elem_bt) {
6873     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6874     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6875     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6876     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6877     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6878   }
6879 }
6880 
6881 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6882   switch(elem_bt) {
6883     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6884     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6885     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6886     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6887     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6888   }
6889 }
6890 
6891 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6892                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6893   if (elem_bt == T_LONG) {
6894     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6895   } else {
6896     assert(elem_bt == T_INT, "");
6897     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6898   }
6899 }
6900 
6901 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6902                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6903                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6904   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6905   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6906   // Overflow detection is based on Hacker's Delight, section 2-13.
6907   if (ideal_opc == Op_SaturatingAddV) {
6908     // res = src1 + src2
6909     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6910     // Overflow occurs when both inputs have the same polarity and the result's polarity differs from it.
6911     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6912     vpxor(xtmp1, dst, src1, vlen_enc);
6913     vpxor(xtmp2, dst, src2, vlen_enc);
6914     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6915   } else {
6916     assert(ideal_opc == Op_SaturatingSubV, "");
6917     // res = src1 - src2
6918     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6919     // Overflow occurs when the inputs have opposite polarities and the
6920     // result's polarity differs from that of the first input.
6921     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
6922     vpxor(xtmp1, src1, src2, vlen_enc);
6923     vpxor(xtmp2, dst, src1, vlen_enc);
6924     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6925   }
6926 
6927   // Compute overflow detection mask.
6928   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6929   // Note: xtmp1 holds -1 in all its lanes after the above call.
6930 
6931   // Compute mask based on first input polarity.
6932   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6933 
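       // Derive MAX_VALUE into xtmp2 and MIN_VALUE into xtmp1 from an all-ones vector.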
6934   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6935   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6936 
6937   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6938   // set bits in the first-input polarity mask hold the MIN value.
6939   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6940   // Blend destination lanes with saturated values using overflow detection mask.
6941   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6942 }
6943 
6944 
6945 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6946                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6947                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6948   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6949   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6950   // Overflow detection is based on Hacker's Delight, section 2-13.
6951   if (ideal_opc == Op_SaturatingAddV) {
6952     // res = src1 + src2
6953     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6954     // Overflow occurs when both inputs have the same polarity and the result's polarity differs from it.
6955     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6956     vpxor(xtmp1, dst, src1, vlen_enc);
6957     vpxor(xtmp2, dst, src2, vlen_enc);
6958     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6959   } else {
6960     assert(ideal_opc == Op_SaturatingSubV, "");
6961     // res = src1 - src2
6962     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6963     // Overflow occurs when the inputs have opposite polarities and the
6964     // result's polarity differs from that of the first input.
6965     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
6966     vpxor(xtmp1, src1, src2, vlen_enc);
6967     vpxor(xtmp2, dst, src1, vlen_enc);
6968     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6969   }
6970 
6971   // Sign-extend to compute overflow detection mask.
6972   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6973 
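       // xtmp1 = all-ones; derive MAX_VALUE into xtmp2 and MIN_VALUE into xtmp1.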
6974   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6975   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6976   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6977 
6978   // Compose the saturating MIN/MAX vector using the first-input polarity mask.
6979   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6980   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6981 
6982   // Blend result with saturating vector using overflow detection mask.
6983   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6984 }
6985 
6986 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6987   switch(elem_bt) {
6988     case T_BYTE:
6989       if (ideal_opc == Op_SaturatingAddV) {
6990         vpaddsb(dst, src1, src2, vlen_enc);
6991       } else {
6992         assert(ideal_opc == Op_SaturatingSubV, "");
6993         vpsubsb(dst, src1, src2, vlen_enc);
6994       }
6995       break;
6996     case T_SHORT:
6997       if (ideal_opc == Op_SaturatingAddV) {
6998         vpaddsw(dst, src1, src2, vlen_enc);
6999       } else {
7000         assert(ideal_opc == Op_SaturatingSubV, "");
7001         vpsubsw(dst, src1, src2, vlen_enc);
7002       }
7003       break;
7004     default:
7005       fatal("Unsupported type %s", type2name(elem_bt));
7006       break;
7007   }
7008 }
7009 
7010 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7011   switch(elem_bt) {
7012     case T_BYTE:
7013       if (ideal_opc == Op_SaturatingAddV) {
7014         vpaddusb(dst, src1, src2, vlen_enc);
7015       } else {
7016         assert(ideal_opc == Op_SaturatingSubV, "");
7017         vpsubusb(dst, src1, src2, vlen_enc);
7018       }
7019       break;
7020     case T_SHORT:
7021       if (ideal_opc == Op_SaturatingAddV) {
7022         vpaddusw(dst, src1, src2, vlen_enc);
7023       } else {
7024         assert(ideal_opc == Op_SaturatingSubV, "");
7025         vpsubusw(dst, src1, src2, vlen_enc);
7026       }
7027       break;
7028     default:
7029       fatal("Unsupported type %s", type2name(elem_bt));
7030       break;
7031   }
7032 }
7033 
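     // Select lanes from the concatenation of two source vectors: dst initially holds
     // the per-lane indices and is overwritten with the selected elements, as defined
     // by the VPERMI2 family of instructions.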
7034 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7035                                                      XMMRegister src2, int vlen_enc) {
7036   switch(elem_bt) {
7037     case T_BYTE:
7038       evpermi2b(dst, src1, src2, vlen_enc);
7039       break;
7040     case T_SHORT:
7041       evpermi2w(dst, src1, src2, vlen_enc);
7042       break;
7043     case T_INT:
7044       evpermi2d(dst, src1, src2, vlen_enc);
7045       break;
7046     case T_LONG:
7047       evpermi2q(dst, src1, src2, vlen_enc);
7048       break;
7049     case T_FLOAT:
7050       evpermi2ps(dst, src1, src2, vlen_enc);
7051       break;
7052     case T_DOUBLE:
7053       evpermi2pd(dst, src1, src2, vlen_enc);
7054       break;
7055     default:
7056       fatal("Unsupported type %s", type2name(elem_bt));
7057       break;
7058   }
7059 }
7060 
7061 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7062   if (is_unsigned) {
7063     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7064   } else {
7065     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7066   }
7067 }
7068 
7069 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7070   if (is_unsigned) {
7071     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7072   } else {
7073     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7074   }
7075 }