1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // while the frame allocation can be either 3 or 6 bytes. So if we don't do
  // a stack bang then we must use the 6-byte frame allocation even if
  // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
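
  // In both branches above the frame now looks roughly like this, with rbp
  // saved just below the return address:
  //   [ return address ]
  //   [ saved rbp      ]  <- rsp + framesize
  //   [ spill area     ]
  //                        <- rsp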
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
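// Map a vector length in bytes to its AVX encoding. 4- and 8-byte vectors share the
// 128-bit encoding; the emitted instructions operate on the full xmm register and the
// unused upper lanes are simply ignored.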
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 // fast_lock and fast_unlock used by C2
 176 
 177 // Because the transitions from emitted code to the runtime
 178 // monitorenter/exit helper stubs are so slow it's critical that
 179 // we inline both the stack-locking fast path and the inflated fast path.
 180 //
 181 // See also: cmpFastLock and cmpFastUnlock.
 182 //
 183 // What follows is a specialized inline transliteration of the code
 184 // in enter() and exit(). If we're concerned about I$ bloat another
 185 // option would be to emit TrySlowEnter and TrySlowExit methods
 186 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 188 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 190 // In practice, however, the # of lock sites is bounded and is usually small.
 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 204 //
 205 // TODO:
 206 //
 207 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 208 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 209 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 210 //    the lock operators would typically be faster than reifying Self.
 211 //
 212 // *  Ideally I'd define the primitives as:
 213 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 214 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 215 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 217 //    Furthermore the register assignments are overconstrained, possibly resulting in
 218 //    sub-optimal code near the synchronization site.
 219 //
 220 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 221 //    Alternately, use a better sp-proximity test.
 222 //
 223 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 224 //    Either one is sufficient to uniquely identify a thread.
 225 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 226 //
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 230 //
 231 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 232 //    But beware of excessive branch density on AMD Opterons.
 233 //
 234 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 235 //    or failure of the fast path.  If the fast path fails then we pass
 236 //    control to the slow path, typically in C.  In fast_lock and
 237 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 238 //    will emit a conditional branch immediately after the node.
 239 //    So we have branches to branches and lots of ICC.ZF games.
 240 //    Instead, it might be better to have C2 pass a "FailureLabel"
 241 //    into fast_lock and fast_unlock.  In the case of success, control
 242 //    will drop through the node.  ICC.ZF is undefined at exit.
 243 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 245 
 246 
 247 // obj: object to lock
 248 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 250 // scr: tmp -- KILLED
 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 252                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 253                                  Metadata* method_data) {
 254   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 255   // Ensure the register assignments are disjoint
 256   assert(tmpReg == rax, "");
 257   assert(cx1Reg == noreg, "");
 258   assert(cx2Reg == noreg, "");
 259   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 260 
 261   // Possible cases that we'll encounter in fast_lock
 262   // ------------------------------------------------
 263   // * Inflated
 264   //    -- unlocked
 265   //    -- Locked
 266   //       = by self
 267   //       = by other
 268   // * neutral
 269   // * stack-locked
 270   //    -- by self
 271   //       = sp-proximity test hits
 272   //       = sp-proximity test generates false-negative
 273   //    -- by other
 274   //
 275 
 276   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(tmpReg, objReg, scrReg);
 280     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 281     jcc(Assembler::notZero, DONE_LABEL);
 282   }
 283 
 284   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 285   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 286   jcc(Assembler::notZero, IsInflated);
 287 
 288   if (LockingMode == LM_MONITOR) {
 289     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 290     testptr(objReg, objReg);
 291   } else {
 292     assert(LockingMode == LM_LEGACY, "must be");
 293     // Attempt stack-locking ...
 294     orptr (tmpReg, markWord::unlocked_value);
 295     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 296     lock();
 297     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 298     jcc(Assembler::equal, COUNT);           // Success
 299 
 300     // Recursive locking.
 301     // The object is stack-locked: markword contains stack pointer to BasicLock.
 302     // Locked by current thread if difference with current SP is less than one page.
 303     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 305     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
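    // Worked example (LP64, assuming the common 4 KiB page size): 7 - 4096 == -4089
    // == 0xFFFFF007, so the and clears bits 3..11 and keeps the low lock bits plus all
    // bits from the page size upwards. The result (stored into the box below) is zero,
    // i.e. ZF == 1 and a recursive stack-lock, only when the mark is an aligned stack
    // address within one page above rsp.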
 306     movptr(Address(boxReg, 0), tmpReg);
 307   }
 308   jmp(DONE_LABEL);
 309 
 310   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 312 
 313 #ifndef _LP64
 314   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 315   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 316 #else
 317   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 318   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 319   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 320 
 321   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 322   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 323   movq(scrReg, tmpReg);
 324   xorq(tmpReg, tmpReg);
 325   lock();
 326   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 327 
 328   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 329   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 330 
 331   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 332   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 333   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 334   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 335 #endif // _LP64
 336   bind(DONE_LABEL);
 337 
 338   // ZFlag == 1 count in fast path
 339   // ZFlag == 0 count in slow path
 340   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 341 
 342   bind(COUNT);
 343   if (LockingMode == LM_LEGACY) {
 344 #ifdef _LP64
 345     // Count monitors in fast path
 346     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 347 #endif
 348   }
 349   xorl(tmpReg, tmpReg); // Set ZF == 1
 350 
 351   bind(NO_COUNT);
 352 
 353   // At NO_COUNT the icc ZFlag is set as follows ...
 354   // fast_unlock uses the same protocol.
 355   // ZFlag == 1 -> Success
 356   // ZFlag == 0 -> Failure - force control through the slow path
 357 }
 358 
 359 // obj: object to unlock
 360 // box: box address (displaced header location), killed.  Must be EAX.
 361 // tmp: killed, cannot be obj nor box.
 362 //
 363 // Some commentary on balanced locking:
 364 //
 365 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 366 // Methods that don't have provably balanced locking are forced to run in the
 367 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 368 // The interpreter provides two properties:
 369 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 371 //      interpreter maintains an on-stack list of locks currently held by
 372 //      a frame.
 373 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 375 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 377 // B() doesn't have provably balanced locking so it runs in the interpreter.
 378 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 379 // is still locked by A().
 380 //
 381 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 382 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 383 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 384 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
 388 // A perfectly viable alternative is to elide the owner check except when
 389 // Xcheck:jni is enabled.
 390 
 391 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 392   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 393   assert(boxReg == rax, "");
 394   assert_different_registers(objReg, boxReg, tmpReg);
 395 
 396   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 397 
 398   if (LockingMode == LM_LEGACY) {
 399     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 400     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 401   }
 402   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 403   if (LockingMode != LM_MONITOR) {
 404     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 405     jcc(Assembler::zero, Stacked);
 406   }
 407 
 408   // It's inflated.
 409 
 410 #ifndef _LP64
 411   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 412   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 413   jmpb(DONE_LABEL);
 414 #else
 415   // Despite our balanced locking property we still check that m->_owner == Self
  // as Java routines or native JNI code called by this thread might
 417   // have released the lock.
 418   // Refer to the comments in synchronizer.cpp for how we might encode extra
 419   // state in _succ so we can avoid fetching EntryList|cxq.
 420   //
 421   // If there's no contention try a 1-0 exit.  That is, exit without
 422   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 423   // we detect and recover from the race that the 1-0 exit admits.
 424   //
 425   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 426   // before it STs null into _owner, releasing the lock.  Updates
 427   // to data protected by the critical section must be visible before
 428   // we drop the lock (and thus before any other thread could acquire
 429   // the lock and observe the fields protected by the lock).
 430   // IA32's memory-model is SPO, so STs are ordered with respect to
 431   // each other and there's no need for an explicit barrier (fence).
 432   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 433   Label LSuccess, LNotRecursive;
 434 
 435   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 436   jccb(Assembler::equal, LNotRecursive);
 437 
 438   // Recursive inflated unlock
 439   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 440   jmpb(LSuccess);
 441 
 442   bind(LNotRecursive);
 443 
 444   // Set owner to null.
 445   // Release to satisfy the JMM
 446   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 447   // We need a full fence after clearing owner to avoid stranding.
 448   // StoreLoad achieves this.
 449   membar(StoreLoad);
 450 
 451   // Check if the entry lists are empty (EntryList first - by convention).
 452   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 453   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 454   jccb(Assembler::zero, LSuccess);    // If so we are done.
 455 
 456   // Check if there is a successor.
 457   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 458   jccb(Assembler::notZero, LSuccess); // If so we are done.
 459 
 460   // Save the monitor pointer in the current thread, so we can try to
 461   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 462   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 463   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 464 
 465   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 466   jmpb  (DONE_LABEL);
 467 
 468   bind  (LSuccess);
 469   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 470   jmpb  (DONE_LABEL);
 471 #endif  // _LP64
 472 
 473   if (LockingMode == LM_LEGACY) {
 474     bind  (Stacked);
    movptr(tmpReg, Address(boxReg, 0));      // re-fetch
 476     lock();
 477     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 478     // Intentional fall-thru into DONE_LABEL
 479   }
 480 
 481   bind(DONE_LABEL);
 482 
 483   // ZFlag == 1 count in fast path
 484   // ZFlag == 0 count in slow path
 485   jccb(Assembler::notZero, NO_COUNT);
 486 
 487   bind(COUNT);
 488 
 489   if (LockingMode == LM_LEGACY) {
 490     // Count monitors in fast path
 491 #ifdef _LP64
 492     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 493 #endif
 494   }
 495 
 496   xorl(tmpReg, tmpReg); // Set ZF == 1
 497 
 498   bind(NO_COUNT);
 499 }
 500 
 501 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 502                                               Register t, Register thread) {
 503   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 504   assert(rax_reg == rax, "Used for CAS");
 505   assert_different_registers(obj, box, rax_reg, t, thread);
 506 
 507   // Handle inflated monitor.
 508   Label inflated;
 509   // Finish fast lock successfully. ZF value is irrelevant.
 510   Label locked;
 511   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 512   Label slow_path;
 513 
 514   if (UseObjectMonitorTable) {
 515     // Clear cache in case fast locking succeeds.
 516     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 517   }
 518 
 519   if (DiagnoseSyncOnValueBasedClasses != 0) {
 520     load_klass(rax_reg, obj, t);
 521     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 522     jcc(Assembler::notZero, slow_path);
 523   }
 524 
 525   const Register mark = t;
 526 
 527   { // Lightweight Lock
 528 
 529     Label push;
 530 
 531     const Register top = UseObjectMonitorTable ? rax_reg : box;
 532 
 533     // Load the mark.
 534     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 535 
 536     // Prefetch top.
 537     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 538 
 539     // Check for monitor (0b10).
 540     testptr(mark, markWord::monitor_value);
 541     jcc(Assembler::notZero, inflated);
 542 
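    // 'top' is a byte offset from the thread base: thread + top addresses the slot just
    // past the most recently pushed lock-stack entry.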
 543     // Check if lock-stack is full.
 544     cmpl(top, LockStack::end_offset() - 1);
 545     jcc(Assembler::greater, slow_path);
 546 
 547     // Check if recursive.
 548     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 549     jccb(Assembler::equal, push);
 550 
 551     // Try to lock. Transition lock bits 0b01 => 0b00
 552     movptr(rax_reg, mark);
 553     orptr(rax_reg, markWord::unlocked_value);
 554     andptr(mark, ~(int32_t)markWord::unlocked_value);
 555     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 556     jcc(Assembler::notEqual, slow_path);
 557 
 558     if (UseObjectMonitorTable) {
 559       // Need to reload top, clobbered by CAS.
 560       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 561     }
 562     bind(push);
 563     // After successful lock, push object on lock-stack.
 564     movptr(Address(thread, top), obj);
 565     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 566     jmpb(locked);
 567   }
 568 
 569   { // Handle inflated monitor.
 570     bind(inflated);
 571 
 572 #ifndef _LP64
 573     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 574     orl(box, 1);  // set ICC.ZF=0 to indicate failure
 575     jmpb(slow_path);
 576 #else
 577     const Register monitor = t;
 578 
 579     if (!UseObjectMonitorTable) {
 580       assert(mark == monitor, "should be the same here");
 581     } else {
 582       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 583       // Fetch ObjectMonitor* from the cache or take the slow-path.
 584       Label monitor_found;
 585 
 586       // Load cache address
 587       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 588 
 589       const int num_unrolled = 2;
 590       for (int i = 0; i < num_unrolled; i++) {
 591         cmpptr(obj, Address(t));
 592         jccb(Assembler::equal, monitor_found);
 593         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 594       }
 595 
 596       Label loop;
 597 
 598       // Search for obj in cache.
 599       bind(loop);
 600 
 601       // Check for match.
 602       cmpptr(obj, Address(t));
 603       jccb(Assembler::equal, monitor_found);
 604 
 605       // Search until null encountered, guaranteed _null_sentinel at end.
 606       cmpptr(Address(t), 1);
 607       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 608       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 609       jmpb(loop);
 610 
 611       // Cache hit.
 612       bind(monitor_found);
 613       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 614     }
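    // Note: when not using the ObjectMonitorTable, 'monitor' (== mark) still carries the
    // markWord::monitor_value tag; instead of clearing it, the tag is folded into the
    // field offsets computed below.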
 615     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 616     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 617     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 618 
 619     Label monitor_locked;
 620     // Lock the monitor.
 621 
 622     if (UseObjectMonitorTable) {
 623       // Cache the monitor for unlock before trashing box. On failure to acquire
 624       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 625       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 626     }
 627 
 628     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 629     xorptr(rax_reg, rax_reg);
 630     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 631     lock(); cmpxchgptr(box, owner_address);
 632     jccb(Assembler::equal, monitor_locked);
 633 
 634     // Check if recursive.
 635     cmpptr(box, rax_reg);
 636     jccb(Assembler::notEqual, slow_path);
 637 
 638     // Recursive.
 639     increment(recursions_address);
 640 
 641     bind(monitor_locked);
 642 #endif  // _LP64
 643   }
 644 
 645   bind(locked);
 646   // Set ZF = 1
 647   xorl(rax_reg, rax_reg);
 648 
 649 #ifdef ASSERT
 650   // Check that locked label is reached with ZF set.
 651   Label zf_correct;
 652   Label zf_bad_zero;
 653   jcc(Assembler::zero, zf_correct);
 654   jmp(zf_bad_zero);
 655 #endif
 656 
 657   bind(slow_path);
 658 #ifdef ASSERT
 659   // Check that slow_path label is reached with ZF not set.
 660   jcc(Assembler::notZero, zf_correct);
 661   stop("Fast Lock ZF != 0");
 662   bind(zf_bad_zero);
 663   stop("Fast Lock ZF != 1");
 664   bind(zf_correct);
 665 #endif
 666   // C2 uses the value of ZF to determine the continuation.
 667 }
 668 
 669 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 670   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 671   assert(reg_rax == rax, "Used for CAS");
 672   assert_different_registers(obj, reg_rax, t);
 673 
 674   // Handle inflated monitor.
 675   Label inflated, inflated_check_lock_stack;
 676   // Finish fast unlock successfully.  MUST jump with ZF == 1
 677   Label unlocked, slow_path;
 678 
 679   const Register mark = t;
 680   const Register monitor = t;
 681   const Register top = UseObjectMonitorTable ? t : reg_rax;
 682   const Register box = reg_rax;
 683 
 684   Label dummy;
 685   C2FastUnlockLightweightStub* stub = nullptr;
 686 
 687   if (!Compile::current()->output()->in_scratch_emit_size()) {
 688     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 689     Compile::current()->output()->add_stub(stub);
 690   }
 691 
 692   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 693 
 694   { // Lightweight Unlock
 695 
 696     // Load top.
 697     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 698 
 699     if (!UseObjectMonitorTable) {
 700       // Prefetch mark.
 701       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 702     }
 703 
 704     // Check if obj is top of lock-stack.
 705     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 706     // Top of lock stack was not obj. Must be monitor.
 707     jcc(Assembler::notEqual, inflated_check_lock_stack);
 708 
 709     // Pop lock-stack.
 710     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 711     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 712 
 713     // Check if recursive.
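    // ('top' still holds the pre-pop offset, so the next-older entry is at top - 2 * oopSize.)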
 714     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 715     jcc(Assembler::equal, unlocked);
 716 
 717     // We elide the monitor check, let the CAS fail instead.
 718 
 719     if (UseObjectMonitorTable) {
 720       // Load mark.
 721       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 722     }
 723 
 724     // Try to unlock. Transition lock bits 0b00 => 0b01
 725     movptr(reg_rax, mark);
 726     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 727     orptr(mark, markWord::unlocked_value);
 728     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 729     jcc(Assembler::notEqual, push_and_slow_path);
 730     jmp(unlocked);
 731   }
 732 
 733 
 734   { // Handle inflated monitor.
 735     bind(inflated_check_lock_stack);
 736 #ifdef ASSERT
 737     Label check_done;
 738     subl(top, oopSize);
 739     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 740     jcc(Assembler::below, check_done);
 741     cmpptr(obj, Address(thread, top));
 742     jccb(Assembler::notEqual, inflated_check_lock_stack);
 743     stop("Fast Unlock lock on stack");
 744     bind(check_done);
 745     if (UseObjectMonitorTable) {
 746       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 747     }
 748     testptr(mark, markWord::monitor_value);
 749     jccb(Assembler::notZero, inflated);
 750     stop("Fast Unlock not monitor");
 751 #endif
 752 
 753     bind(inflated);
 754 
 755 #ifndef _LP64
 756     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 757     orl(t, 1);  // set ICC.ZF=0 to indicate failure
 758     jmpb(slow_path);
 759 #else
 760     if (!UseObjectMonitorTable) {
 761       assert(mark == monitor, "should be the same here");
 762     } else {
 763       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 764       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 765       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 766       cmpptr(monitor, alignof(ObjectMonitor*));
 767       jcc(Assembler::below, slow_path);
 768     }
 769     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 770     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 771     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 772     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 773     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 774     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 775 
 776     Label recursive;
 777 
 778     // Check if recursive.
 779     cmpptr(recursions_address, 0);
 780     jccb(Assembler::notZero, recursive);
 781 
 782     // Set owner to null.
 783     // Release to satisfy the JMM
 784     movptr(owner_address, NULL_WORD);
 785     // We need a full fence after clearing owner to avoid stranding.
 786     // StoreLoad achieves this.
 787     membar(StoreLoad);
 788 
 789     // Check if the entry lists are empty (EntryList first - by convention).
 790     movptr(reg_rax, EntryList_address);
 791     orptr(reg_rax, cxq_address);
 792     jccb(Assembler::zero, unlocked);    // If so we are done.
 793 
 794     // Check if there is a successor.
 795     cmpptr(succ_address, NULL_WORD);
 796     jccb(Assembler::notZero, unlocked); // If so we are done.
 797 
 798     // Save the monitor pointer in the current thread, so we can try to
 799     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 800     if (!UseObjectMonitorTable) {
 801       andptr(monitor, ~(int32_t)markWord::monitor_value);
 802     }
 803     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 804 
 805     orl(t, 1); // Fast Unlock ZF = 0
 806     jmpb(slow_path);
 807 
 808     // Recursive unlock.
 809     bind(recursive);
 810     decrement(recursions_address);
 811 #endif  // _LP64
 812   }
 813 
 814   bind(unlocked);
 815   xorl(t, t); // Fast Unlock ZF = 1
 816 
 817 #ifdef ASSERT
 818   // Check that unlocked label is reached with ZF set.
 819   Label zf_correct;
 820   Label zf_bad_zero;
 821   jcc(Assembler::zero, zf_correct);
 822   jmp(zf_bad_zero);
 823 #endif
 824 
 825   bind(slow_path);
 826   if (stub != nullptr) {
 827     bind(stub->slow_path_continuation());
 828   }
 829 #ifdef ASSERT
 830   // Check that stub->continuation() label is reached with ZF not set.
 831   jcc(Assembler::notZero, zf_correct);
 832   stop("Fast Unlock ZF != 0");
 833   bind(zf_bad_zero);
 834   stop("Fast Unlock ZF != 1");
 835   bind(zf_correct);
 836 #endif
 837   // C2 uses the value of ZF to determine the continuation.
 838 }
 839 
 840 //-------------------------------------------------------------------------------------------
// Generic instruction support for C2 code generation in .ad files
 842 
 843 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 844   if (dst != src) {
 845     movdqu(dst, src);
 846   }
 847   if (opcode == Op_AbsVD) {
 848     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 849   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 851     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 852   }
 853 }
 854 
 855 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 856   if (opcode == Op_AbsVD) {
 857     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 858   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 860     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 861   }
 862 }
 863 
 864 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 865   if (dst != src) {
 866     movdqu(dst, src);
 867   }
 868   if (opcode == Op_AbsVF) {
 869     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 870   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 872     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 873   }
 874 }
 875 
 876 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 877   if (opcode == Op_AbsVF) {
 878     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 879   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 881     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 882   }
 883 }
 884 
 885 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 886   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 887   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 888 
 889   if (opcode == Op_MinV) {
 890     if (elem_bt == T_BYTE) {
 891       pminsb(dst, src);
 892     } else if (elem_bt == T_SHORT) {
 893       pminsw(dst, src);
 894     } else if (elem_bt == T_INT) {
 895       pminsd(dst, src);
 896     } else {
 897       assert(elem_bt == T_LONG, "required");
 898       assert(tmp == xmm0, "required");
 899       assert_different_registers(dst, src, tmp);
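      // No packed signed min for 64-bit lanes here: build mask = (dst > src) in xmm0,
      // then blendvpd (which uses xmm0 implicitly) replaces those lanes with src,
      // leaving the element-wise minimum in dst.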
 900       movdqu(xmm0, dst);
 901       pcmpgtq(xmm0, src);
 902       blendvpd(dst, src);  // xmm0 as mask
 903     }
 904   } else { // opcode == Op_MaxV
 905     if (elem_bt == T_BYTE) {
 906       pmaxsb(dst, src);
 907     } else if (elem_bt == T_SHORT) {
 908       pmaxsw(dst, src);
 909     } else if (elem_bt == T_INT) {
 910       pmaxsd(dst, src);
 911     } else {
 912       assert(elem_bt == T_LONG, "required");
 913       assert(tmp == xmm0, "required");
 914       assert_different_registers(dst, src, tmp);
 915       movdqu(xmm0, src);
 916       pcmpgtq(xmm0, dst);
 917       blendvpd(dst, src);  // xmm0 as mask
 918     }
 919   }
 920 }
 921 
 922 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 923                                   XMMRegister src1, Address src2, int vlen_enc) {
 924   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 925   if (opcode == Op_UMinV) {
 926     switch(elem_bt) {
 927       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 928       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 929       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 930       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 931       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 932     }
 933   } else {
 934     assert(opcode == Op_UMaxV, "required");
 935     switch(elem_bt) {
 936       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 937       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 938       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 939       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 940       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 941     }
 942   }
 943 }
 944 
 945 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // On AVX-512 targets without AVX512VL, the quad-word unsigned min/max
  // instructions are only available at 512-bit width, so use the full
  // vector width even for operations over smaller vector sizes.
 948   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 949     if (opcode == Op_UMaxV) {
 950       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 951     } else {
 952       assert(opcode == Op_UMinV, "required");
 953       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 954     }
 955   } else {
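    // Emulate the unsigned 64-bit compare: adding 2^63 to both operands flips their
    // sign bits, mapping unsigned order onto signed order, so the signed vpcmpgtq
    // below can build the blend mask.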
 956     // T1 = -1
 957     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 958     // T1 = -1 << 63
 959     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 960     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 961     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 962     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 963     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 964     // Mask = T2 > T1
 965     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 966     if (opcode == Op_UMaxV) {
 967       // Res = Mask ? Src2 : Src1
 968       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 969     } else {
 970       // Res = Mask ? Src1 : Src2
 971       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 972     }
 973   }
 974 }
 975 
 976 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 977                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 978   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 979   if (opcode == Op_UMinV) {
 980     switch(elem_bt) {
 981       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 982       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 983       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 984       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 985       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 986     }
 987   } else {
 988     assert(opcode == Op_UMaxV, "required");
 989     switch(elem_bt) {
 990       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 991       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 992       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 993       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 994       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 995     }
 996   }
 997 }
 998 
 999 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1000                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1001                                  int vlen_enc) {
1002   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1003 
1004   if (opcode == Op_MinV) {
1005     if (elem_bt == T_BYTE) {
1006       vpminsb(dst, src1, src2, vlen_enc);
1007     } else if (elem_bt == T_SHORT) {
1008       vpminsw(dst, src1, src2, vlen_enc);
1009     } else if (elem_bt == T_INT) {
1010       vpminsd(dst, src1, src2, vlen_enc);
1011     } else {
1012       assert(elem_bt == T_LONG, "required");
1013       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1014         vpminsq(dst, src1, src2, vlen_enc);
1015       } else {
1016         assert_different_registers(dst, src1, src2);
1017         vpcmpgtq(dst, src1, src2, vlen_enc);
1018         vblendvpd(dst, src1, src2, dst, vlen_enc);
1019       }
1020     }
1021   } else { // opcode == Op_MaxV
1022     if (elem_bt == T_BYTE) {
1023       vpmaxsb(dst, src1, src2, vlen_enc);
1024     } else if (elem_bt == T_SHORT) {
1025       vpmaxsw(dst, src1, src2, vlen_enc);
1026     } else if (elem_bt == T_INT) {
1027       vpmaxsd(dst, src1, src2, vlen_enc);
1028     } else {
1029       assert(elem_bt == T_LONG, "required");
1030       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1031         vpmaxsq(dst, src1, src2, vlen_enc);
1032       } else {
1033         assert_different_registers(dst, src1, src2);
1034         vpcmpgtq(dst, src1, src2, vlen_enc);
1035         vblendvpd(dst, src2, src1, dst, vlen_enc);
1036       }
1037     }
1038   }
1039 }
1040 
1041 // Float/Double min max
1042 
1043 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1044                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1045                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1046                                    int vlen_enc) {
1047   assert(UseAVX > 0, "required");
1048   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1049          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1050   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1051   assert_different_registers(a, tmp, atmp, btmp);
1052   assert_different_registers(b, tmp, atmp, btmp);
1053 
1054   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1055   bool is_double_word = is_double_word_type(elem_bt);
1056 
1057   /* Note on 'non-obvious' assembly sequence:
1058    *
1059    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1060    * and Java on how they handle floats:
1061    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1063    *
1064    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1065    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1066    *                (only useful when signs differ, noop otherwise)
1067    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1070    *   btmp = (b < +0.0) ? a : b
1071    *   atmp = (b < +0.0) ? b : a
1072    *   Tmp  = Max_Float(atmp , btmp)
1073    *   Res  = (atmp == NaN) ? atmp : Tmp
1074    */
1075 
1076   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1077   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1078   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1079   XMMRegister mask;
1080 
1081   if (!is_double_word && is_min) {
1082     mask = a;
1083     vblend = &MacroAssembler::vblendvps;
1084     vmaxmin = &MacroAssembler::vminps;
1085     vcmp = &MacroAssembler::vcmpps;
1086   } else if (!is_double_word && !is_min) {
1087     mask = b;
1088     vblend = &MacroAssembler::vblendvps;
1089     vmaxmin = &MacroAssembler::vmaxps;
1090     vcmp = &MacroAssembler::vcmpps;
1091   } else if (is_double_word && is_min) {
1092     mask = a;
1093     vblend = &MacroAssembler::vblendvpd;
1094     vmaxmin = &MacroAssembler::vminpd;
1095     vcmp = &MacroAssembler::vcmppd;
1096   } else {
1097     assert(is_double_word && !is_min, "sanity");
1098     mask = b;
1099     vblend = &MacroAssembler::vblendvpd;
1100     vmaxmin = &MacroAssembler::vmaxpd;
1101     vcmp = &MacroAssembler::vcmppd;
1102   }
1103 
  // Pick the temporaries so the EnableX86ECoreOpts sequence keeps working when dst overlaps btmp.
1105   XMMRegister maxmin, scratch;
1106   if (dst == btmp) {
1107     maxmin = btmp;
1108     scratch = tmp;
1109   } else {
1110     maxmin = tmp;
1111     scratch = btmp;
1112   }
1113 
1114   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1115   if (precompute_mask && !is_double_word) {
1116     vpsrad(tmp, mask, 32, vlen_enc);
1117     mask = tmp;
1118   } else if (precompute_mask && is_double_word) {
1119     vpxor(tmp, tmp, tmp, vlen_enc);
1120     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1121     mask = tmp;
1122   }
1123 
1124   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1125   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1126   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1127   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1128   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1129 }
1130 
1131 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1132                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1133                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1134                                     int vlen_enc) {
1135   assert(UseAVX > 2, "required");
1136   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1137          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1138   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1139   assert_different_registers(dst, a, atmp, btmp);
1140   assert_different_registers(dst, b, atmp, btmp);
1141 
1142   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1143   bool is_double_word = is_double_word_type(elem_bt);
1144   bool merge = true;
1145 
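  // Same approach as vminmax_fp() above, expressed with AVX-512 mask registers:
  // evpmovd2m/evpmovq2m extract the per-lane sign bit of a (min) or b (max) into ktmp,
  // the masked blends then order the inputs for vmin/vmax, and the final masked move
  // re-injects atmp wherever it was NaN.
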
1146   if (!is_double_word && is_min) {
1147     evpmovd2m(ktmp, a, vlen_enc);
1148     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1149     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1150     vminps(dst, atmp, btmp, vlen_enc);
1151     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1152     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1153   } else if (!is_double_word && !is_min) {
1154     evpmovd2m(ktmp, b, vlen_enc);
1155     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1156     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1157     vmaxps(dst, atmp, btmp, vlen_enc);
1158     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1159     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1160   } else if (is_double_word && is_min) {
1161     evpmovq2m(ktmp, a, vlen_enc);
1162     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1163     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1164     vminpd(dst, atmp, btmp, vlen_enc);
1165     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1166     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1167   } else {
1168     assert(is_double_word && !is_min, "sanity");
1169     evpmovq2m(ktmp, b, vlen_enc);
1170     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1171     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1172     vmaxpd(dst, atmp, btmp, vlen_enc);
1173     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1174     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1175   }
1176 }
1177 
1178 // Float/Double signum
1179 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1180   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1181 
1182   Label DONE_LABEL;
1183 
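  // signum(x): return x for +/-0.0 and NaN, +1.0 for x > 0.0 and -1.0 for x < 0.0.
  // The moves below do not modify the flags, so the 'above' branch still tests the
  // ucomis result; falling through it means the input was negative and the sign of
  // the freshly loaded 1.0 gets flipped.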
1184   if (opcode == Op_SignumF) {
1185     assert(UseSSE > 0, "required");
1186     ucomiss(dst, zero);
1187     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1188     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1189     movflt(dst, one);
1190     jcc(Assembler::above, DONE_LABEL);
1191     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1192   } else if (opcode == Op_SignumD) {
1193     assert(UseSSE > 1, "required");
1194     ucomisd(dst, zero);
1195     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1196     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1197     movdbl(dst, one);
1198     jcc(Assembler::above, DONE_LABEL);
1199     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1200   }
1201 
1202   bind(DONE_LABEL);
1203 }
1204 
1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1206   if (sign) {
1207     pmovsxbw(dst, src);
1208   } else {
1209     pmovzxbw(dst, src);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1214   if (sign) {
1215     vpmovsxbw(dst, src, vector_len);
1216   } else {
1217     vpmovzxbw(dst, src, vector_len);
1218   }
1219 }
1220 
1221 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1222   if (sign) {
1223     vpmovsxbd(dst, src, vector_len);
1224   } else {
1225     vpmovzxbd(dst, src, vector_len);
1226   }
1227 }
1228 
1229 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1230   if (sign) {
1231     vpmovsxwd(dst, src, vector_len);
1232   } else {
1233     vpmovzxwd(dst, src, vector_len);
1234   }
1235 }
1236 
1237 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1238                                      int shift, int vector_len) {
1239   if (opcode == Op_RotateLeftV) {
1240     if (etype == T_INT) {
1241       evprold(dst, src, shift, vector_len);
1242     } else {
1243       assert(etype == T_LONG, "expected type T_LONG");
1244       evprolq(dst, src, shift, vector_len);
1245     }
1246   } else {
1247     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1248     if (etype == T_INT) {
1249       evprord(dst, src, shift, vector_len);
1250     } else {
1251       assert(etype == T_LONG, "expected type T_LONG");
1252       evprorq(dst, src, shift, vector_len);
1253     }
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1258                                      XMMRegister shift, int vector_len) {
1259   if (opcode == Op_RotateLeftV) {
1260     if (etype == T_INT) {
1261       evprolvd(dst, src, shift, vector_len);
1262     } else {
1263       assert(etype == T_LONG, "expected type T_LONG");
1264       evprolvq(dst, src, shift, vector_len);
1265     }
1266   } else {
1267     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1268     if (etype == T_INT) {
1269       evprorvd(dst, src, shift, vector_len);
1270     } else {
1271       assert(etype == T_LONG, "expected type T_LONG");
1272       evprorvq(dst, src, shift, vector_len);
1273     }
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1278   if (opcode == Op_RShiftVI) {
1279     psrad(dst, shift);
1280   } else if (opcode == Op_LShiftVI) {
1281     pslld(dst, shift);
1282   } else {
1283     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1284     psrld(dst, shift);
1285   }
1286 }
1287 
1288 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1289   switch (opcode) {
1290     case Op_RShiftVI:  psrad(dst, shift); break;
1291     case Op_LShiftVI:  pslld(dst, shift); break;
1292     case Op_URShiftVI: psrld(dst, shift); break;
1293 
1294     default: assert(false, "%s", NodeClassNames[opcode]);
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1299   if (opcode == Op_RShiftVI) {
1300     vpsrad(dst, nds, shift, vector_len);
1301   } else if (opcode == Op_LShiftVI) {
1302     vpslld(dst, nds, shift, vector_len);
1303   } else {
1304     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1305     vpsrld(dst, nds, shift, vector_len);
1306   }
1307 }
1308 
1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1310   switch (opcode) {
1311     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1312     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1313     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1314 
1315     default: assert(false, "%s", NodeClassNames[opcode]);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1320   switch (opcode) {
1321     case Op_RShiftVB:  // fall-through
1322     case Op_RShiftVS:  psraw(dst, shift); break;
1323 
1324     case Op_LShiftVB:  // fall-through
1325     case Op_LShiftVS:  psllw(dst, shift);   break;
1326 
1327     case Op_URShiftVS: // fall-through
1328     case Op_URShiftVB: psrlw(dst, shift);  break;
1329 
1330     default: assert(false, "%s", NodeClassNames[opcode]);
1331   }
1332 }
1333 
1334 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1335   switch (opcode) {
1336     case Op_RShiftVB:  // fall-through
1337     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1338 
1339     case Op_LShiftVB:  // fall-through
1340     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1341 
1342     case Op_URShiftVS: // fall-through
1343     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1344 
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
1349 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1350   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1352     case Op_LShiftVL:  psllq(dst, shift); break;
1353     case Op_URShiftVL: psrlq(dst, shift); break;
1354 
1355     default: assert(false, "%s", NodeClassNames[opcode]);
1356   }
1357 }
1358 
1359 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1360   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1362   } else if (opcode == Op_LShiftVL) {
1363     psllq(dst, shift);
1364   } else {
1365     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1366     psrlq(dst, shift);
1367   }
1368 }
1369 
1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1371   switch (opcode) {
1372     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1373     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1374     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1375 
1376     default: assert(false, "%s", NodeClassNames[opcode]);
1377   }
1378 }
1379 
1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1381   if (opcode == Op_RShiftVL) {
1382     evpsraq(dst, nds, shift, vector_len);
1383   } else if (opcode == Op_LShiftVL) {
1384     vpsllq(dst, nds, shift, vector_len);
1385   } else {
1386     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1387     vpsrlq(dst, nds, shift, vector_len);
1388   }
1389 }
1390 
1391 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1392   switch (opcode) {
1393     case Op_RShiftVB:  // fall-through
1394     case Op_RShiftVS:  // fall-through
1395     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1396 
1397     case Op_LShiftVB:  // fall-through
1398     case Op_LShiftVS:  // fall-through
1399     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1400 
1401     case Op_URShiftVB: // fall-through
1402     case Op_URShiftVS: // fall-through
1403     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1404 
1405     default: assert(false, "%s", NodeClassNames[opcode]);
1406   }
1407 }
1408 
1409 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1410   switch (opcode) {
1411     case Op_RShiftVB:  // fall-through
1412     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1413 
1414     case Op_LShiftVB:  // fall-through
1415     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1416 
1417     case Op_URShiftVB: // fall-through
1418     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1419 
1420     default: assert(false, "%s", NodeClassNames[opcode]);
1421   }
1422 }
1423 
1424 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1425   assert(UseAVX >= 2, "required");
1426   switch (opcode) {
1427     case Op_RShiftVL: {
1428       if (UseAVX > 2) {
1429         assert(tmp == xnoreg, "not used");
1430         if (!VM_Version::supports_avx512vl()) {
1431           vlen_enc = Assembler::AVX_512bit;
1432         }
1433         evpsravq(dst, src, shift, vlen_enc);
1434       } else {
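        // AVX2 has no variable arithmetic right shift for longs (evpsravq is
        // AVX-512 only), so emulate it with logical shifts and the usual
        // sign-extension identity. Illustrative scalar sketch for one lane,
        // with signed input x and shift amount s (0 <= s < 64):
        //   uint64_t m = (1ULL << 63) >> s;                        // shifted sign bit
        //   int64_t  r = (int64_t)((((uint64_t)x >> s) ^ m) - m);  // == x >> s (arithmetic)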
1435         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1436         vpsrlvq(dst, src, shift, vlen_enc);
1437         vpsrlvq(tmp, tmp, shift, vlen_enc);
1438         vpxor(dst, dst, tmp, vlen_enc);
1439         vpsubq(dst, dst, tmp, vlen_enc);
1440       }
1441       break;
1442     }
1443     case Op_LShiftVL: {
1444       assert(tmp == xnoreg, "not used");
1445       vpsllvq(dst, src, shift, vlen_enc);
1446       break;
1447     }
1448     case Op_URShiftVL: {
1449       assert(tmp == xnoreg, "not used");
1450       vpsrlvq(dst, src, shift, vlen_enc);
1451       break;
1452     }
1453     default: assert(false, "%s", NodeClassNames[opcode]);
1454   }
1455 }
1456 
// Variable shift of src by shift, using vtmp as a TEMP, giving a word result in dst
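// Illustrative flow for the supported vector_len == 0 (8-byte) case:
//   1. widen the 8 src bytes to dwords (sign-extended for signed shifts)
//      and the 8 shift counts to dwords (zero-extended),
//   2. perform the per-element dword shift,
//   3. mask each dword back to its low byte, and
//   4. pack the dwords down to 8 word (16-bit) results in dst.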
1458 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1459   assert(opcode == Op_LShiftVB ||
1460          opcode == Op_RShiftVB ||
1461          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1462   bool sign = (opcode != Op_URShiftVB);
1463   assert(vector_len == 0, "required");
1464   vextendbd(sign, dst, src, 1);
1465   vpmovzxbd(vtmp, shift, 1);
1466   varshiftd(opcode, dst, dst, vtmp, 1);
1467   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1468   vextracti128_high(vtmp, dst);
1469   vpackusdw(dst, dst, vtmp, 0);
1470 }
1471 
// Variable shift of src by shift, using vtmp as a TEMP, giving a byte result in dst
1473 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1474   assert(opcode == Op_LShiftVB ||
1475          opcode == Op_RShiftVB ||
1476          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1477   bool sign = (opcode != Op_URShiftVB);
1478   int ext_vector_len = vector_len + 1;
1479   vextendbw(sign, dst, src, ext_vector_len);
1480   vpmovzxbw(vtmp, shift, ext_vector_len);
1481   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1482   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1483   if (vector_len == 0) {
1484     vextracti128_high(vtmp, dst);
1485     vpackuswb(dst, dst, vtmp, vector_len);
1486   } else {
1487     vextracti64x4_high(vtmp, dst);
1488     vpackuswb(dst, dst, vtmp, vector_len);
1489     vpermq(dst, dst, 0xD8, vector_len);
1490   }
1491 }
1492 
1493 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1494   switch(typ) {
1495     case T_BYTE:
1496       pinsrb(dst, val, idx);
1497       break;
1498     case T_SHORT:
1499       pinsrw(dst, val, idx);
1500       break;
1501     case T_INT:
1502       pinsrd(dst, val, idx);
1503       break;
1504     case T_LONG:
1505       pinsrq(dst, val, idx);
1506       break;
1507     default:
1508       assert(false,"Should not reach here.");
1509       break;
1510   }
1511 }
1512 
1513 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1514   switch(typ) {
1515     case T_BYTE:
1516       vpinsrb(dst, src, val, idx);
1517       break;
1518     case T_SHORT:
1519       vpinsrw(dst, src, val, idx);
1520       break;
1521     case T_INT:
1522       vpinsrd(dst, src, val, idx);
1523       break;
1524     case T_LONG:
1525       vpinsrq(dst, src, val, idx);
1526       break;
1527     default:
1528       assert(false,"Should not reach here.");
1529       break;
1530   }
1531 }
1532 
1533 #ifdef _LP64
1534 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1535                                                 XMMRegister dst, Register base,
1536                                                 Register idx_base,
1537                                                 Register offset, Register mask,
1538                                                 Register mask_idx, Register rtmp,
1539                                                 int vlen_enc) {
1540   vpxor(dst, dst, dst, vlen_enc);
1541   if (elem_bt == T_SHORT) {
1542     for (int i = 0; i < 4; i++) {
1543       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1544       Label skip_load;
1545       btq(mask, mask_idx);
1546       jccb(Assembler::carryClear, skip_load);
1547       movl(rtmp, Address(idx_base, i * 4));
1548       if (offset != noreg) {
1549         addl(rtmp, offset);
1550       }
1551       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1552       bind(skip_load);
1553       incq(mask_idx);
1554     }
1555   } else {
1556     assert(elem_bt == T_BYTE, "");
1557     for (int i = 0; i < 8; i++) {
1558       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1559       Label skip_load;
1560       btq(mask, mask_idx);
1561       jccb(Assembler::carryClear, skip_load);
1562       movl(rtmp, Address(idx_base, i * 4));
1563       if (offset != noreg) {
1564         addl(rtmp, offset);
1565       }
1566       pinsrb(dst, Address(base, rtmp), i);
1567       bind(skip_load);
1568       incq(mask_idx);
1569     }
1570   }
1571 }
1572 #endif // _LP64
1573 
1574 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1575                                          Register base, Register idx_base,
1576                                          Register offset, Register rtmp,
1577                                          int vlen_enc) {
1578   vpxor(dst, dst, dst, vlen_enc);
1579   if (elem_bt == T_SHORT) {
1580     for (int i = 0; i < 4; i++) {
1581       // dst[i] = src[offset + idx_base[i]]
1582       movl(rtmp, Address(idx_base, i * 4));
1583       if (offset != noreg) {
1584         addl(rtmp, offset);
1585       }
1586       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1587     }
1588   } else {
1589     assert(elem_bt == T_BYTE, "");
1590     for (int i = 0; i < 8; i++) {
1591       // dst[i] = src[offset + idx_base[i]]
1592       movl(rtmp, Address(idx_base, i * 4));
1593       if (offset != noreg) {
1594         addl(rtmp, offset);
1595       }
1596       pinsrb(dst, Address(base, rtmp), i);
1597     }
1598   }
1599 }
1600 
1601 /*
1602  * Gather using hybrid algorithm, first partially unroll scalar loop
1603  * to accumulate values from gather indices into a quad-word(64bit) slice.
1604  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1605  * permutation to place the slice into appropriate vector lane
1606  * locations in destination vector. Following pseudo code describes the
1607  * algorithm in detail:
1608  *
1609  * DST_VEC = ZERO_VEC
1610  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1611  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1612  * FOREACH_ITER:
1613  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1614  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1615  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1616  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1617  *
1618  * With each iteration, doubleword permute indices (0,1) corresponding
1619  * to gathered quadword gets right shifted by two lane positions.
1620  *
1621  */
1622 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1623                                         Register base, Register idx_base,
1624                                         Register offset, Register mask,
1625                                         XMMRegister xtmp1, XMMRegister xtmp2,
1626                                         XMMRegister temp_dst, Register rtmp,
1627                                         Register mask_idx, Register length,
1628                                         int vector_len, int vlen_enc) {
1629   Label GATHER8_LOOP;
1630   assert(is_subword_type(elem_ty), "");
1631   movl(length, vector_len);
1632   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1633   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1634   vallones(xtmp2, vlen_enc);
1635   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1636   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1637   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1638 
1639   bind(GATHER8_LOOP);
1640     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1641     if (mask == noreg) {
1642       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1643     } else {
1644       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1645     }
1646     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1647     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1648     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1649     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1650     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1651     vpor(dst, dst, temp_dst, vlen_enc);
1652     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1653     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1654     jcc(Assembler::notEqual, GATHER8_LOOP);
1655 }
1656 
1657 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1658   switch(typ) {
1659     case T_INT:
1660       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1661       break;
1662     case T_FLOAT:
1663       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1664       break;
1665     case T_LONG:
1666       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1667       break;
1668     case T_DOUBLE:
1669       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1670       break;
1671     default:
1672       assert(false,"Should not reach here.");
1673       break;
1674   }
1675 }
1676 
1677 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1678   switch(typ) {
1679     case T_INT:
1680       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1681       break;
1682     case T_FLOAT:
1683       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1684       break;
1685     case T_LONG:
1686       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1687       break;
1688     case T_DOUBLE:
1689       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1690       break;
1691     default:
1692       assert(false,"Should not reach here.");
1693       break;
1694   }
1695 }
1696 
1697 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1698   switch(typ) {
1699     case T_INT:
1700       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1701       break;
1702     case T_FLOAT:
1703       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1704       break;
1705     case T_LONG:
1706       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1707       break;
1708     case T_DOUBLE:
1709       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1710       break;
1711     default:
1712       assert(false,"Should not reach here.");
1713       break;
1714   }
1715 }
1716 
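// Expand a boolean vector (one byte per element, 0 or 1) into a vector mask
// whose lanes are all-zeros or all-ones. Negating the bytes (0 - b) maps
// 1 to 0xFF, and sign extension then widens that to the element size,
// e.g. for T_INT: 0x01 -> 0xFF -> 0xFFFFFFFF.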
1717 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1718   if (vlen_in_bytes <= 16) {
1719     pxor (dst, dst);
1720     psubb(dst, src);
1721     switch (elem_bt) {
1722       case T_BYTE:   /* nothing to do */ break;
1723       case T_SHORT:  pmovsxbw(dst, dst); break;
1724       case T_INT:    pmovsxbd(dst, dst); break;
1725       case T_FLOAT:  pmovsxbd(dst, dst); break;
1726       case T_LONG:   pmovsxbq(dst, dst); break;
1727       case T_DOUBLE: pmovsxbq(dst, dst); break;
1728 
1729       default: assert(false, "%s", type2name(elem_bt));
1730     }
1731   } else {
1732     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1733     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1734 
1735     vpxor (dst, dst, dst, vlen_enc);
1736     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1737 
1738     switch (elem_bt) {
1739       case T_BYTE:   /* nothing to do */            break;
1740       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1741       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1742       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1743       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1744       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1745 
1746       default: assert(false, "%s", type2name(elem_bt));
1747     }
1748   }
1749 }
1750 
1751 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1752   if (novlbwdq) {
1753     vpmovsxbd(xtmp, src, vlen_enc);
1754     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1755             Assembler::eq, true, vlen_enc, noreg);
1756   } else {
1757     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1758     vpsubb(xtmp, xtmp, src, vlen_enc);
1759     evpmovb2m(dst, xtmp, vlen_enc);
1760   }
1761 }
1762 
1763 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1764   if (is_integral_type(bt)) {
1765     switch (vlen_in_bytes) {
1766       case 4:  movdl(dst, src);   break;
1767       case 8:  movq(dst, src);    break;
1768       case 16: movdqu(dst, src);  break;
1769       case 32: vmovdqu(dst, src); break;
1770       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1771       default: ShouldNotReachHere();
1772     }
1773   } else {
1774     switch (vlen_in_bytes) {
1775       case 4:  movflt(dst, src); break;
1776       case 8:  movdbl(dst, src); break;
1777       case 16: movups(dst, src); break;
1778       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1779       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1780       default: ShouldNotReachHere();
1781     }
1782   }
1783 }
1784 
1785 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1786   assert(rscratch != noreg || always_reachable(src), "missing");
1787 
1788   if (reachable(src)) {
1789     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1790   } else {
1791     lea(rscratch, src);
1792     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1793   }
1794 }
1795 
1796 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1797   int vlen_enc = vector_length_encoding(vlen);
1798   if (VM_Version::supports_avx()) {
1799     if (bt == T_LONG) {
1800       if (VM_Version::supports_avx2()) {
1801         vpbroadcastq(dst, src, vlen_enc);
1802       } else {
1803         vmovddup(dst, src, vlen_enc);
1804       }
1805     } else if (bt == T_DOUBLE) {
1806       if (vlen_enc != Assembler::AVX_128bit) {
1807         vbroadcastsd(dst, src, vlen_enc, noreg);
1808       } else {
1809         vmovddup(dst, src, vlen_enc);
1810       }
1811     } else {
1812       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1813         vpbroadcastd(dst, src, vlen_enc);
1814       } else {
1815         vbroadcastss(dst, src, vlen_enc);
1816       }
1817     }
1818   } else if (VM_Version::supports_sse3()) {
1819     movddup(dst, src);
1820   } else {
1821     load_vector(bt, dst, src, vlen);
1822   }
1823 }
1824 
1825 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1826   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
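  // For example, T_INT starts at exact_log2(4) << 6 = 128 and T_DOUBLE at (exact_log2(8) << 6) + 128 = 320.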
1827   int offset = exact_log2(type2aelembytes(bt)) << 6;
1828   if (is_floating_point_type(bt)) {
1829     offset += 128;
1830   }
1831   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1832   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1833 }
1834 
1835 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1836 
1837 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1838   int vector_len = Assembler::AVX_128bit;
1839 
1840   switch (opcode) {
1841     case Op_AndReductionV:  pand(dst, src); break;
1842     case Op_OrReductionV:   por (dst, src); break;
1843     case Op_XorReductionV:  pxor(dst, src); break;
1844     case Op_MinReductionV:
1845       switch (typ) {
1846         case T_BYTE:        pminsb(dst, src); break;
1847         case T_SHORT:       pminsw(dst, src); break;
1848         case T_INT:         pminsd(dst, src); break;
1849         case T_LONG:        assert(UseAVX > 2, "required");
1850                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1851         default:            assert(false, "wrong type");
1852       }
1853       break;
1854     case Op_MaxReductionV:
1855       switch (typ) {
1856         case T_BYTE:        pmaxsb(dst, src); break;
1857         case T_SHORT:       pmaxsw(dst, src); break;
1858         case T_INT:         pmaxsd(dst, src); break;
1859         case T_LONG:        assert(UseAVX > 2, "required");
1860                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1861         default:            assert(false, "wrong type");
1862       }
1863       break;
1864     case Op_AddReductionVF: addss(dst, src); break;
1865     case Op_AddReductionVD: addsd(dst, src); break;
1866     case Op_AddReductionVI:
1867       switch (typ) {
1868         case T_BYTE:        paddb(dst, src); break;
1869         case T_SHORT:       paddw(dst, src); break;
1870         case T_INT:         paddd(dst, src); break;
1871         default:            assert(false, "wrong type");
1872       }
1873       break;
1874     case Op_AddReductionVL: paddq(dst, src); break;
1875     case Op_MulReductionVF: mulss(dst, src); break;
1876     case Op_MulReductionVD: mulsd(dst, src); break;
1877     case Op_MulReductionVI:
1878       switch (typ) {
1879         case T_SHORT:       pmullw(dst, src); break;
1880         case T_INT:         pmulld(dst, src); break;
1881         default:            assert(false, "wrong type");
1882       }
1883       break;
1884     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1885                             evpmullq(dst, dst, src, vector_len); break;
1886     default:                assert(false, "wrong opcode");
1887   }
1888 }
1889 
1890 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1891   switch (opcode) {
1892     case Op_AddReductionVF: addps(dst, src); break;
1893     case Op_AddReductionVD: addpd(dst, src); break;
1894     case Op_MulReductionVF: mulps(dst, src); break;
1895     case Op_MulReductionVD: mulpd(dst, src); break;
1896     default:                assert(false, "%s", NodeClassNames[opcode]);
1897   }
1898 }
1899 
1900 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1901   int vector_len = Assembler::AVX_256bit;
1902 
1903   switch (opcode) {
1904     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1905     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1906     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1907     case Op_MinReductionV:
1908       switch (typ) {
1909         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1910         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1911         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1912         case T_LONG:        assert(UseAVX > 2, "required");
1913                             vpminsq(dst, src1, src2, vector_len); break;
1914         default:            assert(false, "wrong type");
1915       }
1916       break;
1917     case Op_MaxReductionV:
1918       switch (typ) {
1919         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1920         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1921         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1922         case T_LONG:        assert(UseAVX > 2, "required");
1923                             vpmaxsq(dst, src1, src2, vector_len); break;
1924         default:            assert(false, "wrong type");
1925       }
1926       break;
1927     case Op_AddReductionVI:
1928       switch (typ) {
1929         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1930         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1931         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1932         default:            assert(false, "wrong type");
1933       }
1934       break;
1935     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1936     case Op_MulReductionVI:
1937       switch (typ) {
1938         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1939         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1940         default:            assert(false, "wrong type");
1941       }
1942       break;
1943     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1944     default:                assert(false, "wrong opcode");
1945   }
1946 }
1947 
1948 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1949   int vector_len = Assembler::AVX_256bit;
1950 
1951   switch (opcode) {
1952     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1953     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1954     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1955     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1956     default:                assert(false, "%s", NodeClassNames[opcode]);
1957   }
1958 }
1959 
1960 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1961                                   XMMRegister dst, XMMRegister src,
1962                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1963   switch (opcode) {
1964     case Op_AddReductionVF:
1965     case Op_MulReductionVF:
1966       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1967       break;
1968 
1969     case Op_AddReductionVD:
1970     case Op_MulReductionVD:
1971       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1972       break;
1973 
1974     default: assert(false, "wrong opcode");
1975   }
1976 }
1977 
1978 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1979                                             XMMRegister dst, XMMRegister src,
1980                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1981   switch (opcode) {
1982     case Op_AddReductionVF:
1983     case Op_MulReductionVF:
1984       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1985       break;
1986 
1987     case Op_AddReductionVD:
1988     case Op_MulReductionVD:
1989       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1990       break;
1991 
1992     default: assert(false, "%s", NodeClassNames[opcode]);
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1997                              Register dst, Register src1, XMMRegister src2,
1998                              XMMRegister vtmp1, XMMRegister vtmp2) {
1999   switch (vlen) {
2000     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2004 
2005     default: assert(false, "wrong vector length");
2006   }
2007 }
2008 
2009 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2010                              Register dst, Register src1, XMMRegister src2,
2011                              XMMRegister vtmp1, XMMRegister vtmp2) {
2012   switch (vlen) {
2013     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2014     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2016     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2017 
2018     default: assert(false, "wrong vector length");
2019   }
2020 }
2021 
2022 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2023                              Register dst, Register src1, XMMRegister src2,
2024                              XMMRegister vtmp1, XMMRegister vtmp2) {
2025   switch (vlen) {
2026     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2027     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2028     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2029     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2030 
2031     default: assert(false, "wrong vector length");
2032   }
2033 }
2034 
2035 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2036                              Register dst, Register src1, XMMRegister src2,
2037                              XMMRegister vtmp1, XMMRegister vtmp2) {
2038   switch (vlen) {
2039     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2040     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2041     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2042     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2043 
2044     default: assert(false, "wrong vector length");
2045   }
2046 }
2047 
2048 #ifdef _LP64
2049 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2050                              Register dst, Register src1, XMMRegister src2,
2051                              XMMRegister vtmp1, XMMRegister vtmp2) {
2052   switch (vlen) {
2053     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2054     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2055     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2056 
2057     default: assert(false, "wrong vector length");
2058   }
2059 }
2060 #endif // _LP64
2061 
2062 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2063   switch (vlen) {
2064     case 2:
2065       assert(vtmp2 == xnoreg, "");
2066       reduce2F(opcode, dst, src, vtmp1);
2067       break;
2068     case 4:
2069       assert(vtmp2 == xnoreg, "");
2070       reduce4F(opcode, dst, src, vtmp1);
2071       break;
2072     case 8:
2073       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2074       break;
2075     case 16:
2076       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2077       break;
2078     default: assert(false, "wrong vector length");
2079   }
2080 }
2081 
2082 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2083   switch (vlen) {
2084     case 2:
2085       assert(vtmp2 == xnoreg, "");
2086       reduce2D(opcode, dst, src, vtmp1);
2087       break;
2088     case 4:
2089       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2090       break;
2091     case 8:
2092       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2093       break;
2094     default: assert(false, "wrong vector length");
2095   }
2096 }
2097 
2098 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   switch (vlen) {
2100     case 2:
2101       assert(vtmp1 == xnoreg, "");
2102       assert(vtmp2 == xnoreg, "");
2103       unorderedReduce2F(opcode, dst, src);
2104       break;
2105     case 4:
2106       assert(vtmp2 == xnoreg, "");
2107       unorderedReduce4F(opcode, dst, src, vtmp1);
2108       break;
2109     case 8:
2110       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2111       break;
2112     case 16:
2113       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2114       break;
2115     default: assert(false, "wrong vector length");
2116   }
2117 }
2118 
2119 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2120   switch (vlen) {
2121     case 2:
2122       assert(vtmp1 == xnoreg, "");
2123       assert(vtmp2 == xnoreg, "");
2124       unorderedReduce2D(opcode, dst, src);
2125       break;
2126     case 4:
2127       assert(vtmp2 == xnoreg, "");
2128       unorderedReduce4D(opcode, dst, src, vtmp1);
2129       break;
2130     case 8:
2131       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2132       break;
2133     default: assert(false, "wrong vector length");
2134   }
2135 }
2136 
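// Reduce a 2-element int vector src2 into dst, folding in the scalar
// accumulator src1. For Op_AddReductionVI, phaddd sums the adjacent pair in
// one step, so dst = src1 + src2[0] + src2[1]; for the other opcodes,
// element 1 is shuffled down and the reduction op is applied pairwise.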
2137 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138   if (opcode == Op_AddReductionVI) {
2139     if (vtmp1 != src2) {
2140       movdqu(vtmp1, src2);
2141     }
2142     phaddd(vtmp1, vtmp1);
2143   } else {
2144     pshufd(vtmp1, src2, 0x1);
2145     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2146   }
2147   movdl(vtmp2, src1);
2148   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2149   movdl(dst, vtmp1);
2150 }
2151 
2152 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2153   if (opcode == Op_AddReductionVI) {
2154     if (vtmp1 != src2) {
2155       movdqu(vtmp1, src2);
2156     }
2157     phaddd(vtmp1, src2);
2158     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2159   } else {
2160     pshufd(vtmp2, src2, 0xE);
2161     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2162     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2163   }
2164 }
2165 
2166 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2167   if (opcode == Op_AddReductionVI) {
2168     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2169     vextracti128_high(vtmp2, vtmp1);
2170     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2171     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2172   } else {
2173     vextracti128_high(vtmp1, src2);
2174     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2175     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2176   }
2177 }
2178 
2179 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2180   vextracti64x4_high(vtmp2, src2);
2181   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2182   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2183 }
2184 
2185 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2186   pshufd(vtmp2, src2, 0x1);
2187   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2188   movdqu(vtmp1, vtmp2);
2189   psrldq(vtmp1, 2);
2190   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2191   movdqu(vtmp2, vtmp1);
2192   psrldq(vtmp2, 1);
2193   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2194   movdl(vtmp2, src1);
2195   pmovsxbd(vtmp1, vtmp1);
2196   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2197   pextrb(dst, vtmp1, 0x0);
2198   movsbl(dst, dst);
2199 }
2200 
2201 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2202   pshufd(vtmp1, src2, 0xE);
2203   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2204   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2205 }
2206 
2207 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2208   vextracti128_high(vtmp2, src2);
2209   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2210   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2211 }
2212 
2213 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   vextracti64x4_high(vtmp1, src2);
2215   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2216   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2217 }
2218 
2219 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2220   pmovsxbw(vtmp2, src2);
2221   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2222 }
2223 
2224 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2225   if (UseAVX > 1) {
2226     int vector_len = Assembler::AVX_256bit;
2227     vpmovsxbw(vtmp1, src2, vector_len);
2228     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2229   } else {
2230     pmovsxbw(vtmp2, src2);
2231     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 down
    pmovsxbw(vtmp2, vtmp2);
2234     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2235   }
2236 }
2237 
2238 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2239   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2240     int vector_len = Assembler::AVX_512bit;
2241     vpmovsxbw(vtmp1, src2, vector_len);
2242     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2243   } else {
2244     assert(UseAVX >= 2,"Should not reach here.");
2245     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2246     vextracti128_high(vtmp2, src2);
2247     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2248   }
2249 }
2250 
2251 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2253   vextracti64x4_high(vtmp2, src2);
2254   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2255 }
2256 
2257 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2258   if (opcode == Op_AddReductionVI) {
2259     if (vtmp1 != src2) {
2260       movdqu(vtmp1, src2);
2261     }
2262     phaddw(vtmp1, vtmp1);
2263     phaddw(vtmp1, vtmp1);
2264   } else {
2265     pshufd(vtmp2, src2, 0x1);
2266     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2267     movdqu(vtmp1, vtmp2);
2268     psrldq(vtmp1, 2);
2269     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2270   }
2271   movdl(vtmp2, src1);
2272   pmovsxwd(vtmp1, vtmp1);
2273   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2274   pextrw(dst, vtmp1, 0x0);
2275   movswl(dst, dst);
2276 }
2277 
2278 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2279   if (opcode == Op_AddReductionVI) {
2280     if (vtmp1 != src2) {
2281       movdqu(vtmp1, src2);
2282     }
2283     phaddw(vtmp1, src2);
2284   } else {
2285     pshufd(vtmp1, src2, 0xE);
2286     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2287   }
2288   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2289 }
2290 
2291 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2292   if (opcode == Op_AddReductionVI) {
2293     int vector_len = Assembler::AVX_256bit;
2294     vphaddw(vtmp2, src2, src2, vector_len);
2295     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2296   } else {
2297     vextracti128_high(vtmp2, src2);
2298     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2299   }
2300   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2301 }
2302 
2303 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2304   int vector_len = Assembler::AVX_256bit;
2305   vextracti64x4_high(vtmp1, src2);
2306   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2307   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2308 }
2309 
2310 #ifdef _LP64
2311 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2312   pshufd(vtmp2, src2, 0xE);
2313   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2314   movdq(vtmp1, src1);
2315   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2316   movdq(dst, vtmp1);
2317 }
2318 
2319 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   vextracti128_high(vtmp1, src2);
2321   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2322   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2323 }
2324 
2325 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2326   vextracti64x4_high(vtmp2, src2);
2327   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2328   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2329 }
2330 
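// Build a k-register mask with the low 'len' bits set (e.g. len = 5 gives
// 0b11111): BZHI clears all bits of -1 at positions >= len.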
2331 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2332   mov64(temp, -1L);
2333   bzhiq(temp, temp, len);
2334   kmovql(dst, temp);
2335 }
2336 #endif // _LP64
2337 
2338 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2339   reduce_operation_128(T_FLOAT, opcode, dst, src);
2340   pshufd(vtmp, src, 0x1);
2341   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2342 }
2343 
2344 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2345   reduce2F(opcode, dst, src, vtmp);
2346   pshufd(vtmp, src, 0x2);
2347   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2348   pshufd(vtmp, src, 0x3);
2349   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2350 }
2351 
2352 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2353   reduce4F(opcode, dst, src, vtmp2);
2354   vextractf128_high(vtmp2, src);
2355   reduce4F(opcode, dst, vtmp2, vtmp1);
2356 }
2357 
2358 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2359   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2360   vextracti64x4_high(vtmp1, src);
2361   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2362 }
2363 
2364 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2365   pshufd(dst, src, 0x1);
2366   reduce_operation_128(T_FLOAT, opcode, dst, src);
2367 }
2368 
2369 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2370   pshufd(vtmp, src, 0xE);
2371   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2372   unorderedReduce2F(opcode, dst, vtmp);
2373 }
2374 
2375 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2376   vextractf128_high(vtmp1, src);
2377   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2378   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2379 }
2380 
2381 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2382   vextractf64x4_high(vtmp2, src);
2383   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2384   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2385 }
2386 
2387 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2388   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2389   pshufd(vtmp, src, 0xE);
2390   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2391 }
2392 
2393 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2394   reduce2D(opcode, dst, src, vtmp2);
2395   vextractf128_high(vtmp2, src);
2396   reduce2D(opcode, dst, vtmp2, vtmp1);
2397 }
2398 
2399 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2400   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2401   vextracti64x4_high(vtmp1, src);
2402   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2403 }
2404 
2405 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2406   pshufd(dst, src, 0xE);
2407   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2408 }
2409 
2410 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2411   vextractf128_high(vtmp, src);
2412   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2413   unorderedReduce2D(opcode, dst, vtmp);
2414 }
2415 
2416 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2417   vextractf64x4_high(vtmp2, src);
2418   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2419   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2420 }
2421 
2422 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2423   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2424 }
2425 
2426 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2427   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2428 }
2429 
2430 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2431   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2432 }
2433 
2434 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2435                                  int vec_enc) {
2436   switch(elem_bt) {
2437     case T_INT:
2438     case T_FLOAT:
2439       vmaskmovps(dst, src, mask, vec_enc);
2440       break;
2441     case T_LONG:
2442     case T_DOUBLE:
2443       vmaskmovpd(dst, src, mask, vec_enc);
2444       break;
2445     default:
2446       fatal("Unsupported type %s", type2name(elem_bt));
2447       break;
2448   }
2449 }
2450 
2451 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2452                                  int vec_enc) {
2453   switch(elem_bt) {
2454     case T_INT:
2455     case T_FLOAT:
2456       vmaskmovps(dst, src, mask, vec_enc);
2457       break;
2458     case T_LONG:
2459     case T_DOUBLE:
2460       vmaskmovpd(dst, src, mask, vec_enc);
2461       break;
2462     default:
2463       fatal("Unsupported type %s", type2name(elem_bt));
2464       break;
2465   }
2466 }
2467 
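// Float min/max reduction: repeatedly fold the upper half of the working
// vector onto its lower half and combine the halves with vminmax_fp. The
// widest steps (i == 3, i == 2) extract the upper 256/128 bits; within a
// 128-bit lane the remaining folds use vpermilps with permconst[i]
// (14 brings elements 2,3 down, 1 brings element 1 down) so the partner
// elements line up for the final combines.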
2468 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2469                                           XMMRegister dst, XMMRegister src,
2470                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2471                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2472   const int permconst[] = {1, 14};
2473   XMMRegister wsrc = src;
2474   XMMRegister wdst = xmm_0;
2475   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2476 
2477   int vlen_enc = Assembler::AVX_128bit;
2478   if (vlen == 16) {
2479     vlen_enc = Assembler::AVX_256bit;
2480   }
2481 
2482   for (int i = log2(vlen) - 1; i >=0; i--) {
2483     if (i == 0 && !is_dst_valid) {
2484       wdst = dst;
2485     }
2486     if (i == 3) {
2487       vextracti64x4_high(wtmp, wsrc);
2488     } else if (i == 2) {
2489       vextracti128_high(wtmp, wsrc);
2490     } else { // i = [0,1]
2491       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2492     }
2493     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2494     wsrc = wdst;
2495     vlen_enc = Assembler::AVX_128bit;
2496   }
2497   if (is_dst_valid) {
2498     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2499   }
2500 }
2501 
2502 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2503                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2504                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2505   XMMRegister wsrc = src;
2506   XMMRegister wdst = xmm_0;
2507   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2508   int vlen_enc = Assembler::AVX_128bit;
2509   if (vlen == 8) {
2510     vlen_enc = Assembler::AVX_256bit;
2511   }
2512   for (int i = log2(vlen) - 1; i >=0; i--) {
2513     if (i == 0 && !is_dst_valid) {
2514       wdst = dst;
2515     }
2516     if (i == 1) {
2517       vextracti128_high(wtmp, wsrc);
2518     } else if (i == 2) {
2519       vextracti64x4_high(wtmp, wsrc);
2520     } else {
2521       assert(i == 0, "%d", i);
2522       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2523     }
2524     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2525     wsrc = wdst;
2526     vlen_enc = Assembler::AVX_128bit;
2527   }
2528   if (is_dst_valid) {
2529     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2530   }
2531 }
2532 
2533 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2534   switch (bt) {
2535     case T_BYTE:  pextrb(dst, src, idx); break;
2536     case T_SHORT: pextrw(dst, src, idx); break;
2537     case T_INT:   pextrd(dst, src, idx); break;
2538     case T_LONG:  pextrq(dst, src, idx); break;
2539 
2540     default:
2541       assert(false,"Should not reach here.");
2542       break;
2543   }
2544 }
2545 
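// Return the 128-bit lane of src that holds element 'elemindex', extracting
// it into dst when it is not the lowest lane. For example, T_INT with
// elemindex 9: esize = 4, elem_per_lane = 4, so lane 2 is extracted and the
// caller then picks element 9 % 4 = 1 within it.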
2546 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2547   int esize =  type2aelembytes(typ);
2548   int elem_per_lane = 16/esize;
2549   int lane = elemindex / elem_per_lane;
2550   int eindex = elemindex % elem_per_lane;
2551 
2552   if (lane >= 2) {
2553     assert(UseAVX > 2, "required");
2554     vextractf32x4(dst, src, lane & 3);
2555     return dst;
2556   } else if (lane > 0) {
2557     assert(UseAVX > 0, "required");
2558     vextractf128(dst, src, lane);
2559     return dst;
2560   } else {
2561     return src;
2562   }
2563 }
2564 
2565 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2566   if (typ == T_BYTE) {
2567     movsbl(dst, dst);
2568   } else if (typ == T_SHORT) {
2569     movswl(dst, dst);
2570   }
2571 }
2572 
2573 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2574   int esize =  type2aelembytes(typ);
2575   int elem_per_lane = 16/esize;
2576   int eindex = elemindex % elem_per_lane;
2577   assert(is_integral_type(typ),"required");
2578 
2579   if (eindex == 0) {
2580     if (typ == T_LONG) {
2581       movq(dst, src);
2582     } else {
2583       movdl(dst, src);
2584       movsxl(typ, dst);
2585     }
2586   } else {
2587     extract(typ, dst, src, eindex);
2588     movsxl(typ, dst);
2589   }
2590 }
2591 
2592 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2593   int esize =  type2aelembytes(typ);
2594   int elem_per_lane = 16/esize;
2595   int eindex = elemindex % elem_per_lane;
2596   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2597 
2598   if (eindex == 0) {
2599     movq(dst, src);
2600   } else {
2601     if (typ == T_FLOAT) {
2602       if (UseAVX == 0) {
2603         movdqu(dst, src);
2604         shufps(dst, dst, eindex);
2605       } else {
2606         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2607       }
2608     } else {
2609       if (UseAVX == 0) {
2610         movdqu(dst, src);
2611         psrldq(dst, eindex*esize);
2612       } else {
2613         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2614       }
2615       movq(dst, dst);
2616     }
2617   }
2618   // Zero upper bits
2619   if (typ == T_FLOAT) {
2620     if (UseAVX == 0) {
2621       assert(vtmp != xnoreg, "required.");
2622       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2623       pand(dst, vtmp);
2624     } else {
2625       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2626     }
2627   }
2628 }
2629 
2630 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2631   switch(typ) {
2632     case T_BYTE:
2633     case T_BOOLEAN:
2634       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2635       break;
2636     case T_SHORT:
2637     case T_CHAR:
2638       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2639       break;
2640     case T_INT:
2641     case T_FLOAT:
2642       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2643       break;
2644     case T_LONG:
2645     case T_DOUBLE:
2646       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2647       break;
2648     default:
2649       assert(false,"Should not reach here.");
2650       break;
2651   }
2652 }
2653 
2654 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2655   assert(rscratch != noreg || always_reachable(src2), "missing");
2656 
2657   switch(typ) {
2658     case T_BOOLEAN:
2659     case T_BYTE:
2660       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2661       break;
2662     case T_CHAR:
2663     case T_SHORT:
2664       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2665       break;
2666     case T_INT:
2667     case T_FLOAT:
2668       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2669       break;
2670     case T_LONG:
2671     case T_DOUBLE:
2672       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2673       break;
2674     default:
2675       assert(false,"Should not reach here.");
2676       break;
2677   }
2678 }
2679 
2680 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2681   switch(typ) {
2682     case T_BYTE:
2683       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2684       break;
2685     case T_SHORT:
2686       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2687       break;
2688     case T_INT:
2689     case T_FLOAT:
2690       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2691       break;
2692     case T_LONG:
2693     case T_DOUBLE:
2694       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2695       break;
2696     default:
2697       assert(false,"Should not reach here.");
2698       break;
2699   }
2700 }
2701 
2702 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2703   assert(vlen_in_bytes <= 32, "");
2704   int esize = type2aelembytes(bt);
2705   if (vlen_in_bytes == 32) {
2706     assert(vtmp == xnoreg, "required.");
2707     if (esize >= 4) {
2708       vtestps(src1, src2, AVX_256bit);
2709     } else {
2710       vptest(src1, src2, AVX_256bit);
2711     }
2712     return;
2713   }
2714   if (vlen_in_bytes < 16) {
    // Duplicate the lower part of src1 to fill the whole register;
    // there is no need to do so for src2.
2717     assert(vtmp != xnoreg, "required");
2718     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2719     pshufd(vtmp, src1, shuffle_imm);
2720   } else {
2721     assert(vtmp == xnoreg, "required");
2722     vtmp = src1;
2723   }
2724   if (esize >= 4 && VM_Version::supports_avx()) {
2725     vtestps(vtmp, src2, AVX_128bit);
2726   } else {
2727     ptest(vtmp, src2);
2728   }
2729 }
2730 
2731 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2732 #ifdef ASSERT
2733   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2734   bool is_bw_supported = VM_Version::supports_avx512bw();
2735   if (is_bw && !is_bw_supported) {
2736     assert(vlen_enc != Assembler::AVX_512bit, "required");
2737     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2738            "XMM register should be 0-15");
2739   }
2740 #endif // ASSERT
2741   switch (elem_bt) {
2742     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2743     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2744     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2745     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2746     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2747     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2748     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2749   }
2750 }
2751 
2752 #ifdef _LP64
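     // Broadcast a GPR scalar to all lanes of a vector register. Usage sketch
     // (hypothetical caller): replicate a 32-bit value held in rax across a
     // 256-bit vector:
     //   vpbroadcast(T_INT, xmm0, rax, Assembler::AVX_256bit);
     // With AVX-512 (plus BW/VL where the element type or vector length require
     // them) this maps to a single EVEX broadcast from the GPR; otherwise the
     // scalar is first moved into the XMM register (movdl/movdq) and then
     // broadcast from there.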
2753 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2754   assert(UseAVX >= 2, "required");
2755   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2756   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2757   if ((UseAVX > 2) &&
2758       (!is_bw || VM_Version::supports_avx512bw()) &&
2759       (!is_vl || VM_Version::supports_avx512vl())) {
2760     switch (elem_bt) {
2761       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2762       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2763       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2764       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2765       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2766     }
2767   } else {
2768     assert(vlen_enc != Assembler::AVX_512bit, "required");
2769     assert((dst->encoding() < 16),"XMM register should be 0-15");
2770     switch (elem_bt) {
2771       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2772       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2773       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2774       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2775       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2776       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2777       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2778     }
2779   }
2780 }
2781 #endif
2782 
2783 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2784   switch (to_elem_bt) {
2785     case T_SHORT:
2786       vpmovsxbw(dst, src, vlen_enc);
2787       break;
2788     case T_INT:
2789       vpmovsxbd(dst, src, vlen_enc);
2790       break;
2791     case T_FLOAT:
2792       vpmovsxbd(dst, src, vlen_enc);
2793       vcvtdq2ps(dst, dst, vlen_enc);
2794       break;
2795     case T_LONG:
2796       vpmovsxbq(dst, src, vlen_enc);
2797       break;
2798     case T_DOUBLE: {
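           // The byte->int sign-extension is done at half the final width: a
           // vector of N doubles occupies twice the bytes of a vector of N ints,
           // so vpmovsxbd targets the narrower size and vcvtdq2pd then widens
           // each int lane to a double lane at vlen_enc.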
2799       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2800       vpmovsxbd(dst, src, mid_vlen_enc);
2801       vcvtdq2pd(dst, dst, vlen_enc);
2802       break;
2803     }
2804     default:
2805       fatal("Unsupported type %s", type2name(to_elem_bt));
2806       break;
2807   }
2808 }
2809 
2810 //-------------------------------------------------------------------------------------------
2811 
2812 // IndexOf for constant substrings with size >= 8 chars
2813 // which don't need to be loaded through stack.
2814 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2815                                          Register cnt1, Register cnt2,
2816                                          int int_cnt2,  Register result,
2817                                          XMMRegister vec, Register tmp,
2818                                          int ae) {
2819   ShortBranchVerifier sbv(this);
2820   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2821   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2822 
2823   // This method uses the pcmpestri instruction with bound registers
2824   //   inputs:
2825   //     xmm - substring
2826   //     rax - substring length (elements count)
2827   //     mem - scanned string
2828   //     rdx - string length (elements count)
2829   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2830   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2831   //   outputs:
2832   //     rcx - matched index in string
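       //
       // For reference, the imm8 modes decompose as follows (SSE4.2 PCMPESTRI
       // encoding): bits [1:0] select the element type (00 = unsigned bytes,
       // 01 = unsigned words) and bits [3:2] = 11 select "equal ordered"
       // (substring search). In this mode CF=1 signals that a candidate match
       // was found (rcx holds its offset) and OF mirrors bit 0 of the result,
       // i.e. a match starting at the first scanned element.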
2833   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2834   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2835   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2836   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2837   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2838 
2839   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2840         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2841         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2842 
2843   // Note, inline_string_indexOf() generates checks:
2844   // if (substr.count > string.count) return -1;
2845   // if (substr.count == 0) return 0;
2846   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2847 
2848   // Load substring.
2849   if (ae == StrIntrinsicNode::UL) {
2850     pmovzxbw(vec, Address(str2, 0));
2851   } else {
2852     movdqu(vec, Address(str2, 0));
2853   }
2854   movl(cnt2, int_cnt2);
2855   movptr(result, str1); // string addr
2856 
2857   if (int_cnt2 > stride) {
2858     jmpb(SCAN_TO_SUBSTR);
2859 
2860     // Reload substr for rescan; this code
2861     // is executed only for large substrings (> 8 chars)
2862     bind(RELOAD_SUBSTR);
2863     if (ae == StrIntrinsicNode::UL) {
2864       pmovzxbw(vec, Address(str2, 0));
2865     } else {
2866       movdqu(vec, Address(str2, 0));
2867     }
2868     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2869 
2870     bind(RELOAD_STR);
2871     // We came here after the beginning of the substring was
2872     // matched but the rest of it was not so we need to search
2873     // again. Start from the next element after the previous match.
2874 
2875     // cnt2 is the number of remaining substring elements and
2876     // cnt1 is the number of remaining string elements when the compare failed.
2877     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2878     subl(cnt1, cnt2);
2879     addl(cnt1, int_cnt2);
2880     movl(cnt2, int_cnt2); // Now restore cnt2
2881 
2882     decrementl(cnt1);     // Shift to next element
2883     cmpl(cnt1, cnt2);
2884     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2885 
2886     addptr(result, (1<<scale1));
2887 
2888   } // (int_cnt2 > 8)
2889 
2890   // Scan string for start of substr in 16-byte vectors
2891   bind(SCAN_TO_SUBSTR);
2892   pcmpestri(vec, Address(result, 0), mode);
2893   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2894   subl(cnt1, stride);
2895   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2896   cmpl(cnt1, cnt2);
2897   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2898   addptr(result, 16);
2899   jmpb(SCAN_TO_SUBSTR);
2900 
2901   // Found a potential substr
2902   bind(FOUND_CANDIDATE);
2903   // Matched whole vector if first element matched (tmp(rcx) == 0).
2904   if (int_cnt2 == stride) {
2905     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2906   } else { // int_cnt2 > 8
2907     jccb(Assembler::overflow, FOUND_SUBSTR);
2908   }
2909   // After pcmpestri tmp(rcx) contains matched element index
2910   // Compute start addr of substr
2911   lea(result, Address(result, tmp, scale1));
2912 
2913   // Make sure string is still long enough
2914   subl(cnt1, tmp);
2915   cmpl(cnt1, cnt2);
2916   if (int_cnt2 == stride) {
2917     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2918   } else { // int_cnt2 > 8
2919     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2920   }
2921   // Left less than substring.
2922 
2923   bind(RET_NOT_FOUND);
2924   movl(result, -1);
2925   jmp(EXIT);
2926 
2927   if (int_cnt2 > stride) {
2928     // This code is optimized for the case when the whole substring
2929     // is matched if its head is matched.
2930     bind(MATCH_SUBSTR_HEAD);
2931     pcmpestri(vec, Address(result, 0), mode);
2932     // Reload only the string if it does not match
2933     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2934 
2935     Label CONT_SCAN_SUBSTR;
2936     // Compare the rest of substring (> 8 chars).
2937     bind(FOUND_SUBSTR);
2938     // First 8 chars are already matched.
2939     negptr(cnt2);
2940     addptr(cnt2, stride);
2941 
2942     bind(SCAN_SUBSTR);
2943     subl(cnt1, stride);
2944     cmpl(cnt2, -stride); // Do not read beyond substring
2945     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2946     // Back-up strings to avoid reading beyond substring:
2947     // cnt1 = cnt1 - cnt2 + 8
2948     addl(cnt1, cnt2); // cnt2 is negative
2949     addl(cnt1, stride);
2950     movl(cnt2, stride); negptr(cnt2);
2951     bind(CONT_SCAN_SUBSTR);
2952     if (int_cnt2 < (int)G) {
2953       int tail_off1 = int_cnt2<<scale1;
2954       int tail_off2 = int_cnt2<<scale2;
2955       if (ae == StrIntrinsicNode::UL) {
2956         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2957       } else {
2958         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2959       }
2960       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2961     } else {
2962       // calculate index in register to avoid integer overflow (int_cnt2*2)
2963       movl(tmp, int_cnt2);
2964       addptr(tmp, cnt2);
2965       if (ae == StrIntrinsicNode::UL) {
2966         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2967       } else {
2968         movdqu(vec, Address(str2, tmp, scale2, 0));
2969       }
2970       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2971     }
2972     // Need to reload the string pointers if the whole vector did not match
2973     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2974     addptr(cnt2, stride);
2975     jcc(Assembler::negative, SCAN_SUBSTR);
2976     // Fall through if found full substring
2977 
2978   } // (int_cnt2 > 8)
2979 
2980   bind(RET_FOUND);
2981   // Found result if we matched full small substring.
2982   // Compute substr offset
2983   subptr(result, str1);
2984   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2985     shrl(result, 1); // index
2986   }
2987   bind(EXIT);
2988 
2989 } // string_indexofC8
2990 
2991 // Small strings are loaded through stack if they cross page boundary.
2992 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2993                                        Register cnt1, Register cnt2,
2994                                        int int_cnt2,  Register result,
2995                                        XMMRegister vec, Register tmp,
2996                                        int ae) {
2997   ShortBranchVerifier sbv(this);
2998   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2999   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3000 
3001   //
3002   // int_cnt2 is the length of a small (< 8 chars) constant substring
3003   // or (-1) for a non-constant substring, in which case its length
3004   // is in the cnt2 register.
3005   //
3006   // Note, inline_string_indexOf() generates checks:
3007   // if (substr.count > string.count) return -1;
3008   // if (substr.count == 0) return 0;
3009   //
3010   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3011   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3012   // This method uses the pcmpestri instruction with bound registers
3013   //   inputs:
3014   //     xmm - substring
3015   //     rax - substring length (elements count)
3016   //     mem - scanned string
3017   //     rdx - string length (elements count)
3018   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3019   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3020   //   outputs:
3021   //     rcx - matched index in string
3022   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3023   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3024   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3025   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3026 
3027   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3028         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3029         FOUND_CANDIDATE;
3030 
3031   { //========================================================
3032     // We don't know where these strings are located
3033     // and we can't read beyond them. Load them through stack.
3034     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3035 
3036     movptr(tmp, rsp); // save old SP
3037 
3038     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3039       if (int_cnt2 == (1>>scale2)) { // One byte
3040         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3041         load_unsigned_byte(result, Address(str2, 0));
3042         movdl(vec, result); // move 32 bits
3043       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3044         // Not enough header space in 32-bit VM: 12+3 = 15.
3045         movl(result, Address(str2, -1));
3046         shrl(result, 8);
3047         movdl(vec, result); // move 32 bits
3048       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3049         load_unsigned_short(result, Address(str2, 0));
3050         movdl(vec, result); // move 32 bits
3051       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3052         movdl(vec, Address(str2, 0)); // move 32 bits
3053       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3054         movq(vec, Address(str2, 0));  // move 64 bits
3055       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3056         // Array header size is 12 bytes in 32-bit VM
3057         // + 6 bytes for 3 chars == 18 bytes,
3058         // enough space to load vec and shift.
3059         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3060         if (ae == StrIntrinsicNode::UL) {
3061           int tail_off = int_cnt2-8;
3062           pmovzxbw(vec, Address(str2, tail_off));
3063           psrldq(vec, -2*tail_off);
3064         }
3065         else {
3066           int tail_off = int_cnt2*(1<<scale2);
3067           movdqu(vec, Address(str2, tail_off-16));
3068           psrldq(vec, 16-tail_off);
3069         }
3070       }
3071     } else { // not constant substring
3072       cmpl(cnt2, stride);
3073       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3074 
3075       // We can read beyond the string if str+16 does not cross a page boundary,
3076       // since heaps are aligned and mapped by pages.
3077       assert(os::vm_page_size() < (int)G, "default page should be small");
3078       movl(result, str2); // We need only low 32 bits
3079       andl(result, ((int)os::vm_page_size()-1));
3080       cmpl(result, ((int)os::vm_page_size()-16));
3081       jccb(Assembler::belowEqual, CHECK_STR);
3082 
3083       // Move small strings to the stack to allow loading 16 bytes into vec.
3084       subptr(rsp, 16);
3085       int stk_offset = wordSize-(1<<scale2);
3086       push(cnt2);
3087 
3088       bind(COPY_SUBSTR);
3089       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3090         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3091         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3092       } else if (ae == StrIntrinsicNode::UU) {
3093         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3094         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3095       }
3096       decrement(cnt2);
3097       jccb(Assembler::notZero, COPY_SUBSTR);
3098 
3099       pop(cnt2);
3100       movptr(str2, rsp);  // New substring address
3101     } // non constant
3102 
3103     bind(CHECK_STR);
3104     cmpl(cnt1, stride);
3105     jccb(Assembler::aboveEqual, BIG_STRINGS);
3106 
3107     // Check cross page boundary.
3108     movl(result, str1); // We need only low 32 bits
3109     andl(result, ((int)os::vm_page_size()-1));
3110     cmpl(result, ((int)os::vm_page_size()-16));
3111     jccb(Assembler::belowEqual, BIG_STRINGS);
3112 
3113     subptr(rsp, 16);
3114     int stk_offset = -(1<<scale1);
3115     if (int_cnt2 < 0) { // not constant
3116       push(cnt2);
3117       stk_offset += wordSize;
3118     }
3119     movl(cnt2, cnt1);
3120 
3121     bind(COPY_STR);
3122     if (ae == StrIntrinsicNode::LL) {
3123       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3124       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3125     } else {
3126       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3127       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3128     }
3129     decrement(cnt2);
3130     jccb(Assembler::notZero, COPY_STR);
3131 
3132     if (int_cnt2 < 0) { // not constant
3133       pop(cnt2);
3134     }
3135     movptr(str1, rsp);  // New string address
3136 
3137     bind(BIG_STRINGS);
3138     // Load substring.
3139     if (int_cnt2 < 0) { // -1
3140       if (ae == StrIntrinsicNode::UL) {
3141         pmovzxbw(vec, Address(str2, 0));
3142       } else {
3143         movdqu(vec, Address(str2, 0));
3144       }
3145       push(cnt2);       // substr count
3146       push(str2);       // substr addr
3147       push(str1);       // string addr
3148     } else {
3149       // Small (< 8 chars) constant substrings are loaded already.
3150       movl(cnt2, int_cnt2);
3151     }
3152     push(tmp);  // original SP
3153 
3154   } // Finished loading
3155 
3156   //========================================================
3157   // Start search
3158   //
3159 
3160   movptr(result, str1); // string addr
3161 
3162   if (int_cnt2  < 0) {  // Only for non constant substring
3163     jmpb(SCAN_TO_SUBSTR);
3164 
3165     // SP saved at sp+0
3166     // String saved at sp+1*wordSize
3167     // Substr saved at sp+2*wordSize
3168     // Substr count saved at sp+3*wordSize
3169 
3170     // Reload substr for rescan; this code
3171     // is executed only for large substrings (> 8 chars)
3172     bind(RELOAD_SUBSTR);
3173     movptr(str2, Address(rsp, 2*wordSize));
3174     movl(cnt2, Address(rsp, 3*wordSize));
3175     if (ae == StrIntrinsicNode::UL) {
3176       pmovzxbw(vec, Address(str2, 0));
3177     } else {
3178       movdqu(vec, Address(str2, 0));
3179     }
3180     // We came here after the beginning of the substring was
3181     // matched but the rest of it was not so we need to search
3182     // again. Start from the next element after the previous match.
3183     subptr(str1, result); // Restore counter
3184     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3185       shrl(str1, 1);
3186     }
3187     addl(cnt1, str1);
3188     decrementl(cnt1);   // Shift to next element
3189     cmpl(cnt1, cnt2);
3190     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3191 
3192     addptr(result, (1<<scale1));
3193   } // non constant
3194 
3195   // Scan string for start of substr in 16-byte vectors
3196   bind(SCAN_TO_SUBSTR);
3197   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3198   pcmpestri(vec, Address(result, 0), mode);
3199   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3200   subl(cnt1, stride);
3201   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3202   cmpl(cnt1, cnt2);
3203   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3204   addptr(result, 16);
3205 
3206   bind(ADJUST_STR);
3207   cmpl(cnt1, stride); // Do not read beyond string
3208   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3209   // Back-up string to avoid reading beyond string.
3210   lea(result, Address(result, cnt1, scale1, -16));
3211   movl(cnt1, stride);
3212   jmpb(SCAN_TO_SUBSTR);
3213 
3214   // Found a potential substr
3215   bind(FOUND_CANDIDATE);
3216   // After pcmpestri tmp(rcx) contains matched element index
3217 
3218   // Make sure string is still long enough
3219   subl(cnt1, tmp);
3220   cmpl(cnt1, cnt2);
3221   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3222   // Left less than substring.
3223 
3224   bind(RET_NOT_FOUND);
3225   movl(result, -1);
3226   jmp(CLEANUP);
3227 
3228   bind(FOUND_SUBSTR);
3229   // Compute start addr of substr
3230   lea(result, Address(result, tmp, scale1));
3231   if (int_cnt2 > 0) { // Constant substring
3232     // Repeat search for small substring (< 8 chars)
3233     // from new point without reloading substring.
3234     // Have to check that we don't read beyond string.
3235     cmpl(tmp, stride-int_cnt2);
3236     jccb(Assembler::greater, ADJUST_STR);
3237     // Fall through if matched whole substring.
3238   } else { // non constant
3239     assert(int_cnt2 == -1, "should be -1 for a non-constant substring");
3240 
3241     addl(tmp, cnt2);
3242     // Found result if we matched whole substring.
3243     cmpl(tmp, stride);
3244     jcc(Assembler::lessEqual, RET_FOUND);
3245 
3246     // Repeat search for small substring (<= 8 chars)
3247     // from new point 'str1' without reloading substring.
3248     cmpl(cnt2, stride);
3249     // Have to check that we don't read beyond string.
3250     jccb(Assembler::lessEqual, ADJUST_STR);
3251 
3252     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3253     // Compare the rest of substring (> 8 chars).
3254     movptr(str1, result);
3255 
3256     cmpl(tmp, cnt2);
3257     // First 8 chars are already matched.
3258     jccb(Assembler::equal, CHECK_NEXT);
3259 
3260     bind(SCAN_SUBSTR);
3261     pcmpestri(vec, Address(str1, 0), mode);
3262     // Need to reload the string pointers if the whole vector did not match
3263     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3264 
3265     bind(CHECK_NEXT);
3266     subl(cnt2, stride);
3267     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3268     addptr(str1, 16);
3269     if (ae == StrIntrinsicNode::UL) {
3270       addptr(str2, 8);
3271     } else {
3272       addptr(str2, 16);
3273     }
3274     subl(cnt1, stride);
3275     cmpl(cnt2, stride); // Do not read beyond substring
3276     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3277     // Back-up strings to avoid reading beyond substring.
3278 
3279     if (ae == StrIntrinsicNode::UL) {
3280       lea(str2, Address(str2, cnt2, scale2, -8));
3281       lea(str1, Address(str1, cnt2, scale1, -16));
3282     } else {
3283       lea(str2, Address(str2, cnt2, scale2, -16));
3284       lea(str1, Address(str1, cnt2, scale1, -16));
3285     }
3286     subl(cnt1, cnt2);
3287     movl(cnt2, stride);
3288     addl(cnt1, stride);
3289     bind(CONT_SCAN_SUBSTR);
3290     if (ae == StrIntrinsicNode::UL) {
3291       pmovzxbw(vec, Address(str2, 0));
3292     } else {
3293       movdqu(vec, Address(str2, 0));
3294     }
3295     jmp(SCAN_SUBSTR);
3296 
3297     bind(RET_FOUND_LONG);
3298     movptr(str1, Address(rsp, wordSize));
3299   } // non constant
3300 
3301   bind(RET_FOUND);
3302   // Compute substr offset
3303   subptr(result, str1);
3304   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3305     shrl(result, 1); // index
3306   }
3307   bind(CLEANUP);
3308   pop(rsp); // restore SP
3309 
3310 } // string_indexof
3311 
3312 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3313                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3314   ShortBranchVerifier sbv(this);
3315   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3316 
3317   int stride = 8;
3318 
3319   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3320         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3321         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3322         FOUND_SEQ_CHAR, DONE_LABEL;
3323 
3324   movptr(result, str1);
3325   if (UseAVX >= 2) {
3326     cmpl(cnt1, stride);
3327     jcc(Assembler::less, SCAN_TO_CHAR);
3328     cmpl(cnt1, 2*stride);
3329     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3330     movdl(vec1, ch);
3331     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3332     vpxor(vec2, vec2);
3333     movl(tmp, cnt1);
3334     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3335     andl(cnt1,0x0000000F);  //tail count (in chars)
3336 
3337     bind(SCAN_TO_16_CHAR_LOOP);
3338     vmovdqu(vec3, Address(result, 0));
3339     vpcmpeqw(vec3, vec3, vec1, 1);
3340     vptest(vec2, vec3);
3341     jcc(Assembler::carryClear, FOUND_CHAR);
3342     addptr(result, 32);
3343     subl(tmp, 2*stride);
3344     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3345     jmp(SCAN_TO_8_CHAR);
3346     bind(SCAN_TO_8_CHAR_INIT);
3347     movdl(vec1, ch);
3348     pshuflw(vec1, vec1, 0x00);
3349     pshufd(vec1, vec1, 0);
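         // pshuflw(0x00) replicates word 0 across the low quadword and
         // pshufd(0) then replicates that quadword, broadcasting the search
         // char into all 8 word lanes of vec1.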
3350     pxor(vec2, vec2);
3351   }
3352   bind(SCAN_TO_8_CHAR);
3353   cmpl(cnt1, stride);
3354   jcc(Assembler::less, SCAN_TO_CHAR);
3355   if (UseAVX < 2) {
3356     movdl(vec1, ch);
3357     pshuflw(vec1, vec1, 0x00);
3358     pshufd(vec1, vec1, 0);
3359     pxor(vec2, vec2);
3360   }
3361   movl(tmp, cnt1);
3362   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3363   andl(cnt1,0x00000007);  //tail count (in chars)
3364 
3365   bind(SCAN_TO_8_CHAR_LOOP);
3366   movdqu(vec3, Address(result, 0));
3367   pcmpeqw(vec3, vec1);
3368   ptest(vec2, vec3);
3369   jcc(Assembler::carryClear, FOUND_CHAR);
3370   addptr(result, 16);
3371   subl(tmp, stride);
3372   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3373   bind(SCAN_TO_CHAR);
3374   testl(cnt1, cnt1);
3375   jcc(Assembler::zero, RET_NOT_FOUND);
3376   bind(SCAN_TO_CHAR_LOOP);
3377   load_unsigned_short(tmp, Address(result, 0));
3378   cmpl(ch, tmp);
3379   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3380   addptr(result, 2);
3381   subl(cnt1, 1);
3382   jccb(Assembler::zero, RET_NOT_FOUND);
3383   jmp(SCAN_TO_CHAR_LOOP);
3384 
3385   bind(RET_NOT_FOUND);
3386   movl(result, -1);
3387   jmpb(DONE_LABEL);
3388 
3389   bind(FOUND_CHAR);
3390   if (UseAVX >= 2) {
3391     vpmovmskb(tmp, vec3);
3392   } else {
3393     pmovmskb(tmp, vec3);
3394   }
3395   bsfl(ch, tmp);
3396   addptr(result, ch);
3397 
3398   bind(FOUND_SEQ_CHAR);
3399   subptr(result, str1);
3400   shrl(result, 1);
3401 
3402   bind(DONE_LABEL);
3403 } // string_indexof_char
3404 
3405 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3406                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3407   ShortBranchVerifier sbv(this);
3408   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3409 
3410   int stride = 16;
3411 
3412   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3413         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3414         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3415         FOUND_SEQ_CHAR, DONE_LABEL;
3416 
3417   movptr(result, str1);
3418   if (UseAVX >= 2) {
3419     cmpl(cnt1, stride);
3420     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3421     cmpl(cnt1, stride*2);
3422     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3423     movdl(vec1, ch);
3424     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3425     vpxor(vec2, vec2);
3426     movl(tmp, cnt1);
3427     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3428     andl(cnt1,0x0000001F);  //tail count (in chars)
3429 
3430     bind(SCAN_TO_32_CHAR_LOOP);
3431     vmovdqu(vec3, Address(result, 0));
3432     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3433     vptest(vec2, vec3);
3434     jcc(Assembler::carryClear, FOUND_CHAR);
3435     addptr(result, 32);
3436     subl(tmp, stride*2);
3437     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3438     jmp(SCAN_TO_16_CHAR);
3439 
3440     bind(SCAN_TO_16_CHAR_INIT);
3441     movdl(vec1, ch);
3442     pxor(vec2, vec2);
3443     pshufb(vec1, vec2);
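         // pshufb with an all-zero shuffle mask replicates byte 0 of vec1 into
         // every byte lane, i.e. broadcasts the search character.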
3444   }
3445 
3446   bind(SCAN_TO_16_CHAR);
3447   cmpl(cnt1, stride);
3448   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3449   if (UseAVX < 2) {
3450     movdl(vec1, ch);
3451     pxor(vec2, vec2);
3452     pshufb(vec1, vec2);
3453   }
3454   movl(tmp, cnt1);
3455   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3456   andl(cnt1,0x0000000F);  //tail count (in bytes)
3457 
3458   bind(SCAN_TO_16_CHAR_LOOP);
3459   movdqu(vec3, Address(result, 0));
3460   pcmpeqb(vec3, vec1);
3461   ptest(vec2, vec3);
3462   jcc(Assembler::carryClear, FOUND_CHAR);
3463   addptr(result, 16);
3464   subl(tmp, stride);
3465   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3466 
3467   bind(SCAN_TO_CHAR_INIT);
3468   testl(cnt1, cnt1);
3469   jcc(Assembler::zero, RET_NOT_FOUND);
3470   bind(SCAN_TO_CHAR_LOOP);
3471   load_unsigned_byte(tmp, Address(result, 0));
3472   cmpl(ch, tmp);
3473   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3474   addptr(result, 1);
3475   subl(cnt1, 1);
3476   jccb(Assembler::zero, RET_NOT_FOUND);
3477   jmp(SCAN_TO_CHAR_LOOP);
3478 
3479   bind(RET_NOT_FOUND);
3480   movl(result, -1);
3481   jmpb(DONE_LABEL);
3482 
3483   bind(FOUND_CHAR);
3484   if (UseAVX >= 2) {
3485     vpmovmskb(tmp, vec3);
3486   } else {
3487     pmovmskb(tmp, vec3);
3488   }
3489   bsfl(ch, tmp);
3490   addptr(result, ch);
3491 
3492   bind(FOUND_SEQ_CHAR);
3493   subptr(result, str1);
3494 
3495   bind(DONE_LABEL);
3496 } // stringL_indexof_char
3497 
3498 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3499   switch (eltype) {
3500   case T_BOOLEAN: return sizeof(jboolean);
3501   case T_BYTE:  return sizeof(jbyte);
3502   case T_SHORT: return sizeof(jshort);
3503   case T_CHAR:  return sizeof(jchar);
3504   case T_INT:   return sizeof(jint);
3505   default:
3506     ShouldNotReachHere();
3507     return -1;
3508   }
3509 }
3510 
3511 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3512   switch (eltype) {
3513   // T_BOOLEAN used as surrogate for unsigned byte
3514   case T_BOOLEAN: movzbl(dst, src);   break;
3515   case T_BYTE:    movsbl(dst, src);   break;
3516   case T_SHORT:   movswl(dst, src);   break;
3517   case T_CHAR:    movzwl(dst, src);   break;
3518   case T_INT:     movl(dst, src);     break;
3519   default:
3520     ShouldNotReachHere();
3521   }
3522 }
3523 
3524 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3525   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3526 }
3527 
3528 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3529   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3530 }
3531 
3532 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3533   const int vlen = Assembler::AVX_256bit;
3534   switch (eltype) {
3535   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3536   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3537   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3538   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3539   case T_INT:
3540     // do nothing
3541     break;
3542   default:
3543     ShouldNotReachHere();
3544   }
3545 }
3546 
3547 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3548                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3549                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3550                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3551                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3552                                         BasicType eltype) {
3553   ShortBranchVerifier sbv(this);
3554   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3555   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3556   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3557 
3558   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3559         SHORT_UNROLLED_LOOP_EXIT,
3560         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3561         UNROLLED_VECTOR_LOOP_BEGIN,
3562         END;
3563   switch (eltype) {
3564   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3565   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3566   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3567   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3568   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3569   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3570   }
3571 
3572   // "Renaming" for readability of the code
3573   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3574                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3575                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3576 
3577   const int elsize = arrays_hashcode_elsize(eltype);
3578 
3579   /*
3580     if (cnt1 >= 2) {
3581       if (cnt1 >= 32) {
3582         UNROLLED VECTOR LOOP
3583       }
3584       UNROLLED SCALAR LOOP
3585     }
3586     SINGLE SCALAR
3587    */
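
       /*
         The math behind the vectorized loop, as a sketch (assuming the stub
         table power_of_31_backwards holds [31^32, 31^31, ..., 31^1, 31^0]):

           scalar step:   h = h*31 + a[i]
           per 32-block:  h = h*31^32 + sum_{k=0..31} a[i+k] * 31^(31-k)

         Each iteration multiplies the running result and the four 8-lane
         accumulators by 31^32 (== power_of_31_backwards[0]) and adds the next
         32 elements; after the loop the accumulators are multiplied lane-wise
         by the remaining powers 31^31 .. 31^0 and folded into result with an
         add-reduction.
        */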
3588 
3589   cmpl(cnt1, 32);
3590   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3591 
3592   // cnt1 >= 32 && generate_vectorized_loop
3593   xorl(index, index);
3594 
3595   // vresult = IntVector.zero(I256);
3596   for (int idx = 0; idx < 4; idx++) {
3597     vpxor(vresult[idx], vresult[idx]);
3598   }
3599   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3600   Register bound = tmp2;
3601   Register next = tmp3;
3602   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3603   movl(next, Address(tmp2, 0));
3604   movdl(vnext, next);
3605   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3606 
3607   // index = 0;
3608   // bound = cnt1 & ~(32 - 1);
3609   movl(bound, cnt1);
3610   andl(bound, ~(32 - 1));
3611   // for (; index < bound; index += 32) {
3612   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3613   // result *= next;
3614   imull(result, next);
3615   // Loop fission to front-load the cost of fetching from memory; OoO execution
3616   // can then hopefully do a better job of prefetching.
3617   for (int idx = 0; idx < 4; idx++) {
3618     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3619   }
3620   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3621   for (int idx = 0; idx < 4; idx++) {
3622     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3623     arrays_hashcode_elvcast(vtmp[idx], eltype);
3624     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3625   }
3626   // index += 32;
3627   addl(index, 32);
3628   // index < bound;
3629   cmpl(index, bound);
3630   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3631   // }
3632 
3633   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3634   subl(cnt1, bound);
3635   // release bound
3636 
3637   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3638   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3639   for (int idx = 0; idx < 4; idx++) {
3640     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT);
3641     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3642   }
3643   // result += vresult.reduceLanes(ADD);
3644   for (int idx = 0; idx < 4; idx++) {
3645     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3646   }
3647 
3648   // } else if (cnt1 < 32) {
3649 
3650   bind(SHORT_UNROLLED_BEGIN);
3651   // int i = 1;
3652   movl(index, 1);
3653   cmpl(index, cnt1);
3654   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3655 
3656   // for (; i < cnt1 ; i += 2) {
3657   bind(SHORT_UNROLLED_LOOP_BEGIN);
3658   movl(tmp3, 961);
3659   imull(result, tmp3);
3660   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3661   movl(tmp3, tmp2);
3662   shll(tmp3, 5);
3663   subl(tmp3, tmp2);
3664   addl(result, tmp3);
3665   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3666   addl(result, tmp3);
3667   addl(index, 2);
3668   cmpl(index, cnt1);
3669   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3670 
3671   // }
3672   // if (i >= cnt1) {
3673   bind(SHORT_UNROLLED_LOOP_EXIT);
3674   jccb(Assembler::greater, END);
3675   movl(tmp2, result);
3676   shll(result, 5);
3677   subl(result, tmp2);
3678   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3679   addl(result, tmp3);
3680   // }
3681   bind(END);
3682 
3683   BLOCK_COMMENT("} // arrays_hashcode");
3684 
3685 } // arrays_hashcode
3686 
3687 // helper function for string_compare
3688 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3689                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3690                                            Address::ScaleFactor scale2, Register index, int ae) {
3691   if (ae == StrIntrinsicNode::LL) {
3692     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3693     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3694   } else if (ae == StrIntrinsicNode::UU) {
3695     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3696     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3697   } else {
3698     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3699     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3700   }
3701 }
3702 
3703 // Compare strings, used for char[] and byte[].
3704 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3705                                        Register cnt1, Register cnt2, Register result,
3706                                        XMMRegister vec1, int ae, KRegister mask) {
3707   ShortBranchVerifier sbv(this);
3708   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3709   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3710   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3711   int stride2x2 = 0x40;
3712   Address::ScaleFactor scale = Address::no_scale;
3713   Address::ScaleFactor scale1 = Address::no_scale;
3714   Address::ScaleFactor scale2 = Address::no_scale;
3715 
3716   if (ae != StrIntrinsicNode::LL) {
3717     stride2x2 = 0x20;
3718   }
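       // stride2x2 is the element count consumed by one 64-byte iteration of the
       // AVX-512 loop below: 0x40 byte elements for LL, 0x20 char-sized elements
       // otherwise.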
3719 
3720   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3721     shrl(cnt2, 1);
3722   }
3723   // Compute the minimum of the string lengths and the
3724   // difference of the string lengths (stack).
3725   // Do the conditional move stuff
3726   movl(result, cnt1);
3727   subl(cnt1, cnt2);
3728   push(cnt1);
3729   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3730 
3731   // Is the minimum length zero?
3732   testl(cnt2, cnt2);
3733   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3734   if (ae == StrIntrinsicNode::LL) {
3735     // Load first bytes
3736     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3737     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3738   } else if (ae == StrIntrinsicNode::UU) {
3739     // Load first characters
3740     load_unsigned_short(result, Address(str1, 0));
3741     load_unsigned_short(cnt1, Address(str2, 0));
3742   } else {
3743     load_unsigned_byte(result, Address(str1, 0));
3744     load_unsigned_short(cnt1, Address(str2, 0));
3745   }
3746   subl(result, cnt1);
3747   jcc(Assembler::notZero,  POP_LABEL);
3748 
3749   if (ae == StrIntrinsicNode::UU) {
3750     // Divide length by 2 to get number of chars
3751     shrl(cnt2, 1);
3752   }
3753   cmpl(cnt2, 1);
3754   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3755 
3756   // Check if the strings start at the same location and set up scale and stride
3757   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3758     cmpptr(str1, str2);
3759     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3760     if (ae == StrIntrinsicNode::LL) {
3761       scale = Address::times_1;
3762       stride = 16;
3763     } else {
3764       scale = Address::times_2;
3765       stride = 8;
3766     }
3767   } else {
3768     scale1 = Address::times_1;
3769     scale2 = Address::times_2;
3770     // scale not used
3771     stride = 8;
3772   }
3773 
3774   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3775     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3776     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3777     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3778     Label COMPARE_TAIL_LONG;
3779     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3780 
3781     int pcmpmask = 0x19;
3782     if (ae == StrIntrinsicNode::LL) {
3783       pcmpmask &= ~0x01;
3784     }
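         // pcmpestri imm8 0x19 (0x18 for LL) selects, per the SSE4.2 encoding:
         // unsigned words (bytes for LL), "equal each" element-wise comparison,
         // and negative polarity, so rcx reports the first mismatching element
         // and CF=1 means a mismatch exists within the compared range.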
3785 
3786     // Set up to compare 16-char (32-byte) vectors;
3787     // start from the first character again because it has an aligned address.
3788     if (ae == StrIntrinsicNode::LL) {
3789       stride2 = 32;
3790     } else {
3791       stride2 = 16;
3792     }
3793     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3794       adr_stride = stride << scale;
3795     } else {
3796       adr_stride1 = 8;  //stride << scale1;
3797       adr_stride2 = 16; //stride << scale2;
3798     }
3799 
3800     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3801     // rax and rdx are used by pcmpestri as elements counters
3802     movl(result, cnt2);
3803     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3804     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3805 
3806     // fast path : compare first 2 8-char vectors.
3807     bind(COMPARE_16_CHARS);
3808     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3809       movdqu(vec1, Address(str1, 0));
3810     } else {
3811       pmovzxbw(vec1, Address(str1, 0));
3812     }
3813     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3814     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3815 
3816     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3817       movdqu(vec1, Address(str1, adr_stride));
3818       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3819     } else {
3820       pmovzxbw(vec1, Address(str1, adr_stride1));
3821       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3822     }
3823     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3824     addl(cnt1, stride);
3825 
3826     // Compare the characters at index in cnt1
3827     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3828     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3829     subl(result, cnt2);
3830     jmp(POP_LABEL);
3831 
3832     // Setup the registers to start vector comparison loop
3833     bind(COMPARE_WIDE_VECTORS);
3834     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3835       lea(str1, Address(str1, result, scale));
3836       lea(str2, Address(str2, result, scale));
3837     } else {
3838       lea(str1, Address(str1, result, scale1));
3839       lea(str2, Address(str2, result, scale2));
3840     }
3841     subl(result, stride2);
3842     subl(cnt2, stride2);
3843     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3844     negptr(result);
3845 
3846     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3847     bind(COMPARE_WIDE_VECTORS_LOOP);
3848 
3849 #ifdef _LP64
3850     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3851       cmpl(cnt2, stride2x2);
3852       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3853       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3854       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3855 
3856       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3857       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3858         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3859         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3860       } else {
3861         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3862         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3863       }
3864       kortestql(mask, mask);
3865       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3866       addptr(result, stride2x2);  // update since we already compared at this addr
3867       subl(cnt2, stride2x2);      // and sub the size too
3868       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3869 
3870       vpxor(vec1, vec1);
3871       jmpb(COMPARE_WIDE_TAIL);
3872     }//if (VM_Version::supports_avx512vlbw())
3873 #endif // _LP64
3874 
3875 
3876     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3877     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3878       vmovdqu(vec1, Address(str1, result, scale));
3879       vpxor(vec1, Address(str2, result, scale));
3880     } else {
3881       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3882       vpxor(vec1, Address(str2, result, scale2));
3883     }
3884     vptest(vec1, vec1);
3885     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3886     addptr(result, stride2);
3887     subl(cnt2, stride2);
3888     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3889     // clean upper bits of YMM registers
3890     vpxor(vec1, vec1);
3891 
3892     // compare wide vectors tail
3893     bind(COMPARE_WIDE_TAIL);
3894     testptr(result, result);
3895     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3896 
3897     movl(result, stride2);
3898     movl(cnt2, result);
3899     negptr(result);
3900     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3901 
3902     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3903     bind(VECTOR_NOT_EQUAL);
3904     // clean upper bits of YMM registers
3905     vpxor(vec1, vec1);
3906     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3907       lea(str1, Address(str1, result, scale));
3908       lea(str2, Address(str2, result, scale));
3909     } else {
3910       lea(str1, Address(str1, result, scale1));
3911       lea(str2, Address(str2, result, scale2));
3912     }
3913     jmp(COMPARE_16_CHARS);
3914 
3915     // Compare tail chars, length between 1 and 15 chars
3916     bind(COMPARE_TAIL_LONG);
3917     movl(cnt2, result);
3918     cmpl(cnt2, stride);
3919     jcc(Assembler::less, COMPARE_SMALL_STR);
3920 
3921     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3922       movdqu(vec1, Address(str1, 0));
3923     } else {
3924       pmovzxbw(vec1, Address(str1, 0));
3925     }
3926     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3927     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3928     subptr(cnt2, stride);
3929     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3930     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3931       lea(str1, Address(str1, result, scale));
3932       lea(str2, Address(str2, result, scale));
3933     } else {
3934       lea(str1, Address(str1, result, scale1));
3935       lea(str2, Address(str2, result, scale2));
3936     }
3937     negptr(cnt2);
3938     jmpb(WHILE_HEAD_LABEL);
3939 
3940     bind(COMPARE_SMALL_STR);
3941   } else if (UseSSE42Intrinsics) {
3942     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3943     int pcmpmask = 0x19;
3944     // Set up to compare 8-char (16-byte) vectors;
3945     // start from the first character again because it has an aligned address.
3946     movl(result, cnt2);
3947     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3948     if (ae == StrIntrinsicNode::LL) {
3949       pcmpmask &= ~0x01;
3950     }
3951     jcc(Assembler::zero, COMPARE_TAIL);
3952     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3953       lea(str1, Address(str1, result, scale));
3954       lea(str2, Address(str2, result, scale));
3955     } else {
3956       lea(str1, Address(str1, result, scale1));
3957       lea(str2, Address(str2, result, scale2));
3958     }
3959     negptr(result);
3960 
3961     // pcmpestri
3962     //   inputs:
3963     //     vec1- substring
3964     //     rax - negative string length (elements count)
3965     //     mem - scanned string
3966     //     rdx - string length (elements count)
3967     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3968     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3969     //   outputs:
3970     //     rcx - first mismatched element index
3971     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3972 
3973     bind(COMPARE_WIDE_VECTORS);
3974     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3975       movdqu(vec1, Address(str1, result, scale));
3976       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3977     } else {
3978       pmovzxbw(vec1, Address(str1, result, scale1));
3979       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3980     }
3981     // After pcmpestri cnt1(rcx) contains mismatched element index
3982 
3983     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3984     addptr(result, stride);
3985     subptr(cnt2, stride);
3986     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3987 
3988     // compare wide vectors tail
3989     testptr(result, result);
3990     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3991 
3992     movl(cnt2, stride);
3993     movl(result, stride);
3994     negptr(result);
3995     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3996       movdqu(vec1, Address(str1, result, scale));
3997       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3998     } else {
3999       pmovzxbw(vec1, Address(str1, result, scale1));
4000       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4001     }
4002     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4003 
4004     // Mismatched characters in the vectors
4005     bind(VECTOR_NOT_EQUAL);
4006     addptr(cnt1, result);
4007     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4008     subl(result, cnt2);
4009     jmpb(POP_LABEL);
4010 
4011     bind(COMPARE_TAIL); // limit is zero
4012     movl(cnt2, result);
4013     // Fallthru to tail compare
4014   }
4015   // Shift str2 and str1 to the end of the arrays, negate min
4016   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4017     lea(str1, Address(str1, cnt2, scale));
4018     lea(str2, Address(str2, cnt2, scale));
4019   } else {
4020     lea(str1, Address(str1, cnt2, scale1));
4021     lea(str2, Address(str2, cnt2, scale2));
4022   }
4023   decrementl(cnt2);  // first character was compared already
4024   negptr(cnt2);
4025 
4026   // Compare the rest of the elements
4027   bind(WHILE_HEAD_LABEL);
4028   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4029   subl(result, cnt1);
4030   jccb(Assembler::notZero, POP_LABEL);
4031   increment(cnt2);
4032   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4033 
4034   // Strings are equal up to min length.  Return the length difference.
4035   bind(LENGTH_DIFF_LABEL);
4036   pop(result);
4037   if (ae == StrIntrinsicNode::UU) {
4038     // Divide diff by 2 to get number of chars
4039     sarl(result, 1);
4040   }
4041   jmpb(DONE_LABEL);
4042 
4043 #ifdef _LP64
4044   if (VM_Version::supports_avx512vlbw()) {
4045 
4046     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4047 
4048     kmovql(cnt1, mask);
4049     notq(cnt1);
4050     bsfq(cnt2, cnt1);
4051     if (ae != StrIntrinsicNode::LL) {
4052       // Divide diff by 2 to get number of chars
4053       sarl(cnt2, 1);
4054     }
4055     addq(result, cnt2);
4056     if (ae == StrIntrinsicNode::LL) {
4057       load_unsigned_byte(cnt1, Address(str2, result));
4058       load_unsigned_byte(result, Address(str1, result));
4059     } else if (ae == StrIntrinsicNode::UU) {
4060       load_unsigned_short(cnt1, Address(str2, result, scale));
4061       load_unsigned_short(result, Address(str1, result, scale));
4062     } else {
4063       load_unsigned_short(cnt1, Address(str2, result, scale2));
4064       load_unsigned_byte(result, Address(str1, result, scale1));
4065     }
4066     subl(result, cnt1);
4067     jmpb(POP_LABEL);
4068   }//if (VM_Version::supports_avx512vlbw())
4069 #endif // _LP64
4070 
4071   // Discard the stored length difference
4072   bind(POP_LABEL);
4073   pop(cnt1);
4074 
4075   // That's it
4076   bind(DONE_LABEL);
4077   if(ae == StrIntrinsicNode::UL) {
4078     negl(result);
4079   }
4080 
4081 }
4082 
4083 // Search for Non-ASCII character (Negative byte value) in a byte array,
4084 // return the index of the first such character, otherwise the length
4085 // of the array segment searched.
4086 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4087 //   @IntrinsicCandidate
4088 //   public static int countPositives(byte[] ba, int off, int len) {
4089 //     for (int i = off; i < off + len; i++) {
4090 //       if (ba[i] < 0) {
4091 //         return i - off;
4092 //       }
4093 //     }
4094 //     return len;
4095 //   }
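     //
     // Implementation note: negative bytes are detected via their sign bit.
     // The AVX-512 path below compares a zero vector against the data with
     // evpcmpgtb (0 > b  <=>  b < 0) and inspects the resulting k-mask; the
     // AVX2 path ANDs/PTESTs the data against a broadcast 0x80808080 pattern.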
4096 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4097   Register result, Register tmp1,
4098   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4099   // rsi: byte array
4100   // rcx: len
4101   // rax: result
4102   ShortBranchVerifier sbv(this);
4103   assert_different_registers(ary1, len, result, tmp1);
4104   assert_different_registers(vec1, vec2);
4105   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4106 
4107   movl(result, len); // copy
4108   // len == 0
4109   testl(len, len);
4110   jcc(Assembler::zero, DONE);
4111 
4112   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4113     VM_Version::supports_avx512vlbw() &&
4114     VM_Version::supports_bmi2()) {
4115 
4116     Label test_64_loop, test_tail, BREAK_LOOP;
4117     movl(tmp1, len);
4118     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4119 
4120     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4121     andl(len,  0xffffffc0); // vector count (in chars)
4122     jccb(Assembler::zero, test_tail);
4123 
4124     lea(ary1, Address(ary1, len, Address::times_1));
4125     negptr(len);
4126 
4127     bind(test_64_loop);
4128     // Check whether our 64 byte-sized elements contain negatives
4129     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4130     kortestql(mask1, mask1);
4131     jcc(Assembler::notZero, BREAK_LOOP);
4132 
4133     addptr(len, 64);
4134     jccb(Assembler::notZero, test_64_loop);
4135 
4136     bind(test_tail);
4137     // bail out when there is nothing to be done
4138     testl(tmp1, -1);
4139     jcc(Assembler::zero, DONE);
4140 
4141 
4142     // Check the tail for the absence of negatives:
4143     // ~(~0 << len), applied up to two times (for the 32-bit scenario)
4144 #ifdef _LP64
4145     {
4146       Register tmp3_aliased = len;
4147       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4148       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4149       notq(tmp3_aliased);
4150       kmovql(mask2, tmp3_aliased);
4151     }
4152 #else
4153     Label k_init;
4154     jmp(k_init);
4155 
    // We cannot load 64 bits at once from a general purpose register, so we
    // place the data needed to compose 64 consecutive 1's into the instruction
    // stream: a 64-byte series of the values 0..63, which is later used as the
    // compare target against the tail count held in the tmp1 register. The
    // result is a k register with tmp1 consecutive 1's set, counting from the
    // least significant bit.
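    // Illustrative walk-through (not emitted code): with tmp1 = 5, the
    // broadcast below fills vec1 with the byte value 5, and the signed compare
    // 5 > {0, 1, 2, ..., 63} holds only for the first five table entries,
    // leaving mask2 = 0b11111.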
4162     address tmp = pc();
4163     emit_int64(0x0706050403020100);
4164     emit_int64(0x0F0E0D0C0B0A0908);
4165     emit_int64(0x1716151413121110);
4166     emit_int64(0x1F1E1D1C1B1A1918);
4167     emit_int64(0x2726252423222120);
4168     emit_int64(0x2F2E2D2C2B2A2928);
4169     emit_int64(0x3736353433323130);
4170     emit_int64(0x3F3E3D3C3B3A3938);
4171 
4172     bind(k_init);
4173     lea(len, InternalAddress(tmp));
4174     // create mask to test for negative byte inside a vector
4175     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4176     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4177 
4178 #endif
4179     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4180     ktestq(mask1, mask2);
4181     jcc(Assembler::zero, DONE);
4182 
    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
                     // ary1 already points to the right place
4186     jmpb(TAIL_START);
4187 
4188     bind(BREAK_LOOP);
4189     // At least one byte in the last 64 byte block was negative.
4190     // Set up to look at the last 64 bytes as if they were a tail
4191     lea(ary1, Address(ary1, len, Address::times_1));
4192     addptr(result, len);
4193     // Ignore the very last byte: if all others are positive,
4194     // it must be negative, so we can skip right to the 2+1 byte
4195     // end comparison at this point
4196     orl(result, 63);
4197     movl(len, 63);
4198     // Fallthru to tail compare
4199   } else {
4200 
4201     if (UseAVX >= 2 && UseSSE >= 2) {
4202       // With AVX2, use 32-byte vector compare
4203       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4204 
4205       // Compare 32-byte vectors
4206       testl(len, 0xffffffe0);   // vector count (in bytes)
4207       jccb(Assembler::zero, TAIL_START);
4208 
4209       andl(len, 0xffffffe0);
4210       lea(ary1, Address(ary1, len, Address::times_1));
4211       negptr(len);
4212 
4213       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4214       movdl(vec2, tmp1);
4215       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4216 
4217       bind(COMPARE_WIDE_VECTORS);
4218       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4219       vptest(vec1, vec2);
4220       jccb(Assembler::notZero, BREAK_LOOP);
4221       addptr(len, 32);
4222       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4223 
4224       testl(result, 0x0000001f);   // any bytes remaining?
4225       jcc(Assembler::zero, DONE);
4226 
4227       // Quick test using the already prepared vector mask
4228       movl(len, result);
4229       andl(len, 0x0000001f);
4230       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4231       vptest(vec1, vec2);
4232       jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
4234       jmpb(TAIL_START);
4235 
4236       bind(BREAK_LOOP);
4237       // At least one byte in the last 32-byte vector is negative.
4238       // Set up to look at the last 32 bytes as if they were a tail
4239       lea(ary1, Address(ary1, len, Address::times_1));
4240       addptr(result, len);
4241       // Ignore the very last byte: if all others are positive,
4242       // it must be negative, so we can skip right to the 2+1 byte
4243       // end comparison at this point
4244       orl(result, 31);
4245       movl(len, 31);
4246       // Fallthru to tail compare
4247     } else if (UseSSE42Intrinsics) {
4248       // With SSE4.2, use double quad vector compare
4249       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4250 
4251       // Compare 16-byte vectors
4252       testl(len, 0xfffffff0);   // vector count (in bytes)
4253       jcc(Assembler::zero, TAIL_START);
4254 
4255       andl(len, 0xfffffff0);
4256       lea(ary1, Address(ary1, len, Address::times_1));
4257       negptr(len);
4258 
4259       movl(tmp1, 0x80808080);
4260       movdl(vec2, tmp1);
4261       pshufd(vec2, vec2, 0);
4262 
4263       bind(COMPARE_WIDE_VECTORS);
4264       movdqu(vec1, Address(ary1, len, Address::times_1));
4265       ptest(vec1, vec2);
4266       jccb(Assembler::notZero, BREAK_LOOP);
4267       addptr(len, 16);
4268       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4269 
4270       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4271       jcc(Assembler::zero, DONE);
4272 
4273       // Quick test using the already prepared vector mask
4274       movl(len, result);
4275       andl(len, 0x0000000f);   // tail count (in bytes)
4276       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4277       ptest(vec1, vec2);
4278       jcc(Assembler::zero, DONE);
4279       jmpb(TAIL_START);
4280 
4281       bind(BREAK_LOOP);
4282       // At least one byte in the last 16-byte vector is negative.
4283       // Set up and look at the last 16 bytes as if they were a tail
4284       lea(ary1, Address(ary1, len, Address::times_1));
4285       addptr(result, len);
4286       // Ignore the very last byte: if all others are positive,
4287       // it must be negative, so we can skip right to the 2+1 byte
4288       // end comparison at this point
4289       orl(result, 15);
4290       movl(len, 15);
4291       // Fallthru to tail compare
4292     }
4293   }
4294 
4295   bind(TAIL_START);
4296   // Compare 4-byte vectors
4297   andl(len, 0xfffffffc); // vector count (in bytes)
4298   jccb(Assembler::zero, COMPARE_CHAR);
4299 
4300   lea(ary1, Address(ary1, len, Address::times_1));
4301   negptr(len);
4302 
4303   bind(COMPARE_VECTORS);
4304   movl(tmp1, Address(ary1, len, Address::times_1));
4305   andl(tmp1, 0x80808080);
4306   jccb(Assembler::notZero, TAIL_ADJUST);
4307   addptr(len, 4);
4308   jccb(Assembler::notZero, COMPARE_VECTORS);
4309 
4310   // Compare trailing char (final 2-3 bytes), if any
4311   bind(COMPARE_CHAR);
4312 
4313   testl(result, 0x2);   // tail  char
4314   jccb(Assembler::zero, COMPARE_BYTE);
4315   load_unsigned_short(tmp1, Address(ary1, 0));
4316   andl(tmp1, 0x00008080);
4317   jccb(Assembler::notZero, CHAR_ADJUST);
4318   lea(ary1, Address(ary1, 2));
4319 
4320   bind(COMPARE_BYTE);
4321   testl(result, 0x1);   // tail  byte
4322   jccb(Assembler::zero, DONE);
4323   load_unsigned_byte(tmp1, Address(ary1, 0));
4324   testl(tmp1, 0x00000080);
4325   jccb(Assembler::zero, DONE);
4326   subptr(result, 1);
4327   jmpb(DONE);
4328 
4329   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes
4332   addptr(result, len);
4333   orl(result, 3);
4334   lea(ary1, Address(ary1, len, Address::times_1));
4335   jmpb(COMPARE_CHAR);
4336 
4337   bind(CHAR_ADJUST);
4338   // We are looking at a char + optional byte tail, and found that one
4339   // of the bytes in the char is negative. Adjust the result, check the
4340   // first byte and readjust if needed.
4341   andl(result, 0xfffffffc);
4342   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4343   jccb(Assembler::notZero, DONE);
4344   addptr(result, 1);
4345 
4346   // That's it
4347   bind(DONE);
4348   if (UseAVX >= 2 && UseSSE >= 2) {
4349     // clean upper bits of YMM registers
4350     vpxor(vec1, vec1);
4351     vpxor(vec2, vec2);
4352   }
4353 }
4354 
4355 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4356 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4357                                       Register limit, Register result, Register chr,
4358                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4359                                       KRegister mask, bool expand_ary2) {
4360   // for expand_ary2, limit is the (smaller) size of the second array.
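  // When expand_ary2 is set, ary2 holds byte elements that are zero-extended
  // to 16-bit values on the fly (see the vpmovzxbw loads below) and compared
  // against ary1's 16-bit elements.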
4361   ShortBranchVerifier sbv(this);
4362   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4363 
4364   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4365          "Expansion only implemented for AVX2");
4366 
4367   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4368   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4369 
4370   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4371   int scaleIncr = expand_ary2 ? 8 : 16;
4372 
4373   if (is_array_equ) {
4374     // Check the input args
4375     cmpoop(ary1, ary2);
4376     jcc(Assembler::equal, TRUE_LABEL);
4377 
4378     // Need additional checks for arrays_equals.
4379     testptr(ary1, ary1);
4380     jcc(Assembler::zero, FALSE_LABEL);
4381     testptr(ary2, ary2);
4382     jcc(Assembler::zero, FALSE_LABEL);
4383 
4384     // Check the lengths
4385     movl(limit, Address(ary1, length_offset));
4386     cmpl(limit, Address(ary2, length_offset));
4387     jcc(Assembler::notEqual, FALSE_LABEL);
4388   }
4389 
4390   // count == 0
4391   testl(limit, limit);
4392   jcc(Assembler::zero, TRUE_LABEL);
4393 
4394   if (is_array_equ) {
4395     // Load array address
4396     lea(ary1, Address(ary1, base_offset));
4397     lea(ary2, Address(ary2, base_offset));
4398   }
4399 
4400   if (is_array_equ && is_char) {
4401     // arrays_equals when used for char[].
4402     shll(limit, 1);      // byte count != 0
4403   }
4404   movl(result, limit); // copy
4405 
4406   if (UseAVX >= 2) {
4407     // With AVX2, use 32-byte vector compare
4408     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4409 
4410     // Compare 32-byte vectors
4411     if (expand_ary2) {
4412       andl(result, 0x0000000f);  //   tail count (in bytes)
4413       andl(limit, 0xfffffff0);   // vector count (in bytes)
4414       jcc(Assembler::zero, COMPARE_TAIL);
4415     } else {
4416       andl(result, 0x0000001f);  //   tail count (in bytes)
4417       andl(limit, 0xffffffe0);   // vector count (in bytes)
4418       jcc(Assembler::zero, COMPARE_TAIL_16);
4419     }
4420 
4421     lea(ary1, Address(ary1, limit, scaleFactor));
4422     lea(ary2, Address(ary2, limit, Address::times_1));
4423     negptr(limit);
4424 
4425 #ifdef _LP64
4426     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4427       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4428 
4429       cmpl(limit, -64);
4430       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4431 
4432       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4433 
4434       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4435       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4436       kortestql(mask, mask);
4437       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4438       addptr(limit, 64);  // update since we already compared at this addr
4439       cmpl(limit, -64);
4440       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4441 
      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit, which are
      // no farther than 64 bytes from the ends of the arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may simply compare the last 64 bytes.
4450       //
4451       addptr(result, -64);   // it is safe, bc we just came from this area
4452       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4453       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4454       kortestql(mask, mask);
4455       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4456 
4457       jmp(TRUE_LABEL);
4458 
4459       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4460 
4461     }//if (VM_Version::supports_avx512vlbw())
4462 #endif //_LP64
4463     bind(COMPARE_WIDE_VECTORS);
4464     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4465     if (expand_ary2) {
4466       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4467     } else {
4468       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4469     }
4470     vpxor(vec1, vec2);
4471 
4472     vptest(vec1, vec1);
4473     jcc(Assembler::notZero, FALSE_LABEL);
4474     addptr(limit, scaleIncr * 2);
4475     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4476 
4477     testl(result, result);
4478     jcc(Assembler::zero, TRUE_LABEL);
4479 
4480     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4481     if (expand_ary2) {
4482       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4483     } else {
4484       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4485     }
4486     vpxor(vec1, vec2);
4487 
4488     vptest(vec1, vec1);
4489     jcc(Assembler::notZero, FALSE_LABEL);
4490     jmp(TRUE_LABEL);
4491 
4492     bind(COMPARE_TAIL_16); // limit is zero
4493     movl(limit, result);
4494 
4495     // Compare 16-byte chunks
4496     andl(result, 0x0000000f);  //   tail count (in bytes)
4497     andl(limit, 0xfffffff0);   // vector count (in bytes)
4498     jcc(Assembler::zero, COMPARE_TAIL);
4499 
4500     lea(ary1, Address(ary1, limit, scaleFactor));
4501     lea(ary2, Address(ary2, limit, Address::times_1));
4502     negptr(limit);
4503 
4504     bind(COMPARE_WIDE_VECTORS_16);
4505     movdqu(vec1, Address(ary1, limit, scaleFactor));
4506     if (expand_ary2) {
4507       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4508     } else {
4509       movdqu(vec2, Address(ary2, limit, Address::times_1));
4510     }
4511     pxor(vec1, vec2);
4512 
4513     ptest(vec1, vec1);
4514     jcc(Assembler::notZero, FALSE_LABEL);
4515     addptr(limit, scaleIncr);
4516     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4517 
4518     bind(COMPARE_TAIL); // limit is zero
4519     movl(limit, result);
4520     // Fallthru to tail compare
4521   } else if (UseSSE42Intrinsics) {
4522     // With SSE4.2, use double quad vector compare
4523     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4524 
4525     // Compare 16-byte vectors
4526     andl(result, 0x0000000f);  //   tail count (in bytes)
4527     andl(limit, 0xfffffff0);   // vector count (in bytes)
4528     jcc(Assembler::zero, COMPARE_TAIL);
4529 
4530     lea(ary1, Address(ary1, limit, Address::times_1));
4531     lea(ary2, Address(ary2, limit, Address::times_1));
4532     negptr(limit);
4533 
4534     bind(COMPARE_WIDE_VECTORS);
4535     movdqu(vec1, Address(ary1, limit, Address::times_1));
4536     movdqu(vec2, Address(ary2, limit, Address::times_1));
4537     pxor(vec1, vec2);
4538 
4539     ptest(vec1, vec1);
4540     jcc(Assembler::notZero, FALSE_LABEL);
4541     addptr(limit, 16);
4542     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4543 
4544     testl(result, result);
4545     jcc(Assembler::zero, TRUE_LABEL);
4546 
4547     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4548     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4549     pxor(vec1, vec2);
4550 
4551     ptest(vec1, vec1);
4552     jccb(Assembler::notZero, FALSE_LABEL);
4553     jmpb(TRUE_LABEL);
4554 
4555     bind(COMPARE_TAIL); // limit is zero
4556     movl(limit, result);
4557     // Fallthru to tail compare
4558   }
4559 
4560   // Compare 4-byte vectors
4561   if (expand_ary2) {
4562     testl(result, result);
4563     jccb(Assembler::zero, TRUE_LABEL);
4564   } else {
4565     andl(limit, 0xfffffffc); // vector count (in bytes)
4566     jccb(Assembler::zero, COMPARE_CHAR);
4567   }
4568 
4569   lea(ary1, Address(ary1, limit, scaleFactor));
4570   lea(ary2, Address(ary2, limit, Address::times_1));
4571   negptr(limit);
4572 
4573   bind(COMPARE_VECTORS);
4574   if (expand_ary2) {
4575     // There are no "vector" operations for bytes to shorts
4576     movzbl(chr, Address(ary2, limit, Address::times_1));
4577     cmpw(Address(ary1, limit, Address::times_2), chr);
4578     jccb(Assembler::notEqual, FALSE_LABEL);
4579     addptr(limit, 1);
4580     jcc(Assembler::notZero, COMPARE_VECTORS);
4581     jmp(TRUE_LABEL);
4582   } else {
4583     movl(chr, Address(ary1, limit, Address::times_1));
4584     cmpl(chr, Address(ary2, limit, Address::times_1));
4585     jccb(Assembler::notEqual, FALSE_LABEL);
4586     addptr(limit, 4);
4587     jcc(Assembler::notZero, COMPARE_VECTORS);
4588   }
4589 
4590   // Compare trailing char (final 2 bytes), if any
4591   bind(COMPARE_CHAR);
4592   testl(result, 0x2);   // tail  char
4593   jccb(Assembler::zero, COMPARE_BYTE);
4594   load_unsigned_short(chr, Address(ary1, 0));
4595   load_unsigned_short(limit, Address(ary2, 0));
4596   cmpl(chr, limit);
4597   jccb(Assembler::notEqual, FALSE_LABEL);
4598 
4599   if (is_array_equ && is_char) {
4600     bind(COMPARE_BYTE);
4601   } else {
4602     lea(ary1, Address(ary1, 2));
4603     lea(ary2, Address(ary2, 2));
4604 
4605     bind(COMPARE_BYTE);
4606     testl(result, 0x1);   // tail  byte
4607     jccb(Assembler::zero, TRUE_LABEL);
4608     load_unsigned_byte(chr, Address(ary1, 0));
4609     load_unsigned_byte(limit, Address(ary2, 0));
4610     cmpl(chr, limit);
4611     jccb(Assembler::notEqual, FALSE_LABEL);
4612   }
4613   bind(TRUE_LABEL);
4614   movl(result, 1);   // return true
4615   jmpb(DONE);
4616 
4617   bind(FALSE_LABEL);
4618   xorl(result, result); // return false
4619 
4620   // That's it
4621   bind(DONE);
4622   if (UseAVX >= 2) {
4623     // clean upper bits of YMM registers
4624     vpxor(vec1, vec1);
4625     vpxor(vec2, vec2);
4626   }
4627 }
4628 
4629 #ifdef _LP64
4630 
4631 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4632 #define __ masm.
4633   Register dst = stub.data<0>();
4634   XMMRegister src = stub.data<1>();
4635   address target = stub.data<2>();
4636   __ bind(stub.entry());
4637   __ subptr(rsp, 8);
4638   __ movdbl(Address(rsp), src);
4639   __ call(RuntimeAddress(target));
4640   __ pop(dst);
4641   __ jmp(stub.continuation());
4642 #undef __
4643 }
4644 
4645 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4646   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4647   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
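  // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 for
  // 32-bit destinations, 0x8000000000000000 for 64-bit ones) when the source
  // is NaN or out of range. Java instead requires NaN -> 0 and saturation to
  // MIN/MAX, so we compare against that sentinel and take the fixup stub when
  // it shows up.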
4648 
4649   address slowpath_target;
4650   if (dst_bt == T_INT) {
4651     if (src_bt == T_FLOAT) {
4652       cvttss2sil(dst, src);
4653       cmpl(dst, 0x80000000);
4654       slowpath_target = StubRoutines::x86::f2i_fixup();
4655     } else {
4656       cvttsd2sil(dst, src);
4657       cmpl(dst, 0x80000000);
4658       slowpath_target = StubRoutines::x86::d2i_fixup();
4659     }
4660   } else {
4661     if (src_bt == T_FLOAT) {
4662       cvttss2siq(dst, src);
4663       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4664       slowpath_target = StubRoutines::x86::f2l_fixup();
4665     } else {
4666       cvttsd2siq(dst, src);
4667       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4668       slowpath_target = StubRoutines::x86::d2l_fixup();
4669     }
4670   }
4671 
4672   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4673   jcc(Assembler::equal, stub->entry());
4674   bind(stub->continuation());
4675 }
4676 
4677 #endif // _LP64
4678 
4679 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4680                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4681   switch(ideal_opc) {
4682     case Op_LShiftVS:
4683       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4684     case Op_LShiftVI:
4685       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4686     case Op_LShiftVL:
4687       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4688     case Op_RShiftVS:
4689       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4690     case Op_RShiftVI:
4691       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4692     case Op_RShiftVL:
4693       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4694     case Op_URShiftVS:
4695       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4696     case Op_URShiftVI:
4697       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4698     case Op_URShiftVL:
4699       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4700     case Op_RotateRightV:
4701       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4702     case Op_RotateLeftV:
4703       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4704     default:
4705       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4706       break;
4707   }
4708 }
4709 
4710 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4711                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4712   if (is_unsigned) {
4713     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4714   } else {
4715     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4716   }
4717 }
4718 
4719 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4720                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4721   switch (elem_bt) {
4722     case T_BYTE:
4723       if (ideal_opc == Op_SaturatingAddV) {
4724         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4725       } else {
4726         assert(ideal_opc == Op_SaturatingSubV, "");
4727         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4728       }
4729       break;
4730     case T_SHORT:
4731       if (ideal_opc == Op_SaturatingAddV) {
4732         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4733       } else {
4734         assert(ideal_opc == Op_SaturatingSubV, "");
4735         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4736       }
4737       break;
4738     default:
4739       fatal("Unsupported type %s", type2name(elem_bt));
4740       break;
4741   }
4742 }
4743 
4744 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4745                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4746   switch (elem_bt) {
4747     case T_BYTE:
4748       if (ideal_opc == Op_SaturatingAddV) {
4749         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4750       } else {
4751         assert(ideal_opc == Op_SaturatingSubV, "");
4752         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4753       }
4754       break;
4755     case T_SHORT:
4756       if (ideal_opc == Op_SaturatingAddV) {
4757         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4758       } else {
4759         assert(ideal_opc == Op_SaturatingSubV, "");
4760         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4761       }
4762       break;
4763     default:
4764       fatal("Unsupported type %s", type2name(elem_bt));
4765       break;
4766   }
4767 }
4768 
4769 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4770                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4771   if (is_unsigned) {
4772     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4773   } else {
4774     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4775   }
4776 }
4777 
4778 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4779                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4780   switch (elem_bt) {
4781     case T_BYTE:
4782       if (ideal_opc == Op_SaturatingAddV) {
4783         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4784       } else {
4785         assert(ideal_opc == Op_SaturatingSubV, "");
4786         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4787       }
4788       break;
4789     case T_SHORT:
4790       if (ideal_opc == Op_SaturatingAddV) {
4791         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4792       } else {
4793         assert(ideal_opc == Op_SaturatingSubV, "");
4794         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4795       }
4796       break;
4797     default:
4798       fatal("Unsupported type %s", type2name(elem_bt));
4799       break;
4800   }
4801 }
4802 
4803 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4804                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4805   switch (elem_bt) {
4806     case T_BYTE:
4807       if (ideal_opc == Op_SaturatingAddV) {
4808         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4809       } else {
4810         assert(ideal_opc == Op_SaturatingSubV, "");
4811         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4812       }
4813       break;
4814     case T_SHORT:
4815       if (ideal_opc == Op_SaturatingAddV) {
4816         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4817       } else {
4818         assert(ideal_opc == Op_SaturatingSubV, "");
4819         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4820       }
4821       break;
4822     default:
4823       fatal("Unsupported type %s", type2name(elem_bt));
4824       break;
4825   }
4826 }
4827 
4828 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4829                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4830                                     bool is_varshift) {
4831   switch (ideal_opc) {
4832     case Op_AddVB:
4833       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4834     case Op_AddVS:
4835       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4836     case Op_AddVI:
4837       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4838     case Op_AddVL:
4839       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_AddVF:
4841       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_AddVD:
4843       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_SubVB:
4845       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_SubVS:
4847       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_SubVI:
4849       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_SubVL:
4851       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_SubVF:
4853       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_SubVD:
4855       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_MulVS:
4857       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_MulVI:
4859       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_MulVL:
4861       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_MulVF:
4863       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_MulVD:
4865       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_DivVF:
4867       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_DivVD:
4869       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_SqrtVF:
4871       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_SqrtVD:
4873       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_AbsVB:
4875       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4876     case Op_AbsVS:
4877       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4878     case Op_AbsVI:
4879       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4880     case Op_AbsVL:
4881       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4882     case Op_FmaVF:
4883       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_FmaVD:
4885       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_VectorRearrange:
4887       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4888     case Op_LShiftVS:
4889       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4890     case Op_LShiftVI:
4891       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4892     case Op_LShiftVL:
4893       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4894     case Op_RShiftVS:
4895       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4896     case Op_RShiftVI:
4897       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4898     case Op_RShiftVL:
4899       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4900     case Op_URShiftVS:
4901       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4902     case Op_URShiftVI:
4903       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4904     case Op_URShiftVL:
4905       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4906     case Op_RotateLeftV:
4907       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4908     case Op_RotateRightV:
4909       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_MaxV:
4911       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_MinV:
4913       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_UMinV:
4915       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4916     case Op_UMaxV:
4917       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4918     case Op_XorV:
4919       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4920     case Op_OrV:
4921       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4922     case Op_AndV:
4923       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4924     default:
4925       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4926       break;
4927   }
4928 }
4929 
4930 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4931                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4932   switch (ideal_opc) {
4933     case Op_AddVB:
4934       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4935     case Op_AddVS:
4936       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4937     case Op_AddVI:
4938       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4939     case Op_AddVL:
4940       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4941     case Op_AddVF:
4942       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4943     case Op_AddVD:
4944       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4945     case Op_SubVB:
4946       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4947     case Op_SubVS:
4948       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4949     case Op_SubVI:
4950       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4951     case Op_SubVL:
4952       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4953     case Op_SubVF:
4954       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4955     case Op_SubVD:
4956       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4957     case Op_MulVS:
4958       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4959     case Op_MulVI:
4960       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4961     case Op_MulVL:
4962       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4963     case Op_MulVF:
4964       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4965     case Op_MulVD:
4966       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4967     case Op_DivVF:
4968       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4969     case Op_DivVD:
4970       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4971     case Op_FmaVF:
4972       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4973     case Op_FmaVD:
4974       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4975     case Op_MaxV:
4976       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4977     case Op_MinV:
4978       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4979     case Op_UMaxV:
4980       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4981     case Op_UMinV:
4982       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4983     case Op_XorV:
4984       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4985     case Op_OrV:
4986       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4987     case Op_AndV:
4988       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4989     default:
4990       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4991       break;
4992   }
4993 }
4994 
4995 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4996                                   KRegister src1, KRegister src2) {
4997   BasicType etype = T_ILLEGAL;
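  // The k-register logic instructions come in 8/16/32/64-bit widths; pick the
  // element type whose corresponding instruction width can hold mask_len bits.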
4998   switch(mask_len) {
4999     case 2:
5000     case 4:
5001     case 8:  etype = T_BYTE; break;
5002     case 16: etype = T_SHORT; break;
5003     case 32: etype = T_INT; break;
5004     case 64: etype = T_LONG; break;
5005     default: fatal("Unsupported type"); break;
5006   }
5007   assert(etype != T_ILLEGAL, "");
5008   switch(ideal_opc) {
5009     case Op_AndVMask:
5010       kand(etype, dst, src1, src2); break;
5011     case Op_OrVMask:
5012       kor(etype, dst, src1, src2); break;
5013     case Op_XorVMask:
5014       kxor(etype, dst, src1, src2); break;
5015     default:
5016       fatal("Unsupported masked operation"); break;
5017   }
5018 }
5019 
5020 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5022  * If src is NaN, the result is 0.
5023  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5024  * the result is equal to the value of Integer.MIN_VALUE.
5025  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5026  * the result is equal to the value of Integer.MAX_VALUE.
5027  */
5028 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5029                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5030                                                                    Register rscratch, AddressLiteral float_sign_flip,
5031                                                                    int vec_enc) {
5032   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5033   Label done;
5034   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5035   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5036   vptest(xtmp2, xtmp2, vec_enc);
5037   jccb(Assembler::equal, done);
5038 
5039   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5040   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5041 
5042   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5043   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5044   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5045 
  // Recompute the mask for the remaining special values.
5047   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5048   // Extract SRC values corresponding to TRUE mask lanes.
5049   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of the MASK lanes corresponding to +ve
  // special values is set.
5052   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5053 
5054   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5055   bind(done);
5056 }
5057 
5058 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5059                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5060                                                                     Register rscratch, AddressLiteral float_sign_flip,
5061                                                                     int vec_enc) {
5062   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5063   Label done;
5064   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5065   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5066   kortestwl(ktmp1, ktmp1);
5067   jccb(Assembler::equal, done);
5068 
5069   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5070   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5071   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5072 
5073   kxorwl(ktmp1, ktmp1, ktmp2);
5074   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5075   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5076   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5077   bind(done);
5078 }
5079 
5080 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5081                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5082                                                                      Register rscratch, AddressLiteral double_sign_flip,
5083                                                                      int vec_enc) {
5084   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5085 
5086   Label done;
5087   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5088   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5089   kortestwl(ktmp1, ktmp1);
5090   jccb(Assembler::equal, done);
5091 
5092   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5093   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5094   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5095 
5096   kxorwl(ktmp1, ktmp1, ktmp2);
5097   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5098   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5099   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5100   bind(done);
5101 }
5102 
5103 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5104                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5105                                                                      Register rscratch, AddressLiteral float_sign_flip,
5106                                                                      int vec_enc) {
5107   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5108   Label done;
5109   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5110   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5111   kortestwl(ktmp1, ktmp1);
5112   jccb(Assembler::equal, done);
5113 
5114   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5115   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5116   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5117 
5118   kxorwl(ktmp1, ktmp1, ktmp2);
5119   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5120   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5121   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5122   bind(done);
5123 }
5124 
5125 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5127  * If src is NaN, the result is 0.
5128  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5129  * the result is equal to the value of Long.MIN_VALUE.
5130  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5131  * the result is equal to the value of Long.MAX_VALUE.
5132  */
5133 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5134                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5135                                                                       Register rscratch, AddressLiteral double_sign_flip,
5136                                                                       int vec_enc) {
5137   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5138 
5139   Label done;
5140   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5141   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5142   kortestwl(ktmp1, ktmp1);
5143   jccb(Assembler::equal, done);
5144 
5145   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5146   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5147   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5148 
5149   kxorwl(ktmp1, ktmp1, ktmp2);
5150   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5151   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5152   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5153   bind(done);
5154 }
5155 
5156 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5157                                                              XMMRegister xtmp, int index, int vec_enc) {
5158    assert(vec_enc < Assembler::AVX_512bit, "");
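   // The shufps-style immediate selects two dwords from each source: e.g.
   // index 0x88 (0b10'00'10'00) picks dwords {0, 2} from the first source and
   // {0, 2} from the second, i.e. the low dword of each qword lane.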
5159    if (vec_enc == Assembler::AVX_256bit) {
5160      vextractf128_high(xtmp, src);
5161      vshufps(dst, src, xtmp, index, vec_enc);
5162    } else {
5163      vshufps(dst, src, zero, index, vec_enc);
5164    }
5165 }
5166 
5167 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5168                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5169                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5170   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5171 
5172   Label done;
5173   // Compare the destination lanes with float_sign_flip
5174   // value to get mask for all special values.
5175   movdqu(xtmp1, float_sign_flip, rscratch);
5176   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5177   ptest(xtmp2, xtmp2);
5178   jccb(Assembler::equal, done);
5179 
5180   // Flip float_sign_flip to get max integer value.
5181   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5182   pxor(xtmp1, xtmp4);
5183 
  // Set destination lanes corresponding to unordered source lanes to zero.
5185   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5186   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5187 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5189   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5190   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5191 
  // Recompute the mask for the remaining special values.
5193   pxor(xtmp2, xtmp3);
5194   // Extract mask corresponding to non-negative source lanes.
5195   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5196 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5198   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5199   pand(xtmp3, xtmp2);
5200 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5203   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5204   bind(done);
5205 }
5206 
5207 
5208 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5209                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5210   switch(to_elem_bt) {
5211     case T_SHORT:
5212       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5213       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5214       vpackusdw(dst, dst, zero, vec_enc);
5215       if (vec_enc == Assembler::AVX_256bit) {
5216         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5217       }
5218       break;
5219     case  T_BYTE:
5220       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5221       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5222       vpackusdw(dst, dst, zero, vec_enc);
5223       if (vec_enc == Assembler::AVX_256bit) {
5224         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5225       }
5226       vpackuswb(dst, dst, zero, vec_enc);
5227       break;
5228     default: assert(false, "%s", type2name(to_elem_bt));
5229   }
5230 }
5231 
5232 /*
5233  * Algorithm for vector D2L and F2I conversions:-
5234  * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
 *    It signifies that the source value could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5240  */
5241 
5242 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5243                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5244                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5245   int to_elem_sz = type2aelembytes(to_elem_bt);
5246   assert(to_elem_sz <= 4, "");
5247   vcvttps2dq(dst, src, vec_enc);
5248   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5249   if (to_elem_sz < 4) {
5250     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5251     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5252   }
5253 }
5254 
5255 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5256                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5257                                             Register rscratch, int vec_enc) {
5258   int to_elem_sz = type2aelembytes(to_elem_bt);
5259   assert(to_elem_sz <= 4, "");
5260   vcvttps2dq(dst, src, vec_enc);
5261   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5262   switch(to_elem_bt) {
5263     case T_INT:
5264       break;
5265     case T_SHORT:
5266       evpmovdw(dst, dst, vec_enc);
5267       break;
5268     case T_BYTE:
5269       evpmovdb(dst, dst, vec_enc);
5270       break;
5271     default: assert(false, "%s", type2name(to_elem_bt));
5272   }
5273 }
5274 
5275 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5276                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5277                                             Register rscratch, int vec_enc) {
5278   evcvttps2qq(dst, src, vec_enc);
5279   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5280 }
5281 
5282 // Handling for downcasting from double to integer or sub-word types on AVX2.
5283 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5284                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5285                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5286   int to_elem_sz = type2aelembytes(to_elem_bt);
5287   assert(to_elem_sz < 8, "");
5288   vcvttpd2dq(dst, src, vec_enc);
5289   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5290                                               float_sign_flip, vec_enc);
5291   if (to_elem_sz < 4) {
5292     // xtmp4 holds all zero lanes.
5293     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5294   }
5295 }
5296 
5297 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5298                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5299                                             KRegister ktmp2, AddressLiteral sign_flip,
5300                                             Register rscratch, int vec_enc) {
5301   if (VM_Version::supports_avx512dq()) {
5302     evcvttpd2qq(dst, src, vec_enc);
5303     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5304     switch(to_elem_bt) {
5305       case T_LONG:
5306         break;
5307       case T_INT:
5308         evpmovsqd(dst, dst, vec_enc);
5309         break;
5310       case T_SHORT:
5311         evpmovsqd(dst, dst, vec_enc);
5312         evpmovdw(dst, dst, vec_enc);
5313         break;
5314       case T_BYTE:
5315         evpmovsqd(dst, dst, vec_enc);
5316         evpmovdb(dst, dst, vec_enc);
5317         break;
5318       default: assert(false, "%s", type2name(to_elem_bt));
5319     }
5320   } else {
5321     assert(type2aelembytes(to_elem_bt) <= 4, "");
5322     vcvttpd2dq(dst, src, vec_enc);
5323     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5324     switch(to_elem_bt) {
5325       case T_INT:
5326         break;
5327       case T_SHORT:
5328         evpmovdw(dst, dst, vec_enc);
5329         break;
5330       case T_BYTE:
5331         evpmovdb(dst, dst, vec_enc);
5332         break;
5333       default: assert(false, "%s", type2name(to_elem_bt));
5334     }
5335   }
5336 }
5337 
5338 #ifdef _LP64
5339 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5340                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5341                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round
  // towards -inf, and restore the original MXCSR.RC mode afterwards.
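  // For example: 2.3 + 0.5 = 2.8 floors to 2, 2.5 + 0.5 = 3.0 floors to 3, and
  // -2.5 + 0.5 = -2.0 floors to -2, i.e. ties round towards positive infinity
  // as Math.round-style rounding requires.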
5344   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5345 
5346   mov64(tmp, julong_cast(0.5L));
5347   evpbroadcastq(xtmp1, tmp, vec_enc);
5348   vaddpd(xtmp1, src , xtmp1, vec_enc);
5349   evcvtpd2qq(dst, xtmp1, vec_enc);
  vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5352 
5353   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5354 }
5355 
5356 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5357                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5358                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round
  // towards -inf, and restore the original MXCSR.RC mode afterwards.
5361   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5362 
5363   movl(tmp, jint_cast(0.5));
5364   movq(xtmp1, tmp);
5365   vbroadcastss(xtmp1, xtmp1, vec_enc);
5366   vaddps(xtmp1, src , xtmp1, vec_enc);
5367   vcvtps2dq(dst, xtmp1, vec_enc);
5368   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5369                                               float_sign_flip, vec_enc);
5370 
5371   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5372 }
5373 
5374 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5375                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5376                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round
  // towards -inf, and restore the original MXCSR.RC mode afterwards.
5379   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5380 
5381   movl(tmp, jint_cast(0.5));
5382   movq(xtmp1, tmp);
5383   vbroadcastss(xtmp1, xtmp1, vec_enc);
5384   vaddps(xtmp1, src , xtmp1, vec_enc);
5385   vcvtps2dq(dst, xtmp1, vec_enc);
5386   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5387 
5388   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5389 }
5390 #endif // _LP64
5391 
5392 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5393                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5394   switch (from_elem_bt) {
5395     case T_BYTE:
5396       switch (to_elem_bt) {
5397         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5398         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5399         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5400         default: ShouldNotReachHere();
5401       }
5402       break;
5403     case T_SHORT:
5404       switch (to_elem_bt) {
5405         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5406         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5407         default: ShouldNotReachHere();
5408       }
5409       break;
5410     case T_INT:
5411       assert(to_elem_bt == T_LONG, "");
5412       vpmovzxdq(dst, src, vlen_enc);
5413       break;
5414     default:
5415       ShouldNotReachHere();
5416   }
5417 }
5418 
5419 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5420                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5421   switch (from_elem_bt) {
5422     case T_BYTE:
5423       switch (to_elem_bt) {
5424         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5425         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5426         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5427         default: ShouldNotReachHere();
5428       }
5429       break;
5430     case T_SHORT:
5431       switch (to_elem_bt) {
5432         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5433         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5434         default: ShouldNotReachHere();
5435       }
5436       break;
5437     case T_INT:
5438       assert(to_elem_bt == T_LONG, "");
5439       vpmovsxdq(dst, src, vlen_enc);
5440       break;
5441     default:
5442       ShouldNotReachHere();
5443   }
5444 }
5445 
5446 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5447                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5448   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5449   assert(vlen_enc != AVX_512bit, "");
5450 
5451   int dst_bt_size = type2aelembytes(dst_bt);
5452   int src_bt_size = type2aelembytes(src_bt);
5453   if (dst_bt_size > src_bt_size) {
5454     switch (dst_bt_size / src_bt_size) {
5455       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5456       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5457       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5458       default: ShouldNotReachHere();
5459     }
5460   } else {
5461     assert(dst_bt_size < src_bt_size, "");
5462     switch (src_bt_size / dst_bt_size) {
5463       case 2: {
5464         if (vlen_enc == AVX_128bit) {
5465           vpacksswb(dst, src, src, vlen_enc);
5466         } else {
5467           vpacksswb(dst, src, src, vlen_enc);
5468           vpermq(dst, dst, 0x08, vlen_enc);
5469         }
5470         break;
5471       }
5472       case 4: {
5473         if (vlen_enc == AVX_128bit) {
5474           vpackssdw(dst, src, src, vlen_enc);
5475           vpacksswb(dst, dst, dst, vlen_enc);
5476         } else {
5477           vpackssdw(dst, src, src, vlen_enc);
5478           vpermq(dst, dst, 0x08, vlen_enc);
5479           vpacksswb(dst, dst, dst, AVX_128bit);
5480         }
5481         break;
5482       }
5483       case 8: {
5484         if (vlen_enc == AVX_128bit) {
5485           vpshufd(dst, src, 0x08, vlen_enc);
5486           vpackssdw(dst, dst, dst, vlen_enc);
5487           vpacksswb(dst, dst, dst, vlen_enc);
5488         } else {
5489           vpshufd(dst, src, 0x08, vlen_enc);
5490           vpermq(dst, dst, 0x08, vlen_enc);
5491           vpackssdw(dst, dst, dst, AVX_128bit);
5492           vpacksswb(dst, dst, dst, AVX_128bit);
5493         }
5494         break;
5495       }
5496       default: ShouldNotReachHere();
5497     }
5498   }
5499 }
5500 
5501 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5502                                    bool merge, BasicType bt, int vlen_enc) {
5503   if (bt == T_INT) {
5504     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5505   } else {
5506     assert(bt == T_LONG, "");
5507     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5508   }
5509 }
5510 
5511 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5512                                    bool merge, BasicType bt, int vlen_enc) {
5513   if (bt == T_INT) {
5514     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5515   } else {
5516     assert(bt == T_LONG, "");
5517     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5518   }
5519 }
5520 
5521 #ifdef _LP64
5522 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5523                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5524                                                int vec_enc) {
5525   int index = 0;
5526   int vindex = 0;
5527   mov64(rtmp1, 0x0101010101010101L);
5528   pdepq(rtmp1, src, rtmp1);
5529   if (mask_len > 8) {
5530     movq(rtmp2, src);
5531     vpxor(xtmp, xtmp, xtmp, vec_enc);
5532     movq(xtmp, rtmp1);
5533   }
5534   movq(dst, rtmp1);
5535 
5536   mask_len -= 8;
5537   while (mask_len > 0) {
5538     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5539     index++;
5540     if ((index % 2) == 0) {
5541       pxor(xtmp, xtmp);
5542     }
5543     mov64(rtmp1, 0x0101010101010101L);
5544     shrq(rtmp2, 8);
5545     pdepq(rtmp1, rtmp2, rtmp1);
5546     pinsrq(xtmp, rtmp1, index % 2);
5547     vindex = index / 2;
5548     if (vindex) {
      // Write the entire 16 byte vector only when both 64 bit
      // lanes have been updated, to save redundant instructions.
5551       if (index % 2) {
5552         vinsertf128(dst, dst, xtmp, vindex);
5553       }
5554     } else {
5555       vmovdqu(dst, xtmp);
5556     }
5557     mask_len -= 8;
5558   }
5559 }
5560 
5561 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5562   switch(opc) {
5563     case Op_VectorMaskTrueCount:
5564       popcntq(dst, tmp);
5565       break;
5566     case Op_VectorMaskLastTrue:
5567       if (VM_Version::supports_lzcnt()) {
5568         lzcntq(tmp, tmp);
5569         movl(dst, 63);
5570         subl(dst, tmp);
5571       } else {
5572         movl(dst, -1);
5573         bsrq(tmp, tmp);
5574         cmov32(Assembler::notZero, dst, tmp);
5575       }
5576       break;
5577     case Op_VectorMaskFirstTrue:
5578       if (VM_Version::supports_bmi1()) {
5579         if (masklen < 32) {
5580           orl(tmp, 1 << masklen);
5581           tzcntl(dst, tmp);
5582         } else if (masklen == 32) {
5583           tzcntl(dst, tmp);
5584         } else {
5585           assert(masklen == 64, "");
5586           tzcntq(dst, tmp);
5587         }
5588       } else {
5589         if (masklen < 32) {
5590           orl(tmp, 1 << masklen);
5591           bsfl(dst, tmp);
5592         } else {
5593           assert(masklen == 32 || masklen == 64, "");
5594           movl(dst, masklen);
5595           if (masklen == 32)  {
5596             bsfl(tmp, tmp);
5597           } else {
5598             bsfq(tmp, tmp);
5599           }
5600           cmov32(Assembler::notZero, dst, tmp);
5601         }
5602       }
5603       break;
5604     case Op_VectorMaskToLong:
5605       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5606       break;
5607     default: assert(false, "Unhandled mask operation");
5608   }
5609 }
5610 
5611 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5612                                               int masklen, int masksize, int vec_enc) {
5613   assert(VM_Version::supports_popcnt(), "");
5614 
  if (VM_Version::supports_avx512bw()) {
5616     kmovql(tmp, mask);
5617   } else {
5618     assert(masklen <= 16, "");
5619     kmovwl(tmp, mask);
5620   }
5621 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5624   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5625     andq(tmp, (1 << masklen) - 1);
5626   }
5627 
5628   vector_mask_operation_helper(opc, dst, tmp, masklen);
5629 }
5630 
5631 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5632                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5633   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5634          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5635   assert(VM_Version::supports_popcnt(), "");
5636 
5637   bool need_clip = false;
5638   switch(bt) {
5639     case T_BOOLEAN:
      // Masks of other types contain lane values of 0 / -1, while boolean masks contain 0 / 1.
5641       vpxor(xtmp, xtmp, xtmp, vec_enc);
5642       vpsubb(xtmp, xtmp, mask, vec_enc);
5643       vpmovmskb(tmp, xtmp, vec_enc);
5644       need_clip = masklen < 16;
5645       break;
5646     case T_BYTE:
5647       vpmovmskb(tmp, mask, vec_enc);
5648       need_clip = masklen < 16;
5649       break;
5650     case T_SHORT:
5651       vpacksswb(xtmp, mask, mask, vec_enc);
5652       if (masklen >= 16) {
5653         vpermpd(xtmp, xtmp, 8, vec_enc);
5654       }
5655       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5656       need_clip = masklen < 16;
5657       break;
5658     case T_INT:
5659     case T_FLOAT:
5660       vmovmskps(tmp, mask, vec_enc);
5661       need_clip = masklen < 4;
5662       break;
5663     case T_LONG:
5664     case T_DOUBLE:
5665       vmovmskpd(tmp, mask, vec_enc);
5666       need_clip = masklen < 2;
5667       break;
5668     default: assert(false, "Unhandled type, %s", type2name(bt));
5669   }
5670 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5673   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5674     // need_clip implies masklen < 32
5675     andq(tmp, (1 << masklen) - 1);
5676   }
5677 
5678   vector_mask_operation_helper(opc, dst, tmp, masklen);
5679 }
5680 
5681 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5682                                              Register rtmp2, int mask_len) {
5683   kmov(rtmp1, src);
5684   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5685   mov64(rtmp2, -1L);
5686   pextq(rtmp2, rtmp2, rtmp1);
5687   kmov(dst, rtmp2);
5688 }
5689 
5690 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5691                                                     XMMRegister mask, Register rtmp, Register rscratch,
5692                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5693                                                     int vec_enc) {
5694   assert(type2aelembytes(bt) >= 4, "");
5695   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5696   address compress_perm_table = nullptr;
5697   address expand_perm_table = nullptr;
5698   if (type2aelembytes(bt) == 8) {
5699     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5700     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5701     vmovmskpd(rtmp, mask, vec_enc);
5702   } else {
5703     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5704     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5705     vmovmskps(rtmp, mask, vec_enc);
5706   }
5707   shlq(rtmp, 5); // for 32 byte permute row.
5708   if (opcode == Op_CompressV) {
5709     lea(rscratch, ExternalAddress(compress_perm_table));
5710   } else {
5711     lea(rscratch, ExternalAddress(expand_perm_table));
5712   }
5713   addptr(rtmp, rscratch);
5714   vmovdqu(permv, Address(rtmp));
5715   vpermps(dst, permv, src, Assembler::AVX_256bit);
5716   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default
  // value), so the same row can double as a blending mask after the source vector
  // lanes have been compressed/expanded.
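  //
  // Illustrative scalar semantics of CompressV for reference (sketch only; the
  // vector code realizes this with one table-driven permute plus the blend below):
  //   static void compress_ref(const int* src, const bool* mask, int* dst, int n) {
  //     int k = 0;
  //     for (int i = 0; i < n; i++) {
  //       if (mask[i]) dst[k++] = src[i];      // pack selected lanes to the low end
  //     }
  //     for (; k < n; k++) dst[k] = 0;         // remaining lanes are zeroed
  //   }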
5721   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5722 }
5723 
5724 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5725                                                bool merge, BasicType bt, int vec_enc) {
5726   if (opcode == Op_CompressV) {
5727     switch(bt) {
5728     case T_BYTE:
5729       evpcompressb(dst, mask, src, merge, vec_enc);
5730       break;
5731     case T_CHAR:
5732     case T_SHORT:
5733       evpcompressw(dst, mask, src, merge, vec_enc);
5734       break;
5735     case T_INT:
5736       evpcompressd(dst, mask, src, merge, vec_enc);
5737       break;
5738     case T_FLOAT:
5739       evcompressps(dst, mask, src, merge, vec_enc);
5740       break;
5741     case T_LONG:
5742       evpcompressq(dst, mask, src, merge, vec_enc);
5743       break;
5744     case T_DOUBLE:
5745       evcompresspd(dst, mask, src, merge, vec_enc);
5746       break;
5747     default:
5748       fatal("Unsupported type %s", type2name(bt));
5749       break;
5750     }
5751   } else {
5752     assert(opcode == Op_ExpandV, "");
5753     switch(bt) {
5754     case T_BYTE:
5755       evpexpandb(dst, mask, src, merge, vec_enc);
5756       break;
5757     case T_CHAR:
5758     case T_SHORT:
5759       evpexpandw(dst, mask, src, merge, vec_enc);
5760       break;
5761     case T_INT:
5762       evpexpandd(dst, mask, src, merge, vec_enc);
5763       break;
5764     case T_FLOAT:
5765       evexpandps(dst, mask, src, merge, vec_enc);
5766       break;
5767     case T_LONG:
5768       evpexpandq(dst, mask, src, merge, vec_enc);
5769       break;
5770     case T_DOUBLE:
5771       evexpandpd(dst, mask, src, merge, vec_enc);
5772       break;
5773     default:
5774       fatal("Unsupported type %s", type2name(bt));
5775       break;
5776     }
5777   }
5778 }
5779 #endif
5780 
5781 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5782                                            KRegister ktmp1, int vec_enc) {
5783   if (opcode == Op_SignumVD) {
5784     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5786     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5787     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5789     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5790     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5791   } else {
5792     assert(opcode == Op_SignumVF, "");
5793     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5795     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5796     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5798     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5799     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5800   }
5801 }
5802 
5803 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5804                                           XMMRegister xtmp1, int vec_enc) {
5805   if (opcode == Op_SignumVD) {
5806     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5808     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5810     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5811     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5812   } else {
5813     assert(opcode == Op_SignumVF, "");
5814     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5816     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5818     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5819     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5820   }
5821 }
5822 
5823 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5824   if (VM_Version::supports_avx512bw()) {
5825     if (mask_len > 32) {
5826       kmovql(dst, src);
5827     } else {
5828       kmovdl(dst, src);
5829       if (mask_len != 32) {
5830         kshiftrdl(dst, dst, 32 - mask_len);
5831       }
5832     }
5833   } else {
5834     assert(mask_len <= 16, "");
5835     kmovwl(dst, src);
5836     if (mask_len != 16) {
5837       kshiftrwl(dst, dst, 16 - mask_len);
5838     }
5839   }
5840 }
5841 
5842 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5843   int lane_size = type2aelembytes(bt);
5844   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5845   if ((is_LP64 || lane_size < 8) &&
5846       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5847        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5848     movptr(rtmp, imm32);
5849     switch(lane_size) {
5850       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5851       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5852       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5853       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5854       fatal("Unsupported lane size %d", lane_size);
5855       break;
5856     }
5857   } else {
5858     movptr(rtmp, imm32);
5859     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5860     switch(lane_size) {
5861       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5862       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5863       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5864       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5865       fatal("Unsupported lane size %d", lane_size);
5866       break;
5867     }
5868   }
5869 }
5870 
5871 //
// Following is a lookup table based popcount computation algorithm:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table accesses.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table accesses.
//  d. Add the bit set counts of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences of the
//     bit set counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128 bit vector lane.
//  g. Pack the bit set counts of the quadwords back to double words.
//  h. The unpacking and packing operations are not needed for 64 bit vector lanes.
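//
//  Illustrative scalar form of steps a. to d. (sketch only, not emitted code;
//  the lut entries match the table above):
//    static inline int popcount_byte_ref(uint8_t b) {
//      static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//      return lut[b & 0x0F] + lut[b >> 4];
//    }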
5900 
5901 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5902                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5903   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5904   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5905   vpsrlw(dst, src, 4, vec_enc);
5906   vpand(dst, dst, xtmp1, vec_enc);
5907   vpand(xtmp1, src, xtmp1, vec_enc);
5908   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5909   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5910   vpshufb(dst, xtmp2, dst, vec_enc);
5911   vpaddb(dst, dst, xtmp1, vec_enc);
5912 }
5913 
5914 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5915                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5916   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5917   // Following code is as per steps e,f,g and h of above algorithm.
5918   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5919   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5920   vpsadbw(dst, dst, xtmp2, vec_enc);
5921   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5922   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5923   vpackuswb(dst, xtmp1, dst, vec_enc);
5924 }
5925 
5926 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5927                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5928   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5929   // Add the popcount of upper and lower bytes of word.
5930   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5931   vpsrlw(dst, xtmp1, 8, vec_enc);
5932   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5933   vpaddw(dst, dst, xtmp1, vec_enc);
5934 }
5935 
5936 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5937                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5938   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5939   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5940   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5941 }
5942 
5943 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5944                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5945   switch(bt) {
5946     case T_LONG:
5947       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5948       break;
5949     case T_INT:
5950       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5951       break;
5952     case T_CHAR:
5953     case T_SHORT:
5954       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5955       break;
5956     case T_BYTE:
5957     case T_BOOLEAN:
5958       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5959       break;
5960     default:
5961       fatal("Unsupported type %s", type2name(bt));
5962       break;
5963   }
5964 }
5965 
5966 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5967                                                       KRegister mask, bool merge, int vec_enc) {
5968   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5969   switch(bt) {
5970     case T_LONG:
5971       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5972       evpopcntq(dst, mask, src, merge, vec_enc);
5973       break;
5974     case T_INT:
5975       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5976       evpopcntd(dst, mask, src, merge, vec_enc);
5977       break;
5978     case T_CHAR:
5979     case T_SHORT:
5980       assert(VM_Version::supports_avx512_bitalg(), "");
5981       evpopcntw(dst, mask, src, merge, vec_enc);
5982       break;
5983     case T_BYTE:
5984     case T_BOOLEAN:
5985       assert(VM_Version::supports_avx512_bitalg(), "");
5986       evpopcntb(dst, mask, src, merge, vec_enc);
5987       break;
5988     default:
5989       fatal("Unsupported type %s", type2name(bt));
5990       break;
5991   }
5992 }
5993 
5994 #ifndef _LP64
5995 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5996   assert(VM_Version::supports_avx512bw(), "");
5997   kmovdl(tmp, src);
5998   kunpckdql(dst, tmp, tmp);
5999 }
6000 #endif
6001 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value. Thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibbles of the byte.
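//
// Illustrative scalar form of the per-byte reversal (sketch only, not emitted
// code; lut[i] is the 4 bit value i with its bits reversed, assumed to match
// the vector_reverse_bit_lut stub):
//   static inline uint8_t reverse_byte_ref(uint8_t b) {
//     static const uint8_t lut[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                     0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//     return (uint8_t)((lut[b & 0x0F] << 4) | lut[b >> 4]);
//   }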
6008 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6009                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6010   if (VM_Version::supports_avx512vlbw()) {
6011 
6012     // Get the reverse bit sequence of lower nibble of each byte.
6013     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6014     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6015     evpandq(dst, xtmp2, src, vec_enc);
6016     vpshufb(dst, xtmp1, dst, vec_enc);
6017     vpsllq(dst, dst, 4, vec_enc);
6018 
6019     // Get the reverse bit sequence of upper nibble of each byte.
6020     vpandn(xtmp2, xtmp2, src, vec_enc);
6021     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6022     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6023 
    // Perform a logical OR between the left shifted reversed bit sequence of the lower nibble and the
    // right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
6026     evporq(xtmp2, dst, xtmp2, vec_enc);
6027     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6028 
  } else if (vec_enc == Assembler::AVX_512bit) {
6030     // Shift based bit reversal.
6031     assert(bt == T_LONG || bt == T_INT, "");
6032 
6033     // Swap lower and upper nibble of each byte.
6034     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6035 
6036     // Swap two least and most significant bits of each nibble.
6037     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6038 
6039     // Swap adjacent pair of bits.
6040     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6041     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6042 
6043     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6044     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6045   } else {
6046     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6047     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6048 
6049     // Get the reverse bit sequence of lower nibble of each byte.
6050     vpand(dst, xtmp2, src, vec_enc);
6051     vpshufb(dst, xtmp1, dst, vec_enc);
6052     vpsllq(dst, dst, 4, vec_enc);
6053 
6054     // Get the reverse bit sequence of upper nibble of each byte.
6055     vpandn(xtmp2, xtmp2, src, vec_enc);
6056     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6057     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6058 
    // Perform a logical OR between the left shifted reversed bit sequence of the lower nibble and the
    // right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
6061     vpor(xtmp2, dst, xtmp2, vec_enc);
6062     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6063   }
6064 }
6065 
6066 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6067                                                 XMMRegister xtmp, Register rscratch) {
6068   assert(VM_Version::supports_gfni(), "");
6069   assert(rscratch != noreg || always_reachable(mask), "missing");
6070 
  // Galois field instruction based bit reversal, as per the following algorithm:
6072   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6073   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6074   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6075   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6076 }
6077 
6078 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6079                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6080   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6081   evpandq(dst, xtmp1, src, vec_enc);
6082   vpsllq(dst, dst, nbits, vec_enc);
6083   vpandn(xtmp1, xtmp1, src, vec_enc);
6084   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6085   evporq(dst, dst, xtmp1, vec_enc);
6086 }
6087 
6088 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6089                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6090   // Shift based bit reversal.
6091   assert(VM_Version::supports_evex(), "");
6092   switch(bt) {
6093     case T_LONG:
6094       // Swap upper and lower double word of each quad word.
6095       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6096       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6097       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6098       break;
6099     case T_INT:
6100       // Swap upper and lower word of each double word.
6101       evprord(xtmp1, k0, src, 16, true, vec_enc);
6102       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6103       break;
6104     case T_CHAR:
6105     case T_SHORT:
6106       // Swap upper and lower byte of each word.
6107       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6108       break;
6109     case T_BYTE:
6110       evmovdquq(dst, k0, src, true, vec_enc);
6111       break;
6112     default:
6113       fatal("Unsupported type %s", type2name(bt));
6114       break;
6115   }
6116 }
6117 
6118 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6119   if (bt == T_BYTE) {
6120     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6121       evmovdquq(dst, k0, src, true, vec_enc);
6122     } else {
6123       vmovdqu(dst, src);
6124     }
6125     return;
6126   }
6127   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6128   // pre-computed shuffle indices.
6129   switch(bt) {
6130     case T_LONG:
6131       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6132       break;
6133     case T_INT:
6134       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6135       break;
6136     case T_CHAR:
6137     case T_SHORT:
6138       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6139       break;
6140     default:
6141       fatal("Unsupported type %s", type2name(bt));
6142       break;
6143   }
6144   vpshufb(dst, src, dst, vec_enc);
6145 }
6146 
6147 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6148                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6149                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6150   assert(is_integral_type(bt), "");
6151   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6152   assert(VM_Version::supports_avx512cd(), "");
6153   switch(bt) {
6154     case T_LONG:
6155       evplzcntq(dst, ktmp, src, merge, vec_enc);
6156       break;
6157     case T_INT:
6158       evplzcntd(dst, ktmp, src, merge, vec_enc);
6159       break;
6160     case T_SHORT:
6161       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6162       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6163       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6164       vpunpckhwd(dst, xtmp1, src, vec_enc);
6165       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6166       vpackusdw(dst, xtmp2, dst, vec_enc);
6167       break;
6168     case T_BYTE:
6169       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6170       // accessing the lookup table.
6171       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6172       // accessing the lookup table.
6173       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
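      //
      // Illustrative scalar form (sketch only; assumes the lut maps a nibble to its
      // leading zero count within 4 bits, so lut[0] == 4):
      //   static inline int clz_byte_ref(uint8_t b, const uint8_t* lut) {
      //     int t2 = lut[b >> 4];                        // high nibble
      //     int t1 = lut[b & 0x0F];                      // low nibble
      //     return ((b >> 4) == 0) ? t2 + t1 : t2;       // add T1 only when the high nibble is zero
      //   }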
6174       assert(VM_Version::supports_avx512bw(), "");
6175       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6176       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6177       vpand(xtmp2, dst, src, vec_enc);
6178       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6179       vpsrlw(xtmp3, src, 4, vec_enc);
6180       vpand(xtmp3, dst, xtmp3, vec_enc);
6181       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6182       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6183       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6184       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6185       break;
6186     default:
6187       fatal("Unsupported type %s", type2name(bt));
6188       break;
6189   }
6190 }
6191 
6192 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6193                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6194   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6195   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6196   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6197   // accessing the lookup table.
6198   vpand(dst, xtmp2, src, vec_enc);
6199   vpshufb(dst, xtmp1, dst, vec_enc);
6200   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6201   // accessing the lookup table.
6202   vpsrlw(xtmp3, src, 4, vec_enc);
6203   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6204   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6205   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6206   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6207   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6208   vpaddb(dst, dst, xtmp2, vec_enc);
6209   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6210 }
6211 
6212 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6213                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6214   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6215   // Add zero counts of lower byte and upper byte of a word if
6216   // upper byte holds a zero value.
6217   vpsrlw(xtmp3, src, 8, vec_enc);
6218   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6219   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6220   vpsllw(xtmp2, dst, 8, vec_enc);
6221   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6222   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6223   vpsrlw(dst, dst, 8, vec_enc);
6224 }
6225 
6226 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6227                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count:
  //   LZCNT = 31 - (biased_exp - 127)
  // which the code below materializes as 32 - ((biased_exp - 127) + 1).
  // Special handling has been introduced for zero, max_int and -ve source values.
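  //
  // Illustrative scalar form (sketch only; assumes 0 < x < (1 << 24) so that the
  // int -> float conversion is exact, plus <cstdint>/<cstring>; the extra blends
  // below handle zero, negative and max_int lanes):
  //   static inline int lzcnt_small_int_ref(int32_t x) {
  //     float f = (float) x;
  //     uint32_t bits;
  //     memcpy(&bits, &f, sizeof(bits));
  //     int biased_exp = (int)(bits >> 23) & 0xFF;   // IEEE 754 binary32 exponent field
  //     return 31 - (biased_exp - 127);
  //   }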
6233 
6234   // Broadcast 0xFF
6235   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6236   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6237 
6238   // Extract biased exponent.
6239   vcvtdq2ps(dst, src, vec_enc);
6240   vpsrld(dst, dst, 23, vec_enc);
6241   vpand(dst, dst, xtmp1, vec_enc);
6242 
6243   // Broadcast 127.
6244   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6245   // Exponent = biased_exp - 127
6246   vpsubd(dst, dst, xtmp1, vec_enc);
6247 
6248   // Exponent = Exponent  + 1
6249   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6250   vpaddd(dst, dst, xtmp3, vec_enc);
6251 
6252   // Replace -ve exponent with zero, exponent is -ve when src
6253   // lane contains a zero value.
6254   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6255   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6256 
6257   // Rematerialize broadcast 32.
6258   vpslld(xtmp1, xtmp3, 5, vec_enc);
6259   // Exponent is 32 if corresponding source lane contains max_int value.
6260   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6261   // LZCNT = 32 - exponent
6262   vpsubd(dst, xtmp1, dst, vec_enc);
6263 
6264   // Replace LZCNT with a value 1 if corresponding source lane
6265   // contains max_int value.
6266   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6267 
  // Replace the count with 0 if the source lane value is less than zero (MSB set => LZCNT = 0).
6269   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6270   vblendvps(dst, dst, xtmp2, src, vec_enc);
6271 }
6272 
6273 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6274                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6275   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6276   // Add zero counts of lower word and upper word of a double word if
6277   // upper word holds a zero value.
6278   vpsrld(xtmp3, src, 16, vec_enc);
6279   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6280   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6281   vpslld(xtmp2, dst, 16, vec_enc);
6282   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6283   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6284   vpsrld(dst, dst, 16, vec_enc);
6285   // Add zero counts of lower doubleword and upper doubleword of a
6286   // quadword if upper doubleword holds a zero value.
6287   vpsrlq(xtmp3, src, 32, vec_enc);
6288   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6289   vpsllq(xtmp2, dst, 32, vec_enc);
6290   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6291   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6292   vpsrlq(dst, dst, 32, vec_enc);
6293 }
6294 
6295 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6296                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6297                                                        Register rtmp, int vec_enc) {
6298   assert(is_integral_type(bt), "unexpected type");
6299   assert(vec_enc < Assembler::AVX_512bit, "");
6300   switch(bt) {
6301     case T_LONG:
6302       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6303       break;
6304     case T_INT:
6305       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6306       break;
6307     case T_SHORT:
6308       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6309       break;
6310     case T_BYTE:
6311       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6312       break;
6313     default:
6314       fatal("Unsupported type %s", type2name(bt));
6315       break;
6316   }
6317 }
6318 
6319 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6320   switch(bt) {
6321     case T_BYTE:
6322       vpsubb(dst, src1, src2, vec_enc);
6323       break;
6324     case T_SHORT:
6325       vpsubw(dst, src1, src2, vec_enc);
6326       break;
6327     case T_INT:
6328       vpsubd(dst, src1, src2, vec_enc);
6329       break;
6330     case T_LONG:
6331       vpsubq(dst, src1, src2, vec_enc);
6332       break;
6333     default:
6334       fatal("Unsupported type %s", type2name(bt));
6335       break;
6336   }
6337 }
6338 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
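//
// Illustrative scalar form (sketch only; clz32 stands for any 32 bit leading
// zero count that returns 32 for a zero input):
//   static inline int ctz32_ref(uint32_t x) {
//     uint32_t below = (x - 1) & ~x;   // bits strictly below the lowest set bit; all ones for x == 0
//     return 32 - clz32(below);
//   }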
6343 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6344                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6345                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6346   assert(is_integral_type(bt), "");
6347   // xtmp = -1
6348   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6349   // xtmp = xtmp + src
6350   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6351   // xtmp = xtmp & ~src
6352   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6353   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6354   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6355   vpsub(bt, dst, xtmp4, dst, vec_enc);
6356 }
6357 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
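//
// Illustrative scalar form (sketch only; popcount32 stands for any 32 bit
// population count):
//   static inline int ctz32_popc_ref(uint32_t x) {
//     // x | -x keeps the lowest set bit of x and every bit above it, so its
//     // population count is 32 - CTZ(x); for x == 0 it is 0, giving CTZ == 32.
//     return 32 - popcount32(x | (0u - x));
//   }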
6360 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6361                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6362   assert(is_integral_type(bt), "");
6363   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6365   // xtmp = 0 - src
6366   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6367   // xtmp = xtmp | src
6368   vpor(xtmp3, xtmp3, src, vec_enc);
6369   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6370   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6371   vpsub(bt, dst, xtmp1, dst, vec_enc);
6372 }
6373 
6374 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6375   Label done;
6376   Label neg_divisor_fastpath;
6377   cmpl(divisor, 0);
6378   jccb(Assembler::less, neg_divisor_fastpath);
6379   xorl(rdx, rdx);
6380   divl(divisor);
6381   jmpb(done);
6382   bind(neg_divisor_fastpath);
6383   // Fastpath for divisor < 0:
6384   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6385   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6386   movl(rdx, rax);
6387   subl(rdx, divisor);
6388   if (VM_Version::supports_bmi1()) {
6389     andnl(rax, rdx, rax);
6390   } else {
6391     notl(rdx);
6392     andl(rax, rdx);
6393   }
6394   shrl(rax, 31);
6395   bind(done);
6396 }
6397 
6398 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6399   Label done;
6400   Label neg_divisor_fastpath;
6401   cmpl(divisor, 0);
6402   jccb(Assembler::less, neg_divisor_fastpath);
6403   xorl(rdx, rdx);
6404   divl(divisor);
6405   jmpb(done);
6406   bind(neg_divisor_fastpath);
6407   // Fastpath when divisor < 0:
6408   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6409   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6410   movl(rdx, rax);
6411   subl(rax, divisor);
6412   if (VM_Version::supports_bmi1()) {
6413     andnl(rax, rax, rdx);
6414   } else {
6415     notl(rax);
6416     andl(rax, rdx);
6417   }
6418   sarl(rax, 31);
6419   andl(rax, divisor);
6420   subl(rdx, rax);
6421   bind(done);
6422 }
6423 
6424 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6425   Label done;
6426   Label neg_divisor_fastpath;
6427 
6428   cmpl(divisor, 0);
6429   jccb(Assembler::less, neg_divisor_fastpath);
6430   xorl(rdx, rdx);
6431   divl(divisor);
6432   jmpb(done);
6433   bind(neg_divisor_fastpath);
6434   // Fastpath for divisor < 0:
6435   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6436   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6437   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6438   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
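  //
  // Illustrative scalar form of the fastpath (sketch only): when the divisor has
  // its sign bit set, the unsigned quotient can only be 0 or 1.
  //   static inline void udivmod_neg_divisor_ref(uint32_t n, uint32_t d,
  //                                              uint32_t* q, uint32_t* r) {
  //     *q = (n >= d) ? 1u : 0u;        // d >= 2^31 here, so n / d is at most 1
  //     *r = n - (*q ? d : 0u);
  //   }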
6439   movl(rdx, rax);
6440   subl(rax, divisor);
6441   if (VM_Version::supports_bmi1()) {
6442     andnl(rax, rax, rdx);
6443   } else {
6444     notl(rax);
6445     andl(rax, rdx);
6446   }
6447   movl(tmp, rax);
6448   shrl(rax, 31); // quotient
6449   sarl(tmp, 31);
6450   andl(tmp, divisor);
6451   subl(rdx, tmp); // remainder
6452   bind(done);
6453 }
6454 
6455 #ifdef _LP64
6456 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6457                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6461     mov64(rtmp, 0x8040201008040201L);
6462     movq(xtmp1, src);
6463     movq(xtmp2, rtmp);
6464     gf2p8affineqb(xtmp1, xtmp2, 0);
6465     movq(dst, xtmp1);
6466   } else {
6467     // Swap even and odd numbered bits.
6468     movl(rtmp, src);
6469     andl(rtmp, 0x55555555);
6470     shll(rtmp, 1);
6471     movl(dst, src);
6472     andl(dst, 0xAAAAAAAA);
6473     shrl(dst, 1);
6474     orl(dst, rtmp);
6475 
6476     // Swap LSB and MSB 2 bits of each nibble.
6477     movl(rtmp, dst);
6478     andl(rtmp, 0x33333333);
6479     shll(rtmp, 2);
6480     andl(dst, 0xCCCCCCCC);
6481     shrl(dst, 2);
6482     orl(dst, rtmp);
6483 
6484     // Swap LSB and MSB 4 bits of each byte.
6485     movl(rtmp, dst);
6486     andl(rtmp, 0x0F0F0F0F);
6487     shll(rtmp, 4);
6488     andl(dst, 0xF0F0F0F0);
6489     shrl(dst, 4);
6490     orl(dst, rtmp);
6491   }
6492   bswapl(dst);
6493 }
6494 
6495 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6496                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6500     mov64(rtmp1, 0x8040201008040201L);
6501     movq(xtmp1, src);
6502     movq(xtmp2, rtmp1);
6503     gf2p8affineqb(xtmp1, xtmp2, 0);
6504     movq(dst, xtmp1);
6505   } else {
6506     // Swap even and odd numbered bits.
6507     movq(rtmp1, src);
6508     mov64(rtmp2, 0x5555555555555555L);
6509     andq(rtmp1, rtmp2);
6510     shlq(rtmp1, 1);
6511     movq(dst, src);
6512     notq(rtmp2);
6513     andq(dst, rtmp2);
6514     shrq(dst, 1);
6515     orq(dst, rtmp1);
6516 
6517     // Swap LSB and MSB 2 bits of each nibble.
6518     movq(rtmp1, dst);
6519     mov64(rtmp2, 0x3333333333333333L);
6520     andq(rtmp1, rtmp2);
6521     shlq(rtmp1, 2);
6522     notq(rtmp2);
6523     andq(dst, rtmp2);
6524     shrq(dst, 2);
6525     orq(dst, rtmp1);
6526 
6527     // Swap LSB and MSB 4 bits of each byte.
6528     movq(rtmp1, dst);
6529     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6530     andq(rtmp1, rtmp2);
6531     shlq(rtmp1, 4);
6532     notq(rtmp2);
6533     andq(dst, rtmp2);
6534     shrq(dst, 4);
6535     orq(dst, rtmp1);
6536   }
6537   bswapq(dst);
6538 }
6539 
6540 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6541   Label done;
6542   Label neg_divisor_fastpath;
6543   cmpq(divisor, 0);
6544   jccb(Assembler::less, neg_divisor_fastpath);
6545   xorl(rdx, rdx);
6546   divq(divisor);
6547   jmpb(done);
6548   bind(neg_divisor_fastpath);
6549   // Fastpath for divisor < 0:
6550   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6551   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6552   movq(rdx, rax);
6553   subq(rdx, divisor);
6554   if (VM_Version::supports_bmi1()) {
6555     andnq(rax, rdx, rax);
6556   } else {
6557     notq(rdx);
6558     andq(rax, rdx);
6559   }
6560   shrq(rax, 63);
6561   bind(done);
6562 }
6563 
6564 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6565   Label done;
6566   Label neg_divisor_fastpath;
6567   cmpq(divisor, 0);
6568   jccb(Assembler::less, neg_divisor_fastpath);
6569   xorq(rdx, rdx);
6570   divq(divisor);
6571   jmp(done);
6572   bind(neg_divisor_fastpath);
6573   // Fastpath when divisor < 0:
6574   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6575   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6576   movq(rdx, rax);
6577   subq(rax, divisor);
6578   if (VM_Version::supports_bmi1()) {
6579     andnq(rax, rax, rdx);
6580   } else {
6581     notq(rax);
6582     andq(rax, rdx);
6583   }
6584   sarq(rax, 63);
6585   andq(rax, divisor);
6586   subq(rdx, rax);
6587   bind(done);
6588 }
6589 
6590 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6591   Label done;
6592   Label neg_divisor_fastpath;
6593   cmpq(divisor, 0);
6594   jccb(Assembler::less, neg_divisor_fastpath);
6595   xorq(rdx, rdx);
6596   divq(divisor);
6597   jmp(done);
6598   bind(neg_divisor_fastpath);
6599   // Fastpath for divisor < 0:
6600   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6601   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6602   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6603   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6604   movq(rdx, rax);
6605   subq(rax, divisor);
6606   if (VM_Version::supports_bmi1()) {
6607     andnq(rax, rax, rdx);
6608   } else {
6609     notq(rax);
6610     andq(rax, rdx);
6611   }
6612   movq(tmp, rax);
6613   shrq(rax, 63); // quotient
6614   sarq(tmp, 63);
6615   andq(tmp, divisor);
6616   subq(rdx, tmp); // remainder
6617   bind(done);
6618 }
6619 #endif
6620 
6621 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6622                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6623                                         int vlen_enc) {
6624   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices differing
  // by a multiple of 16 are placed at the same relative position within a
  // 128 bit lane, e.g. elements corresponding to shuffle indices 16, 32 and 48
  // all select the first element of their respective 128 bit lanes.
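  // For example (illustrative): shuffle index 37 selects source lane 37 / 16 == 2
  // (the third 128 bit lane) and, after the implicit "& 0xF" of the byte shuffle,
  // element 37 & 0xF == 5 within that broadcast lane.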
6631   movl(rtmp, 16);
6632   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6633 
6634   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6635   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6636   // original shuffle indices and move the shuffled lanes corresponding to true
6637   // mask to destination vector.
6638   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6639   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6640   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6641 
  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128 bit lane.
6644   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6645   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6646   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6647   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6648   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6649 
  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128 bit lane.
6652   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6653   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6654   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6655   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6656   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6657 
  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128 bit lane.
6660   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6661   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6662   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6663   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6664   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6665 }
6666 
6667 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6668                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6669   if (vlen_enc == AVX_128bit) {
6670     vpermilps(dst, src, shuffle, vlen_enc);
6671   } else if (bt == T_INT) {
6672     vpermd(dst, shuffle, src, vlen_enc);
6673   } else {
6674     assert(bt == T_FLOAT, "");
6675     vpermps(dst, shuffle, src, vlen_enc);
6676   }
6677 }
6678 
6679 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6680   switch(elem_bt) {
6681     case T_BYTE:
6682       if (ideal_opc == Op_SaturatingAddV) {
6683         vpaddsb(dst, src1, src2, vlen_enc);
6684       } else {
6685         assert(ideal_opc == Op_SaturatingSubV, "");
6686         vpsubsb(dst, src1, src2, vlen_enc);
6687       }
6688       break;
6689     case T_SHORT:
6690       if (ideal_opc == Op_SaturatingAddV) {
6691         vpaddsw(dst, src1, src2, vlen_enc);
6692       } else {
6693         assert(ideal_opc == Op_SaturatingSubV, "");
6694         vpsubsw(dst, src1, src2, vlen_enc);
6695       }
6696       break;
6697     default:
6698       fatal("Unsupported type %s", type2name(elem_bt));
6699       break;
6700   }
6701 }
6702 
6703 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6704   switch(elem_bt) {
6705     case T_BYTE:
6706       if (ideal_opc == Op_SaturatingAddV) {
6707         vpaddusb(dst, src1, src2, vlen_enc);
6708       } else {
6709         assert(ideal_opc == Op_SaturatingSubV, "");
6710         vpsubusb(dst, src1, src2, vlen_enc);
6711       }
6712       break;
6713     case T_SHORT:
6714       if (ideal_opc == Op_SaturatingAddV) {
6715         vpaddusw(dst, src1, src2, vlen_enc);
6716       } else {
6717         assert(ideal_opc == Op_SaturatingSubV, "");
6718         vpsubusw(dst, src1, src2, vlen_enc);
6719       }
6720       break;
6721     default:
6722       fatal("Unsupported type %s", type2name(elem_bt));
6723       break;
6724   }
6725 }
6726 
6727 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6728                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6730   // overflow_mask = Inp1 <u Inp2
6731   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6732   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6733   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6734 }
6735 
6736 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6737                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6738   // Emulate unsigned comparison using signed comparison
6739   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6740   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6741   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6742   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6743 
6744   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6745 
6746   // Res = INP1 - INP2 (non-commutative and non-associative)
6747   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6748   // Res = Mask ? Zero : Res
6749   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6750   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6751 }
6752 
6753 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6754                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // The unsigned value range comprises only non-negative numbers, thus only upper bound saturation exists.
6756   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6757   // Res = Signed Add INP1, INP2
6758   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6759   // T1 = SRC1 | SRC2
6760   vpor(xtmp1, src1, src2, vlen_enc);
6761   // Max_Unsigned = -1
6762   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6763   // Unsigned compare:  Mask = Res <u T1
6764   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6765   // res  = Mask ? Max_Unsigned : Res
6766   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6767 }
6768 
6769 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for saturating
// unsigned addition:
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
6779 //
6780 
6781 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6782                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6783   // Res = Signed Add INP1, INP2
6784   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6785   // Compute T1 = INP1 | INP2
6786   vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE (minimum signed value); xtmp1 is left holding all ones.
6788   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6789   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6790   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6791   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6792   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6794   if (elem_bt == T_INT) {
6795     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6796   } else {
6797     assert(elem_bt == T_LONG, "");
6798     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6799   }
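       // Res = overflow_mask ? Max_Unsigned (-1, still held in xtmp1) : Res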
6800   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6801 }
6802 
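     // Emulations of VPMOVQ2M/VPMOVD2M (copy each lane's sign bit into an opmask register) for targets
     // lacking AVX512DQ: an arithmetic right shift by 63 (or 31 below) broadcasts the sign bit across
     // the lane, and comparing the result against all-ones (-1) sets the mask bit for negative lanes.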
6803 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6804                                       int vlen_enc, bool xtmp2_hold_M1) {
6805   if (VM_Version::supports_avx512dq()) {
6806     evpmovq2m(ktmp, src, vlen_enc);
6807   } else {
6808     assert(VM_Version::supports_evex(), "");
6809     if (!xtmp2_hold_M1) {
6810       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6811     }
6812     evpsraq(xtmp1, src, 63, vlen_enc);
6813     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6814   }
6815 }
6816 
6817 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6818                                       int vlen_enc, bool xtmp2_hold_M1) {
6819   if (VM_Version::supports_avx512dq()) {
6820     evpmovd2m(ktmp, src, vlen_enc);
6821   } else {
6822     assert(VM_Version::supports_evex(), "");
6823     if (!xtmp2_hold_M1) {
6824       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6825     }
6826     vpsrad(xtmp1, src, 31, vlen_enc);
6827     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6828   }
6829 }
6830 
6831 
6832 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6833   if (elem_bt == T_LONG) {
6834     if (VM_Version::supports_evex()) {
6835       evpsraq(dst, src, 63, vlen_enc);
6836     } else {
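           // Without a 64-bit arithmetic shift, vpsrad turns each dword into 0 or -1 based on its own
           // sign, then vpshufd(0xF5) copies the upper dword of every qword into both halves, leaving
           // 0 or -1 per 64-bit lane.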
6837       vpsrad(dst, src, 31, vlen_enc);
6838       vpshufd(dst, dst, 0xF5, vlen_enc);
6839     }
6840   } else {
6841     assert(elem_bt == T_INT, "");
6842     vpsrad(dst, src, 31, vlen_enc);
6843   }
6844 }
6845 
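     // Materializes the per-lane maximum signed value (0x7FFF...FF) by logically right-shifting an
     // all-ones vector by one bit; the all-ones vector is (re)computed first when requested.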
6846 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6847   if (compute_allones) {
6848     if (vlen_enc == Assembler::AVX_512bit) {
6849       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6850     } else {
6851       vpcmpeqq(allones, allones, allones, vlen_enc);
6852     }
6853   }
6854   if (elem_bt == T_LONG) {
6855     vpsrlq(dst, allones, 1, vlen_enc);
6856   } else {
6857     assert(elem_bt == T_INT, "");
6858     vpsrld(dst, allones, 1, vlen_enc);
6859   }
6860 }
6861 
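     // Materializes the per-lane minimum signed value (0x80...00) by left-shifting an all-ones vector
     // by 31 (T_INT) or 63 (T_LONG) bits; the all-ones vector is (re)computed first when requested.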
6862 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6863   if (compute_allones) {
6864     if (vlen_enc == Assembler::AVX_512bit) {
6865       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6866     } else {
6867       vpcmpeqq(allones, allones, allones, vlen_enc);
6868     }
6869   }
6870   if (elem_bt == T_LONG) {
6871     vpsllq(dst, allones, 63, vlen_enc);
6872   } else {
6873     assert(elem_bt == T_INT, "");
6874     vpslld(dst, allones, 31, vlen_enc);
6875   }
6876 }
6877 
6878 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6879                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6880   switch(elem_bt) {
6881     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6882     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6883     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6884     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6885     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6886   }
6887 }
6888 
6889 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6890   switch(elem_bt) {
6891     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6892     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6893     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6894     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6895     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6896   }
6897 }
6898 
6899 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6900                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6901   if (elem_bt == T_LONG) {
6902     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6903   } else {
6904     assert(elem_bt == T_INT, "");
6905     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6906   }
6907 }
6908 
6909 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6910                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6911                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6912   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6913   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6914   // Overflow detection is based on Hacker's Delight, section 2-13.
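       // Lane-wise scalar sketch (illustrative only, not emitted code):
       //   res = a +/- b;                                        // wrapping two's complement op
       //   if (overflow) res = (a < 0) ? MIN_VALUE : MAX_VALUE;  // saturate toward the sign of the first input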
6915   if (ideal_opc == Op_SaturatingAddV) {
6916     // res = src1 + src2
6917     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6918     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6919     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6920     vpxor(xtmp1, dst, src1, vlen_enc);
6921     vpxor(xtmp2, dst, src2, vlen_enc);
6922     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6923   } else {
6924     assert(ideal_opc == Op_SaturatingSubV, "");
6925     // res = src1 - src2
6926     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6927     // Overflow occurs when the inputs have opposite signs and
6928     // the result's sign differs from the sign of the first input.
6929     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6930     vpxor(xtmp1, src1, src2, vlen_enc);
6931     vpxor(xtmp2, dst, src1, vlen_enc);
6932     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6933   }
6934 
6935   // Compute overflow detection mask.
6936   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6937   // Note: xtmp1 holds -1 in all its lanes after the above call.
6938 
6939   // Compute mask based on first input polarity.
6940   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6941 
6942   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6943   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6944 
6945   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6946   // set bits in the first input polarity mask hold the MIN value.
6947   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6948   // Blend destination lanes with saturated values using overflow detection mask.
6949   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6950 }
6951 
6952 
6953 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6954                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6955                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6956   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6957   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6958   // Overflow detection is based on Hacker's Delight, section 2-13.
6959   if (ideal_opc == Op_SaturatingAddV) {
6960     // res = src1 + src2
6961     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6962     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6963     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6964     vpxor(xtmp1, dst, src1, vlen_enc);
6965     vpxor(xtmp2, dst, src2, vlen_enc);
6966     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6967   } else {
6968     assert(ideal_opc == Op_SaturatingSubV, "");
6969     // res = src1 - src2
6970     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6971     // Overflow occurs when the inputs have opposite signs and
6972     // the result's sign differs from the sign of the first input.
6973     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6974     vpxor(xtmp1, src1, src2, vlen_enc);
6975     vpxor(xtmp2, dst, src1, vlen_enc);
6976     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6977   }
6978 
6979   // Sign-extend to compute overflow detection mask.
6980   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6981 
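       // xtmp1 = -1 (all ones) in every lane; used below to derive the MAX/MIN saturation vectors.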
6982   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6983   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6984   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6985 
6986   // Compose saturating min/max vector using first input polarity mask.
6987   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6988   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6989 
6990   // Blend result with saturating vector using overflow detection mask.
6991   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6992 }
6993 
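     // Byte/short saturating lane operations map directly onto the hardware saturating instructions:
     // vpadds[bw]/vpsubs[bw] here and vpaddus[bw]/vpsubus[bw] in the unsigned variant below.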
6994 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6995   switch(elem_bt) {
6996     case T_BYTE:
6997       if (ideal_opc == Op_SaturatingAddV) {
6998         vpaddsb(dst, src1, src2, vlen_enc);
6999       } else {
7000         assert(ideal_opc == Op_SaturatingSubV, "");
7001         vpsubsb(dst, src1, src2, vlen_enc);
7002       }
7003       break;
7004     case T_SHORT:
7005       if (ideal_opc == Op_SaturatingAddV) {
7006         vpaddsw(dst, src1, src2, vlen_enc);
7007       } else {
7008         assert(ideal_opc == Op_SaturatingSubV, "");
7009         vpsubsw(dst, src1, src2, vlen_enc);
7010       }
7011       break;
7012     default:
7013       fatal("Unsupported type %s", type2name(elem_bt));
7014       break;
7015   }
7016 }
7017 
7018 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7019   switch(elem_bt) {
7020     case T_BYTE:
7021       if (ideal_opc == Op_SaturatingAddV) {
7022         vpaddusb(dst, src1, src2, vlen_enc);
7023       } else {
7024         assert(ideal_opc == Op_SaturatingSubV, "");
7025         vpsubusb(dst, src1, src2, vlen_enc);
7026       }
7027       break;
7028     case T_SHORT:
7029       if (ideal_opc == Op_SaturatingAddV) {
7030         vpaddusw(dst, src1, src2, vlen_enc);
7031       } else {
7032         assert(ideal_opc == Op_SaturatingSubV, "");
7033         vpsubusw(dst, src1, src2, vlen_enc);
7034       }
7035       break;
7036     default:
7037       fatal("Unsupported type %s", type2name(elem_bt));
7038       break;
7039   }
7040 }
7041 
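     // Two-source permute via VPERMI2*: dst initially holds the selection indices and is overwritten
     // with elements gathered from the two source vectors src1 and src2.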
7042 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7043                                                      XMMRegister src2, int vlen_enc) {
7044   switch(elem_bt) {
7045     case T_BYTE:
7046       evpermi2b(dst, src1, src2, vlen_enc);
7047       break;
7048     case T_SHORT:
7049       evpermi2w(dst, src1, src2, vlen_enc);
7050       break;
7051     case T_INT:
7052       evpermi2d(dst, src1, src2, vlen_enc);
7053       break;
7054     case T_LONG:
7055       evpermi2q(dst, src1, src2, vlen_enc);
7056       break;
7057     case T_FLOAT:
7058       evpermi2ps(dst, src1, src2, vlen_enc);
7059       break;
7060     case T_DOUBLE:
7061       evpermi2pd(dst, src1, src2, vlen_enc);
7062       break;
7063     default:
7064       fatal("Unsupported type %s", type2name(elem_bt));
7065       break;
7066   }
7067 }
7068 
7069 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7070   if (is_unsigned) {
7071     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7072   } else {
7073     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7074   }
7075 }
7076 
7077 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7078   if (is_unsigned) {
7079     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7080   } else {
7081     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7082   }
7083 }