1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  54   if (C->clinit_barrier_on_entry()) {
  55     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  56     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  57 
  58     Label L_skip_barrier;
  59     Register klass = rscratch1;
  60 
  61     mov_metadata(klass, C->method()->holder()->constant_encoding());
  62     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  63 
  64     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  65 
  66     bind(L_skip_barrier);
  67   }
  68 
  69   int framesize = C->output()->frame_size_in_bytes();
  70   int bangsize = C->output()->bang_size_in_bytes();
  71   bool fp_mode_24b = false;
  72   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  73 
  74   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  75   // NativeJump::patch_verified_entry will be able to patch out the entry
  76   // code safely. The push to verify stack depth is ok at 5 bytes,
  77   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  78   // stack bang then we must use the 6 byte frame allocation even if
  79   // we have no frame. :-(
  80   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  81 
  82   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  83   // Remove word for return addr
  84   framesize -= wordSize;
  85   stack_bang_size -= wordSize;
  86 
  87   // Calls to C2R adapters often do not accept exceptional returns.
  88   // We require their callers to bang the stack for them.  But be careful, because
  89   // some VM calls (such as call site linkage) can use several kilobytes of
  90   // stack.  The stack safety zone should account for that.
  91   // See bugs 4446381, 4468289, 4497237.
  92   if (stack_bang_size > 0) {
  93     generate_stack_overflow_check(stack_bang_size);
  94 
  95     // We always push rbp so that on return to the interpreter rbp will be
  96     // restored correctly and we can correct the stack.
  97     push(rbp);
  98     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  99     if (PreserveFramePointer) {
 100       mov(rbp, rsp);
 101     }
 102     // Remove word for ebp
 103     framesize -= wordSize;
 104 
 105     // Create frame
 106     if (framesize) {
 107       subptr(rsp, framesize);
 108     }
 109   } else {
 110     // Create frame (force generation of a 4 byte immediate value)
 111     subptr_imm32(rsp, framesize);
 112 
 113     // Save RBP register now.
 114     framesize -= wordSize;
 115     movptr(Address(rsp, framesize), rbp);
 116     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 117     if (PreserveFramePointer) {
 118       movptr(rbp, rsp);
 119       if (framesize > 0) {
 120         addptr(rbp, framesize);
 121       }
 122     }
 123   }
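       // Frame at this point: [rsp + framesize] holds the saved rbp, the return
       // address sits one word above it, and 'framesize' bytes of frame body lie
       // below, down to rsp.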
 124 
 125   if (C->needs_stack_repair()) {
 126     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 127     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 128     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 129   }
 130 
 131   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 132     framesize -= wordSize;
 133     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 134   }
 135 
 136 #ifndef _LP64
 137   // If method sets FPU control word do it now
 138   if (fp_mode_24b) {
 139     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 140   }
 141   if (UseSSE >= 2 && VerifyFPU) {
 142     verify_FPU(0, "FPU stack must be clean on entry");
 143   }
 144 #endif
 145 
 146 #ifdef ASSERT
 147   if (VerifyStackAtCalls) {
 148     Label L;
 149     push(rax);
 150     mov(rax, rsp);
 151     andptr(rax, StackAlignmentInBytes-1);
 152     cmpptr(rax, StackAlignmentInBytes-wordSize);
 153     pop(rax);
 154     jcc(Assembler::equal, L);
 155     STOP("Stack is not properly aligned!");
 156     bind(L);
 157   }
 158 #endif
 159 }
 160 
 161 void C2_MacroAssembler::entry_barrier() {
 162   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 163 #ifdef _LP64
 164   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 165     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 166     Label dummy_slow_path;
 167     Label dummy_continuation;
 168     Label* slow_path = &dummy_slow_path;
 169     Label* continuation = &dummy_continuation;
 170     if (!Compile::current()->output()->in_scratch_emit_size()) {
 171       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 172       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 173       Compile::current()->output()->add_stub(stub);
 174       slow_path = &stub->entry();
 175       continuation = &stub->continuation();
 176     }
 177     bs->nmethod_entry_barrier(this, slow_path, continuation);
 178   }
 179 #else
 180   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 181   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 182 #endif
 183 }
 184 
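     // Map a vector length in bytes to the AVX encoding expected by the assembler.
     // Vectors of 4 or 8 bytes are emitted with 128-bit (XMM) encodings.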
 185 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 186   switch (vlen_in_bytes) {
 187     case  4: // fall-through
 188     case  8: // fall-through
 189     case 16: return Assembler::AVX_128bit;
 190     case 32: return Assembler::AVX_256bit;
 191     case 64: return Assembler::AVX_512bit;
 192 
 193     default: {
 194       ShouldNotReachHere();
 195       return Assembler::AVX_NoVec;
 196     }
 197   }
 198 }
 199 
 200 // fast_lock and fast_unlock used by C2
 201 
 202 // Because the transitions from emitted code to the runtime
 203 // monitorenter/exit helper stubs are so slow it's critical that
 204 // we inline both the stack-locking fast path and the inflated fast path.
 205 //
 206 // See also: cmpFastLock and cmpFastUnlock.
 207 //
 208 // What follows is a specialized inline transliteration of the code
 209 // in enter() and exit(). If we're concerned about I$ bloat another
 210 // option would be to emit TrySlowEnter and TrySlowExit methods
 211 // at startup-time.  These methods would accept arguments as
 212 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 213 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 214 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 215 // In practice, however, the # of lock sites is bounded and is usually small.
 216 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 217 // if the processor uses simple bimodal branch predictors keyed by EIP,
 218 // since the helper routines would be called from multiple synchronization
 219 // sites.
 220 //
 221 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 222 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 223 // to those specialized methods.  That'd give us a mostly platform-independent
 224 // implementation that the JITs could optimize and inline at their pleasure.
 225 // Done correctly, the only time we'd need to cross over to native code would be
 226 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 227 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 228 // (b) emit explicit barriers or fence operations.
 229 //
 230 // TODO:
 231 //
 232 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 233 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 234 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 235 //    the lock operators would typically be faster than reifying Self.
 236 //
 237 // *  Ideally I'd define the primitives as:
 238 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 239 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 240 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 241 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 242 //    Furthermore, the register assignments are overconstrained, possibly resulting in
 243 //    sub-optimal code near the synchronization site.
 244 //
 245 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 246 //    Alternately, use a better sp-proximity test.
 247 //
 248 // *  Currently ObjectMonitor._owner can hold either an sp value or a (THREAD *) value.
 249 //    Either one is sufficient to uniquely identify a thread.
 250 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 251 //
 252 // *  Intrinsify notify() and notifyAll() for the common cases where the
 253 //    object is locked by the calling thread but the waitlist is empty.
 254 //    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 255 //
 256 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 257 //    But beware of excessive branch density on AMD Opterons.
 258 //
 259 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 260 //    or failure of the fast path.  If the fast path fails then we pass
 261 //    control to the slow path, typically in C.  In fast_lock and
 262 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 263 //    will emit a conditional branch immediately after the node.
 264 //    So we have branches to branches and lots of ICC.ZF games.
 265 //    Instead, it might be better to have C2 pass a "FailureLabel"
 266 //    into fast_lock and fast_unlock.  In the case of success, control
 267 //    will drop through the node.  ICC.ZF is undefined at exit.
 268 //    In the case of failure, the node will branch directly to the
 269 //    FailureLabel.
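     //
     // As a rough sketch (not the literal C2 output), the code surrounding a
     // cmpFastLock/cmpFastUnlock node therefore looks like:
     //
     //    fast_lock(obj, box, rax, scr, ...)   // sets ZF: 1 = success, 0 = failure
     //    jne   slow_path                      // ZF == 0 -> take the runtime slow path
     //    ...                                  // ZF == 1 -> lock acquired, fall through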
 270 
 271 
 272 // obj: object to lock
 273 // box: on-stack box address (displaced header location) - KILLED
 274 // rax: tmp -- KILLED
 275 // scr: tmp -- KILLED
 276 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 277                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 278                                  Metadata* method_data) {
 279   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 280   // Ensure the register assignments are disjoint
 281   assert(tmpReg == rax, "");
 282   assert(cx1Reg == noreg, "");
 283   assert(cx2Reg == noreg, "");
 284   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 285 
 286   // Possible cases that we'll encounter in fast_lock
 287   // ------------------------------------------------
 288   // * Inflated
 289   //    -- unlocked
 290   //    -- Locked
 291   //       = by self
 292   //       = by other
 293   // * neutral
 294   // * stack-locked
 295   //    -- by self
 296   //       = sp-proximity test hits
 297   //       = sp-proximity test generates false-negative
 298   //    -- by other
 299   //
 300 
 301   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 302 
 303   if (DiagnoseSyncOnValueBasedClasses != 0) {
 304     load_klass(tmpReg, objReg, scrReg);
 305     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 306     jcc(Assembler::notZero, DONE_LABEL);
 307   }
 308 
 309   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 310   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 311   jcc(Assembler::notZero, IsInflated);
 312 
 313   if (LockingMode == LM_MONITOR) {
 314     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be non-zero.
 315     testptr(objReg, objReg);
 316   } else {
 317     assert(LockingMode == LM_LEGACY, "must be");
 318     // Attempt stack-locking ...
 319     orptr (tmpReg, markWord::unlocked_value);
 320     if (EnableValhalla) {
 321       // Mask inline_type bit such that we go to the slow path if object is an inline type
 322       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 323     }
 324     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 325     lock();
 326     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 327     jcc(Assembler::equal, COUNT);           // Success
 328 
 329     // Recursive locking.
 330     // The object is stack-locked: markword contains stack pointer to BasicLock.
 331     // Locked by current thread if difference with current SP is less than one page.
 332     subptr(tmpReg, rsp);
 333     // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 334     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 335     movptr(Address(boxReg, 0), tmpReg);
 336   }
 337   jmp(DONE_LABEL);
 338 
 339   bind(IsInflated);
 340   // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 341 
 342 #ifndef _LP64
 343   // The object is inflated.
 344 
 345   // boxReg refers to the on-stack BasicLock in the current frame.
 346   // We'd like to write:
 347   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 348   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 349   // additional latency as we have another ST in the store buffer that must drain.
 350 
 351   // avoid ST-before-CAS
 352   // register juggle because we need tmpReg for cmpxchgptr below
 353   movptr(scrReg, boxReg);
 354   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 355 
 356   // Optimistic form: consider XORL tmpReg,tmpReg
 357   movptr(tmpReg, NULL_WORD);
 358 
 359   // Appears unlocked - try to swing _owner from null to non-null.
 360   // Ideally, I'd manifest "Self" with get_thread and then attempt
 361   // to CAS the register containing Self into m->Owner.
 362   // But we don't have enough registers, so instead we can either try to CAS
 363   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 364   // we later store "Self" into m->Owner.  Transiently storing a stack address
 365   // (rsp or the address of the box) into  m->owner is harmless.
 366   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 367   lock();
 368   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 369   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 370   // If we weren't able to swing _owner from null to the BasicLock
 371   // then take the slow path.
 372   jccb  (Assembler::notZero, NO_COUNT);
 373   // update _owner from BasicLock to thread
 374   get_thread (scrReg);                    // beware: clobbers ICCs
 375   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 376   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 377 
 378   // If the CAS fails we can either retry or pass control to the slow path.
 379   // We use the latter tactic.
 380   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 381   // If the CAS was successful ...
 382   //   Self has acquired the lock
 383   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 384   // Intentional fall-through into DONE_LABEL ...
 385 #else // _LP64
 386   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 387   movq(scrReg, tmpReg);
 388   xorq(tmpReg, tmpReg);
 389   lock();
 390   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 391   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 392   // Without the cast to int32_t this style of movptr would destroy r10, which typically holds obj.
 393   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 394   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 395   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 396 
 397   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 398   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 399   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 400   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 401 #endif // _LP64
 402   bind(DONE_LABEL);
 403 
 404   // ZFlag == 1 count in fast path
 405   // ZFlag == 0 count in slow path
 406   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 407 
 408   bind(COUNT);
 409   // Count monitors in fast path
 410   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 411 
 412   xorl(tmpReg, tmpReg); // Set ZF == 1
 413 
 414   bind(NO_COUNT);
 415 
 416   // At NO_COUNT the icc ZFlag is set as follows ...
 417   // fast_unlock uses the same protocol.
 418   // ZFlag == 1 -> Success
 419   // ZFlag == 0 -> Failure - force control through the slow path
 420 }
 421 
 422 // obj: object to unlock
 423 // box: box address (displaced header location), killed.  Must be EAX.
 424 // tmp: killed, cannot be obj nor box.
 425 //
 426 // Some commentary on balanced locking:
 427 //
 428 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 429 // Methods that don't have provably balanced locking are forced to run in the
 430 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 431 // The interpreter provides two properties:
 432 // I1:  At return-time the interpreter automatically and quietly unlocks any
 433 //      objects acquired by the current activation (frame).  Recall that the
 434 //      interpreter maintains an on-stack list of locks currently held by
 435 //      a frame.
 436 // I2:  If a method attempts to unlock an object that is not held by the
 437 //      frame, the interpreter throws IMSX.
 438 //
 439 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 440 // B() doesn't have provably balanced locking so it runs in the interpreter.
 441 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 442 // is still locked by A().
 443 //
 444 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 445 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 446 // should not be unlocked by "normal" Java-level locking and vice-versa.  The specification
 447 // doesn't say what will occur if a program engages in such mixed-mode locking, however.
 448 // Arguably, given that the spec leaves the JNI case undefined, our implementation
 449 // could reasonably *avoid* checking the owner in fast_unlock().
 450 // In the interest of performance we elide the m->Owner == Self check in unlock.
 451 // A perfectly viable alternative is to elide the owner check except when
 452 // Xcheck:jni is enabled.
 453 
 454 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 455   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 456   assert(boxReg == rax, "");
 457   assert_different_registers(objReg, boxReg, tmpReg);
 458 
 459   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 460 
 461   if (LockingMode == LM_LEGACY) {
 462     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 463     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 464   }
 465   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 466   if (LockingMode != LM_MONITOR) {
 467     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 468     jcc(Assembler::zero, Stacked);
 469   }
 470 
 471   // It's inflated.
 472 
 473   // Despite our balanced locking property we still check that m->_owner == Self
 474   // as java routines or native JNI code called by this thread might
 475   // have released the lock.
 476   // Refer to the comments in synchronizer.cpp for how we might encode extra
 477   // state in _succ so we can avoid fetching EntryList|cxq.
 478   //
 479   // If there's no contention try a 1-0 exit.  That is, exit without
 480   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 481   // we detect and recover from the race that the 1-0 exit admits.
 482   //
 483   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 484   // before it STs null into _owner, releasing the lock.  Updates
 485   // to data protected by the critical section must be visible before
 486   // we drop the lock (and thus before any other thread could acquire
 487   // the lock and observe the fields protected by the lock).
 488   // IA32's memory model keeps stores ordered with respect to
 489   // each other, so there's no need for an explicit barrier (fence).
 490   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 491   Label LSuccess, LNotRecursive;
 492 
 493   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 494   jccb(Assembler::equal, LNotRecursive);
 495 
 496   // Recursive inflated unlock
 497   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 498   jmpb(LSuccess);
 499 
 500   bind(LNotRecursive);
 501 
 502   // Set owner to null.
 503   // Release to satisfy the JMM
 504   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 505   // We need a full fence after clearing owner to avoid stranding.
 506   // StoreLoad achieves this.
 507   membar(StoreLoad);
 508 
 509   // Check if the entry lists are empty (EntryList first - by convention).
 510   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 511   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 512   jccb(Assembler::zero, LSuccess);    // If so we are done.
 513 
 514   // Check if there is a successor.
 515   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 516   jccb(Assembler::notZero, LSuccess); // If so we are done.
 517 
 518   // Save the monitor pointer in the current thread, so we can try to
 519   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 520   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 521 #ifndef _LP64
 522   get_thread(boxReg);
 523   movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 524 #else // _LP64
 525   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 526 #endif
 527 
 528   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 529   jmpb  (DONE_LABEL);
 530 
 531   bind  (LSuccess);
 532   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 533   jmpb  (DONE_LABEL);
 534 
 535   if (LockingMode == LM_LEGACY) {
 536     bind  (Stacked);
 537     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 538     lock();
 539     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 540     // Intentional fall-thru into DONE_LABEL
 541   }
 542 
 543   bind(DONE_LABEL);
 544 
 545   // ZFlag == 1 count in fast path
 546   // ZFlag == 0 count in slow path
 547   jccb(Assembler::notZero, NO_COUNT);
 548 
 549   bind(COUNT);
 550   // Count monitors in fast path
 551 #ifndef _LP64
 552   get_thread(tmpReg);
 553   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 554 #else // _LP64
 555   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 556 #endif
 557 
 558   xorl(tmpReg, tmpReg); // Set ZF == 1
 559 
 560   bind(NO_COUNT);
 561 }
 562 
 563 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 564                                               Register t, Register thread) {
 565   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 566   assert(rax_reg == rax, "Used for CAS");
 567   assert_different_registers(obj, box, rax_reg, t, thread);
 568 
 569   // Handle inflated monitor.
 570   Label inflated;
 571   // Finish fast lock successfully. ZF value is irrelevant.
 572   Label locked;
 573   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 574   Label slow_path;
 575 
 576   if (UseObjectMonitorTable) {
 577     // Clear cache in case fast locking succeeds.
 578     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 579   }
 580 
 581   if (DiagnoseSyncOnValueBasedClasses != 0) {
 582     load_klass(rax_reg, obj, t);
 583     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 584     jcc(Assembler::notZero, slow_path);
 585   }
 586 
 587   const Register mark = t;
 588 
 589   { // Lightweight Lock
 590 
 591     Label push;
 592 
 593     const Register top = UseObjectMonitorTable ? rax_reg : box;
 594 
 595     // Load the mark.
 596     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 597 
 598     // Prefetch top.
 599     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 600 
 601     // Check for monitor (0b10).
 602     testptr(mark, markWord::monitor_value);
 603     jcc(Assembler::notZero, inflated);
 604 
 605     // Check if lock-stack is full.
 606     cmpl(top, LockStack::end_offset() - 1);
 607     jcc(Assembler::greater, slow_path);
 608 
 609     // Check if recursive.
 610     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 611     jccb(Assembler::equal, push);
 612 
 613     // Try to lock. Transition lock bits 0b01 => 0b00
 614     movptr(rax_reg, mark);
 615     orptr(rax_reg, markWord::unlocked_value);
 616     andptr(mark, ~(int32_t)markWord::unlocked_value);
 617     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 618     jcc(Assembler::notEqual, slow_path);
 619 
 620     if (UseObjectMonitorTable) {
 621       // Need to reload top, clobbered by CAS.
 622       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 623     }
 624     bind(push);
 625     // After successful lock, push object on lock-stack.
 626     movptr(Address(thread, top), obj);
 627     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 628     jmpb(locked);
 629   }
 630 
 631   { // Handle inflated monitor.
 632     bind(inflated);
 633 
 634     const Register monitor = t;
 635 
 636     if (!UseObjectMonitorTable) {
 637       assert(mark == monitor, "should be the same here");
 638     } else {
 639       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 640       // Fetch ObjectMonitor* from the cache or take the slow-path.
 641       Label monitor_found;
 642 
 643       // Load cache address
 644       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 645 
 646       const int num_unrolled = 2;
 647       for (int i = 0; i < num_unrolled; i++) {
 648         cmpptr(obj, Address(t));
 649         jccb(Assembler::equal, monitor_found);
 650         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 651       }
 652 
 653       Label loop;
 654 
 655       // Search for obj in cache.
 656       bind(loop);
 657 
 658       // Check for match.
 659       cmpptr(obj, Address(t));
 660       jccb(Assembler::equal, monitor_found);
 661 
 662       // Search until null encountered, guaranteed _null_sentinel at end.
 663       cmpptr(Address(t), 1);
 664       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 665       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 666       jmpb(loop);
 667 
 668       // Cache hit.
 669       bind(monitor_found);
 670       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 671     }
 672     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 673     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 674     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 675 
 676     Label monitor_locked;
 677     // Lock the monitor.
 678 
 679     // CAS owner (null => current thread).
 680     xorptr(rax_reg, rax_reg);
 681     lock(); cmpxchgptr(thread, owner_address);
 682     jccb(Assembler::equal, monitor_locked);
 683 
 684     // Check if recursive.
 685     cmpptr(thread, rax_reg);
 686     jccb(Assembler::notEqual, slow_path);
 687 
 688     // Recursive.
 689     increment(recursions_address);
 690 
 691     bind(monitor_locked);
 692     if (UseObjectMonitorTable) {
 693       // Cache the monitor for unlock
 694       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 695     }
 696   }
 697 
 698   bind(locked);
 699   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 700   // Set ZF = 1
 701   xorl(rax_reg, rax_reg);
 702 
 703 #ifdef ASSERT
 704   // Check that locked label is reached with ZF set.
 705   Label zf_correct;
 706   Label zf_bad_zero;
 707   jcc(Assembler::zero, zf_correct);
 708   jmp(zf_bad_zero);
 709 #endif
 710 
 711   bind(slow_path);
 712 #ifdef ASSERT
 713   // Check that slow_path label is reached with ZF not set.
 714   jcc(Assembler::notZero, zf_correct);
 715   stop("Fast Lock ZF != 0");
 716   bind(zf_bad_zero);
 717   stop("Fast Lock ZF != 1");
 718   bind(zf_correct);
 719 #endif
 720   // C2 uses the value of ZF to determine the continuation.
 721 }
 722 
 723 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 724   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 725   assert(reg_rax == rax, "Used for CAS");
 726   assert_different_registers(obj, reg_rax, t);
 727 
 728   // Handle inflated monitor.
 729   Label inflated, inflated_check_lock_stack;
 730   // Finish fast unlock successfully.  MUST jump with ZF == 1
 731   Label unlocked, slow_path;
 732 
 733   const Register mark = t;
 734   const Register monitor = t;
 735   const Register top = UseObjectMonitorTable ? t : reg_rax;
 736   const Register box = reg_rax;
 737 
 738   Label dummy;
 739   C2FastUnlockLightweightStub* stub = nullptr;
 740 
 741   if (!Compile::current()->output()->in_scratch_emit_size()) {
 742     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 743     Compile::current()->output()->add_stub(stub);
 744   }
 745 
 746   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 747 
 748   { // Lightweight Unlock
 749 
 750     // Load top.
 751     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 752 
 753     if (!UseObjectMonitorTable) {
 754       // Prefetch mark.
 755       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 756     }
 757 
 758     // Check if obj is top of lock-stack.
 759     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 760     // Top of lock stack was not obj. Must be monitor.
 761     jcc(Assembler::notEqual, inflated_check_lock_stack);
 762 
 763     // Pop lock-stack.
 764     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 765     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 766 
 767     // Check if recursive.
 768     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 769     jcc(Assembler::equal, unlocked);
 770 
 771     // We elide the monitor check, let the CAS fail instead.
 772 
 773     if (UseObjectMonitorTable) {
 774       // Load mark.
 775       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 776     }
 777 
 778     // Try to unlock. Transition lock bits 0b00 => 0b01
 779     movptr(reg_rax, mark);
 780     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 781     orptr(mark, markWord::unlocked_value);
 782     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 783     jcc(Assembler::notEqual, push_and_slow_path);
 784     jmp(unlocked);
 785   }
 786 
 787 
 788   { // Handle inflated monitor.
 789     bind(inflated_check_lock_stack);
 790 #ifdef ASSERT
 791     Label check_done;
 792     subl(top, oopSize);
 793     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 794     jcc(Assembler::below, check_done);
 795     cmpptr(obj, Address(thread, top));
 796     jccb(Assembler::notEqual, inflated_check_lock_stack);
 797     stop("Fast Unlock lock on stack");
 798     bind(check_done);
 799     if (UseObjectMonitorTable) {
 800       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 801     }
 802     testptr(mark, markWord::monitor_value);
 803     jccb(Assembler::notZero, inflated);
 804     stop("Fast Unlock not monitor");
 805 #endif
 806 
 807     bind(inflated);
 808 
 809     if (!UseObjectMonitorTable) {
 810       assert(mark == monitor, "should be the same here");
 811     } else {
 812       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 813       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 814       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 815       cmpptr(monitor, alignof(ObjectMonitor*));
 816       jcc(Assembler::below, slow_path);
 817     }
 818     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 819     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 820     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 821     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 822     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 823     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 824 
 825     Label recursive;
 826 
 827     // Check if recursive.
 828     cmpptr(recursions_address, 0);
 829     jccb(Assembler::notZero, recursive);
 830 
 831     // Set owner to null.
 832     // Release to satisfy the JMM
 833     movptr(owner_address, NULL_WORD);
 834     // We need a full fence after clearing owner to avoid stranding.
 835     // StoreLoad achieves this.
 836     membar(StoreLoad);
 837 
 838     // Check if the entry lists are empty (EntryList first - by convention).
 839     movptr(reg_rax, EntryList_address);
 840     orptr(reg_rax, cxq_address);
 841     jccb(Assembler::zero, unlocked);    // If so we are done.
 842 
 843     // Check if there is a successor.
 844     cmpptr(succ_address, NULL_WORD);
 845     jccb(Assembler::notZero, unlocked); // If so we are done.
 846 
 847     // Save the monitor pointer in the current thread, so we can try to
 848     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 849     if (!UseObjectMonitorTable) {
 850       andptr(monitor, ~(int32_t)markWord::monitor_value);
 851     }
 852     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 853 
 854     orl(t, 1); // Fast Unlock ZF = 0
 855     jmpb(slow_path);
 856 
 857     // Recursive unlock.
 858     bind(recursive);
 859     decrement(recursions_address);
 860   }
 861 
 862   bind(unlocked);
 863   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
 864   xorl(t, t); // Fast Unlock ZF = 1
 865 
 866 #ifdef ASSERT
 867   // Check that unlocked label is reached with ZF set.
 868   Label zf_correct;
 869   Label zf_bad_zero;
 870   jcc(Assembler::zero, zf_correct);
 871   jmp(zf_bad_zero);
 872 #endif
 873 
 874   bind(slow_path);
 875   if (stub != nullptr) {
 876     bind(stub->slow_path_continuation());
 877   }
 878 #ifdef ASSERT
 879   // Check that stub->continuation() label is reached with ZF not set.
 880   jcc(Assembler::notZero, zf_correct);
 881   stop("Fast Unlock ZF != 0");
 882   bind(zf_bad_zero);
 883   stop("Fast Unlock ZF != 1");
 884   bind(zf_correct);
 885 #endif
 886   // C2 uses the value of ZF to determine the continuation.
 887 }
 888 
 889 //-------------------------------------------------------------------------------------------
 890 // Generic instructions support for use in .ad files C2 code generation
 891 
 892 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 893   if (dst != src) {
 894     movdqu(dst, src);
 895   }
 896   if (opcode == Op_AbsVD) {
 897     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 898   } else {
 899     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 900     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 901   }
 902 }
 903 
 904 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 905   if (opcode == Op_AbsVD) {
 906     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 907   } else {
 908     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 909     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 910   }
 911 }
 912 
 913 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 914   if (dst != src) {
 915     movdqu(dst, src);
 916   }
 917   if (opcode == Op_AbsVF) {
 918     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 919   } else {
 920     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 921     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 922   }
 923 }
 924 
 925 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 926   if (opcode == Op_AbsVF) {
 927     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 928   } else {
 929     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 930     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 931   }
 932 }
 933 
 934 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 935   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 936   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 937 
 938   if (opcode == Op_MinV) {
 939     if (elem_bt == T_BYTE) {
 940       pminsb(dst, src);
 941     } else if (elem_bt == T_SHORT) {
 942       pminsw(dst, src);
 943     } else if (elem_bt == T_INT) {
 944       pminsd(dst, src);
 945     } else {
 946       assert(elem_bt == T_LONG, "required");
 947       assert(tmp == xmm0, "required");
 948       assert_different_registers(dst, src, tmp);
 949       movdqu(xmm0, dst);
 950       pcmpgtq(xmm0, src);
 951       blendvpd(dst, src);  // xmm0 as mask
 952     }
 953   } else { // opcode == Op_MaxV
 954     if (elem_bt == T_BYTE) {
 955       pmaxsb(dst, src);
 956     } else if (elem_bt == T_SHORT) {
 957       pmaxsw(dst, src);
 958     } else if (elem_bt == T_INT) {
 959       pmaxsd(dst, src);
 960     } else {
 961       assert(elem_bt == T_LONG, "required");
 962       assert(tmp == xmm0, "required");
 963       assert_different_registers(dst, src, tmp);
 964       movdqu(xmm0, src);
 965       pcmpgtq(xmm0, dst);
 966       blendvpd(dst, src);  // xmm0 as mask
 967     }
 968   }
 969 }
 970 
 971 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 972                                   XMMRegister src1, Address src2, int vlen_enc) {
 973   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 974   if (opcode == Op_UMinV) {
 975     switch(elem_bt) {
 976       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 977       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 978       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 979       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 980       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 981     }
 982   } else {
 983     assert(opcode == Op_UMaxV, "required");
 984     switch(elem_bt) {
 985       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 986       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 987       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 988       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 989       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 990     }
 991   }
 992 }
 993 
 994 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 995   // For optimality, leverage a full vector width of 512 bits
 996   // for operations over smaller vector sizes on AVX512 targets.
 997   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 998     if (opcode == Op_UMaxV) {
 999       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
1000     } else {
1001       assert(opcode == Op_UMinV, "required");
1002       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
1003     }
1004   } else {
1005     // T1 = -1
1006     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
1007     // T1 = -1 << 63
1008     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
1009     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
1010     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
1011     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
1012     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
1013     // Mask = T2 > T1
1014     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
1015     if (opcode == Op_UMaxV) {
1016       // Res = Mask ? Src2 : Src1
1017       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
1018     } else {
1019       // Res = Mask ? Src1 : Src2
1020       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
1021     }
1022   }
1023 }
1024 
1025 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
1026                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
1027   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
1028   if (opcode == Op_UMinV) {
1029     switch(elem_bt) {
1030       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1031       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1032       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1033       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1034       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1035     }
1036   } else {
1037     assert(opcode == Op_UMaxV, "required");
1038     switch(elem_bt) {
1039       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1040       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1041       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1042       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1043       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1044     }
1045   }
1046 }
1047 
1048 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1049                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1050                                  int vlen_enc) {
1051   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1052 
1053   if (opcode == Op_MinV) {
1054     if (elem_bt == T_BYTE) {
1055       vpminsb(dst, src1, src2, vlen_enc);
1056     } else if (elem_bt == T_SHORT) {
1057       vpminsw(dst, src1, src2, vlen_enc);
1058     } else if (elem_bt == T_INT) {
1059       vpminsd(dst, src1, src2, vlen_enc);
1060     } else {
1061       assert(elem_bt == T_LONG, "required");
1062       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1063         vpminsq(dst, src1, src2, vlen_enc);
1064       } else {
1065         assert_different_registers(dst, src1, src2);
1066         vpcmpgtq(dst, src1, src2, vlen_enc);
1067         vblendvpd(dst, src1, src2, dst, vlen_enc);
1068       }
1069     }
1070   } else { // opcode == Op_MaxV
1071     if (elem_bt == T_BYTE) {
1072       vpmaxsb(dst, src1, src2, vlen_enc);
1073     } else if (elem_bt == T_SHORT) {
1074       vpmaxsw(dst, src1, src2, vlen_enc);
1075     } else if (elem_bt == T_INT) {
1076       vpmaxsd(dst, src1, src2, vlen_enc);
1077     } else {
1078       assert(elem_bt == T_LONG, "required");
1079       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1080         vpmaxsq(dst, src1, src2, vlen_enc);
1081       } else {
1082         assert_different_registers(dst, src1, src2);
1083         vpcmpgtq(dst, src1, src2, vlen_enc);
1084         vblendvpd(dst, src2, src1, dst, vlen_enc);
1085       }
1086     }
1087   }
1088 }
1089 
1090 // Float/Double min max
1091 
1092 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1093                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1094                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1095                                    int vlen_enc) {
1096   assert(UseAVX > 0, "required");
1097   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1098          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1099   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1100   assert_different_registers(a, tmp, atmp, btmp);
1101   assert_different_registers(b, tmp, atmp, btmp);
1102 
1103   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1104   bool is_double_word = is_double_word_type(elem_bt);
1105 
1106   /* Note on 'non-obvious' assembly sequence:
1107    *
1108    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1109    * and Java on how they handle floats:
1110    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1111    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1112    *
1113    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1114    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1115    *                (only useful when signs differ, noop otherwise)
1116    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1117    *
1118    *  The following pseudo code describes the algorithm for max[FD] (the min algorithm follows similar lines; see the sketch after this comment):
1119    *   btmp = (b < +0.0) ? a : b
1120    *   atmp = (b < +0.0) ? b : a
1121    *   Tmp  = Max_Float(atmp , btmp)
1122    *   Res  = (atmp == NaN) ? atmp : Tmp
1123    */
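       // For reference, the corresponding min[FD] selection (as realized by the
       // mask/blend choices below) is, in the same pseudo code:
       //   btmp = (a < +0.0) ? a : b
       //   atmp = (a < +0.0) ? b : a
       //   Tmp  = Min_Float(atmp , btmp)
       //   Res  = (atmp == NaN) ? atmp : Tmp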
1124 
1125   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1126   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1127   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1128   XMMRegister mask;
1129 
1130   if (!is_double_word && is_min) {
1131     mask = a;
1132     vblend = &MacroAssembler::vblendvps;
1133     vmaxmin = &MacroAssembler::vminps;
1134     vcmp = &MacroAssembler::vcmpps;
1135   } else if (!is_double_word && !is_min) {
1136     mask = b;
1137     vblend = &MacroAssembler::vblendvps;
1138     vmaxmin = &MacroAssembler::vmaxps;
1139     vcmp = &MacroAssembler::vcmpps;
1140   } else if (is_double_word && is_min) {
1141     mask = a;
1142     vblend = &MacroAssembler::vblendvpd;
1143     vmaxmin = &MacroAssembler::vminpd;
1144     vcmp = &MacroAssembler::vcmppd;
1145   } else {
1146     assert(is_double_word && !is_min, "sanity");
1147     mask = b;
1148     vblend = &MacroAssembler::vblendvpd;
1149     vmaxmin = &MacroAssembler::vmaxpd;
1150     vcmp = &MacroAssembler::vcmppd;
1151   }
1152 
1153   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1154   XMMRegister maxmin, scratch;
1155   if (dst == btmp) {
1156     maxmin = btmp;
1157     scratch = tmp;
1158   } else {
1159     maxmin = tmp;
1160     scratch = btmp;
1161   }
1162 
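       // With EnableX86ECoreOpts, materialize the per-lane blend mask (the sign of
       // 'mask' broadcast across each lane) into tmp up front, so the vblendv helpers
       // below are handed a ready-made mask instead of deriving it themselves.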
1163   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1164   if (precompute_mask && !is_double_word) {
1165     vpsrad(tmp, mask, 32, vlen_enc);
1166     mask = tmp;
1167   } else if (precompute_mask && is_double_word) {
1168     vpxor(tmp, tmp, tmp, vlen_enc);
1169     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1170     mask = tmp;
1171   }
1172 
1173   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1174   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1175   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1176   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1177   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1178 }
1179 
1180 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1181                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1182                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1183                                     int vlen_enc) {
1184   assert(UseAVX > 2, "required");
1185   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1186          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1187   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1188   assert_different_registers(dst, a, atmp, btmp);
1189   assert_different_registers(dst, b, atmp, btmp);
1190 
1191   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1192   bool is_double_word = is_double_word_type(elem_bt);
1193   bool merge = true;
1194 
1195   if (!is_double_word && is_min) {
1196     evpmovd2m(ktmp, a, vlen_enc);
1197     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1198     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1199     vminps(dst, atmp, btmp, vlen_enc);
1200     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1201     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1202   } else if (!is_double_word && !is_min) {
1203     evpmovd2m(ktmp, b, vlen_enc);
1204     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1205     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1206     vmaxps(dst, atmp, btmp, vlen_enc);
1207     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1208     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1209   } else if (is_double_word && is_min) {
1210     evpmovq2m(ktmp, a, vlen_enc);
1211     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1212     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1213     vminpd(dst, atmp, btmp, vlen_enc);
1214     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1215     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1216   } else {
1217     assert(is_double_word && !is_min, "sanity");
1218     evpmovq2m(ktmp, b, vlen_enc);
1219     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1220     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1221     vmaxpd(dst, atmp, btmp, vlen_enc);
1222     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1223     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1224   }
1225 }
1226 
1227 // Float/Double signum
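     // On entry dst holds the argument, 'zero' holds 0.0 and 'one' holds 1.0; on exit
     // dst holds the signum: the argument itself for +/-0.0 and NaN, 1.0 for positive
     // inputs, and -1.0 (1.0 with the sign flipped) for negative inputs.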
1228 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1229   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1230 
1231   Label DONE_LABEL;
1232 
1233   if (opcode == Op_SignumF) {
1234     assert(UseSSE > 0, "required");
1235     ucomiss(dst, zero);
1236     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1237     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1238     movflt(dst, one);
1239     jcc(Assembler::above, DONE_LABEL);
1240     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1241   } else if (opcode == Op_SignumD) {
1242     assert(UseSSE > 1, "required");
1243     ucomisd(dst, zero);
1244     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1245     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1246     movdbl(dst, one);
1247     jcc(Assembler::above, DONE_LABEL);
1248     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1249   }
1250 
1251   bind(DONE_LABEL);
1252 }
1253 
1254 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1255   if (sign) {
1256     pmovsxbw(dst, src);
1257   } else {
1258     pmovzxbw(dst, src);
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1263   if (sign) {
1264     vpmovsxbw(dst, src, vector_len);
1265   } else {
1266     vpmovzxbw(dst, src, vector_len);
1267   }
1268 }
1269 
1270 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1271   if (sign) {
1272     vpmovsxbd(dst, src, vector_len);
1273   } else {
1274     vpmovzxbd(dst, src, vector_len);
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1279   if (sign) {
1280     vpmovsxwd(dst, src, vector_len);
1281   } else {
1282     vpmovzxwd(dst, src, vector_len);
1283   }
1284 }
1285 
1286 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1287                                      int shift, int vector_len) {
1288   if (opcode == Op_RotateLeftV) {
1289     if (etype == T_INT) {
1290       evprold(dst, src, shift, vector_len);
1291     } else {
1292       assert(etype == T_LONG, "expected type T_LONG");
1293       evprolq(dst, src, shift, vector_len);
1294     }
1295   } else {
1296     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1297     if (etype == T_INT) {
1298       evprord(dst, src, shift, vector_len);
1299     } else {
1300       assert(etype == T_LONG, "expected type T_LONG");
1301       evprorq(dst, src, shift, vector_len);
1302     }
1303   }
1304 }
1305 
1306 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1307                                      XMMRegister shift, int vector_len) {
1308   if (opcode == Op_RotateLeftV) {
1309     if (etype == T_INT) {
1310       evprolvd(dst, src, shift, vector_len);
1311     } else {
1312       assert(etype == T_LONG, "expected type T_LONG");
1313       evprolvq(dst, src, shift, vector_len);
1314     }
1315   } else {
1316     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1317     if (etype == T_INT) {
1318       evprorvd(dst, src, shift, vector_len);
1319     } else {
1320       assert(etype == T_LONG, "expected type T_LONG");
1321       evprorvq(dst, src, shift, vector_len);
1322     }
1323   }
1324 }
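// Illustrative sketch (comment only, not compiled): per 32-bit lane the rotates
// above compute the usual rotate identity, e.g.
//
//   uint32_t rotl32(uint32_t x, unsigned s) { s &= 31; return (x << s) | (x >> ((32 - s) & 31)); }
//   uint32_t rotr32(uint32_t x, unsigned s) { s &= 31; return (x >> s) | (x << ((32 - s) & 31)); }
//
// evprold/evprolq and evprord/evprorq do this in a single instruction per
// T_INT/T_LONG lane; the _var flavour takes the rotate count from a vector lane.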
1325 
1326 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1327   if (opcode == Op_RShiftVI) {
1328     psrad(dst, shift);
1329   } else if (opcode == Op_LShiftVI) {
1330     pslld(dst, shift);
1331   } else {
1332     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1333     psrld(dst, shift);
1334   }
1335 }
1336 
1337 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1338   switch (opcode) {
1339     case Op_RShiftVI:  psrad(dst, shift); break;
1340     case Op_LShiftVI:  pslld(dst, shift); break;
1341     case Op_URShiftVI: psrld(dst, shift); break;
1342 
1343     default: assert(false, "%s", NodeClassNames[opcode]);
1344   }
1345 }
1346 
1347 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1348   if (opcode == Op_RShiftVI) {
1349     vpsrad(dst, nds, shift, vector_len);
1350   } else if (opcode == Op_LShiftVI) {
1351     vpslld(dst, nds, shift, vector_len);
1352   } else {
1353     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1354     vpsrld(dst, nds, shift, vector_len);
1355   }
1356 }
1357 
1358 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1359   switch (opcode) {
1360     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1361     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1362     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1363 
1364     default: assert(false, "%s", NodeClassNames[opcode]);
1365   }
1366 }
1367 
1368 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1369   switch (opcode) {
1370     case Op_RShiftVB:  // fall-through
1371     case Op_RShiftVS:  psraw(dst, shift); break;
1372 
1373     case Op_LShiftVB:  // fall-through
1374     case Op_LShiftVS:  psllw(dst, shift);   break;
1375 
1376     case Op_URShiftVS: // fall-through
1377     case Op_URShiftVB: psrlw(dst, shift);  break;
1378 
1379     default: assert(false, "%s", NodeClassNames[opcode]);
1380   }
1381 }
1382 
1383 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1384   switch (opcode) {
1385     case Op_RShiftVB:  // fall-through
1386     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1387 
1388     case Op_LShiftVB:  // fall-through
1389     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1390 
1391     case Op_URShiftVS: // fall-through
1392     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1393 
1394     default: assert(false, "%s", NodeClassNames[opcode]);
1395   }
1396 }
1397 
1398 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1399   switch (opcode) {
1400     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1401     case Op_LShiftVL:  psllq(dst, shift); break;
1402     case Op_URShiftVL: psrlq(dst, shift); break;
1403 
1404     default: assert(false, "%s", NodeClassNames[opcode]);
1405   }
1406 }
1407 
1408 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1409   if (opcode == Op_RShiftVL) {
1410     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1411   } else if (opcode == Op_LShiftVL) {
1412     psllq(dst, shift);
1413   } else {
1414     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1415     psrlq(dst, shift);
1416   }
1417 }
1418 
1419 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1420   switch (opcode) {
1421     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1422     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1423     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1424 
1425     default: assert(false, "%s", NodeClassNames[opcode]);
1426   }
1427 }
1428 
1429 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1430   if (opcode == Op_RShiftVL) {
1431     evpsraq(dst, nds, shift, vector_len);
1432   } else if (opcode == Op_LShiftVL) {
1433     vpsllq(dst, nds, shift, vector_len);
1434   } else {
1435     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1436     vpsrlq(dst, nds, shift, vector_len);
1437   }
1438 }
1439 
1440 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1441   switch (opcode) {
1442     case Op_RShiftVB:  // fall-through
1443     case Op_RShiftVS:  // fall-through
1444     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1445 
1446     case Op_LShiftVB:  // fall-through
1447     case Op_LShiftVS:  // fall-through
1448     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1449 
1450     case Op_URShiftVB: // fall-through
1451     case Op_URShiftVS: // fall-through
1452     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1453 
1454     default: assert(false, "%s", NodeClassNames[opcode]);
1455   }
1456 }
1457 
1458 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1459   switch (opcode) {
1460     case Op_RShiftVB:  // fall-through
1461     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1462 
1463     case Op_LShiftVB:  // fall-through
1464     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1465 
1466     case Op_URShiftVB: // fall-through
1467     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1468 
1469     default: assert(false, "%s", NodeClassNames[opcode]);
1470   }
1471 }
1472 
1473 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1474   assert(UseAVX >= 2, "required");
1475   switch (opcode) {
1476     case Op_RShiftVL: {
1477       if (UseAVX > 2) {
1478         assert(tmp == xnoreg, "not used");
1479         if (!VM_Version::supports_avx512vl()) {
1480           vlen_enc = Assembler::AVX_512bit;
1481         }
1482         evpsravq(dst, src, shift, vlen_enc);
1483       } else {
1484         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1485         vpsrlvq(dst, src, shift, vlen_enc);
1486         vpsrlvq(tmp, tmp, shift, vlen_enc);
1487         vpxor(dst, dst, tmp, vlen_enc);
1488         vpsubq(dst, dst, tmp, vlen_enc);
1489       }
1490       break;
1491     }
1492     case Op_LShiftVL: {
1493       assert(tmp == xnoreg, "not used");
1494       vpsllvq(dst, src, shift, vlen_enc);
1495       break;
1496     }
1497     case Op_URShiftVL: {
1498       assert(tmp == xnoreg, "not used");
1499       vpsrlvq(dst, src, shift, vlen_enc);
1500       break;
1501     }
1502     default: assert(false, "%s", NodeClassNames[opcode]);
1503   }
1504 }
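// Illustrative sketch (comment only, not compiled): AVX2 has no variable arithmetic
// right shift for 64-bit lanes, so the Op_RShiftVL fallback above uses the identity
//
//   sra(x, s) == ((x >>> s) ^ m) - m,  with m = (0x8000000000000000ULL >>> s)
//
// assuming vector_long_sign_mask() holds 0x8000000000000000 in every lane. Per lane:
//
//   uint64_t m = 0x8000000000000000ULL >> s;              // shifted sign bit (tmp after vpsrlvq)
//   int64_t  r = (int64_t)((((uint64_t)x >> s) ^ m) - m); // vpxor followed by vpsubq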
1505 
1506 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1507 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1508   assert(opcode == Op_LShiftVB ||
1509          opcode == Op_RShiftVB ||
1510          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1511   bool sign = (opcode != Op_URShiftVB);
1512   assert(vector_len == 0, "required");
1513   vextendbd(sign, dst, src, 1);
1514   vpmovzxbd(vtmp, shift, 1);
1515   varshiftd(opcode, dst, dst, vtmp, 1);
1516   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1517   vextracti128_high(vtmp, dst);
1518   vpackusdw(dst, dst, vtmp, 0);
1519 }
1520 
1521 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1522 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1523   assert(opcode == Op_LShiftVB ||
1524          opcode == Op_RShiftVB ||
1525          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1526   bool sign = (opcode != Op_URShiftVB);
1527   int ext_vector_len = vector_len + 1;
1528   vextendbw(sign, dst, src, ext_vector_len);
1529   vpmovzxbw(vtmp, shift, ext_vector_len);
1530   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1531   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1532   if (vector_len == 0) {
1533     vextracti128_high(vtmp, dst);
1534     vpackuswb(dst, dst, vtmp, vector_len);
1535   } else {
1536     vextracti64x4_high(vtmp, dst);
1537     vpackuswb(dst, dst, vtmp, vector_len);
1538     vpermq(dst, dst, 0xD8, vector_len);
1539   }
1540 }
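// Illustrative sketch (comment only, not compiled; helper names are pseudocode):
// both subword shift helpers above use the same widen/shift/narrow pattern:
//
//   for (int i = 0; i < lanes; i++) {
//     int wide = sign ? sign_extend(src[i]) : zero_extend(src[i]); // widen the byte lane
//     wide     = shift_op(wide, shift[i]);                         // per-lane shift count
//     dst[i]   = wide & 0xFF;                                      // keep the byte-sized result
//   }
//
// varshiftbw widens bytes to dwords and packs back to words; evarshiftb widens to
// words and packs back to bytes (with a vpermq to restore lane order for vectors
// wider than 128 bits).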
1541 
1542 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1543   switch(typ) {
1544     case T_BYTE:
1545       pinsrb(dst, val, idx);
1546       break;
1547     case T_SHORT:
1548       pinsrw(dst, val, idx);
1549       break;
1550     case T_INT:
1551       pinsrd(dst, val, idx);
1552       break;
1553     case T_LONG:
1554       pinsrq(dst, val, idx);
1555       break;
1556     default:
1557       assert(false,"Should not reach here.");
1558       break;
1559   }
1560 }
1561 
1562 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1563   switch(typ) {
1564     case T_BYTE:
1565       vpinsrb(dst, src, val, idx);
1566       break;
1567     case T_SHORT:
1568       vpinsrw(dst, src, val, idx);
1569       break;
1570     case T_INT:
1571       vpinsrd(dst, src, val, idx);
1572       break;
1573     case T_LONG:
1574       vpinsrq(dst, src, val, idx);
1575       break;
1576     default:
1577       assert(false,"Should not reach here.");
1578       break;
1579   }
1580 }
1581 
1582 #ifdef _LP64
1583 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1584                                                 XMMRegister dst, Register base,
1585                                                 Register idx_base,
1586                                                 Register offset, Register mask,
1587                                                 Register mask_idx, Register rtmp,
1588                                                 int vlen_enc) {
1589   vpxor(dst, dst, dst, vlen_enc);
1590   if (elem_bt == T_SHORT) {
1591     for (int i = 0; i < 4; i++) {
1592       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1593       Label skip_load;
1594       btq(mask, mask_idx);
1595       jccb(Assembler::carryClear, skip_load);
1596       movl(rtmp, Address(idx_base, i * 4));
1597       if (offset != noreg) {
1598         addl(rtmp, offset);
1599       }
1600       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1601       bind(skip_load);
1602       incq(mask_idx);
1603     }
1604   } else {
1605     assert(elem_bt == T_BYTE, "");
1606     for (int i = 0; i < 8; i++) {
1607       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1608       Label skip_load;
1609       btq(mask, mask_idx);
1610       jccb(Assembler::carryClear, skip_load);
1611       movl(rtmp, Address(idx_base, i * 4));
1612       if (offset != noreg) {
1613         addl(rtmp, offset);
1614       }
1615       pinsrb(dst, Address(base, rtmp), i);
1616       bind(skip_load);
1617       incq(mask_idx);
1618     }
1619   }
1620 }
1621 #endif // _LP64
1622 
1623 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1624                                          Register base, Register idx_base,
1625                                          Register offset, Register rtmp,
1626                                          int vlen_enc) {
1627   vpxor(dst, dst, dst, vlen_enc);
1628   if (elem_bt == T_SHORT) {
1629     for (int i = 0; i < 4; i++) {
1630       // dst[i] = src[offset + idx_base[i]]
1631       movl(rtmp, Address(idx_base, i * 4));
1632       if (offset != noreg) {
1633         addl(rtmp, offset);
1634       }
1635       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1636     }
1637   } else {
1638     assert(elem_bt == T_BYTE, "");
1639     for (int i = 0; i < 8; i++) {
1640       // dst[i] = src[offset + idx_base[i]]
1641       movl(rtmp, Address(idx_base, i * 4));
1642       if (offset != noreg) {
1643         addl(rtmp, offset);
1644       }
1645       pinsrb(dst, Address(base, rtmp), i);
1646     }
1647   }
1648 }
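// Illustrative sketch (comment only, not compiled; helper names are pseudocode):
// the two 8-byte gather helpers above fill one 64-bit slice with a scalar loop:
//
//   for (int i = 0; i < lanes; i++) {                        // 8 byte lanes or 4 short lanes
//     if (masked && !test_bit(mask, mask_idx++)) continue;   // masked-off lanes stay zero
//     int idx = idx_base[i];                                 // gather indices are 32-bit ints
//     if (has_offset) idx += offset;
//     dst.lane(i) = load(base + idx * lane_size);            // pinsrb / pinsrw
//   }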
1649 
1650 /*
1651  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1652  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1653  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1654  * permutation that places the slice into the appropriate vector lane
1655  * locations in the destination vector. The following pseudo code describes the
1656  * algorithm in detail:
1657  *
1658  * DST_VEC = ZERO_VEC
1659  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1660  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1661  * FOREACH_ITER:
1662  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1663  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1664  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1665  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1666  *
1667  * With each iteration, the doubleword permute indices (0, 1) corresponding
1668  * to the gathered quadword are shifted right by two lane positions.
1669  *
1670  */
1671 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1672                                         Register base, Register idx_base,
1673                                         Register offset, Register mask,
1674                                         XMMRegister xtmp1, XMMRegister xtmp2,
1675                                         XMMRegister temp_dst, Register rtmp,
1676                                         Register mask_idx, Register length,
1677                                         int vector_len, int vlen_enc) {
1678   Label GATHER8_LOOP;
1679   assert(is_subword_type(elem_ty), "");
1680   movl(length, vector_len);
1681   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1682   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1683   vallones(xtmp2, vlen_enc);
1684   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1685   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1686   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1687 
1688   bind(GATHER8_LOOP);
1689     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1690     if (mask == noreg) {
1691       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1692     } else {
1693       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1694     }
1695     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1696     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1697     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1698     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1699     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1700     vpor(dst, dst, temp_dst, vlen_enc);
1701     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1702     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1703     jcc(Assembler::notEqual, GATHER8_LOOP);
1704 }
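// Illustrative sketch (comment only, not compiled; helper names are pseudocode):
// each GATHER8_LOOP pass consumes one 64-bit slice of gathered elements:
//
//   remaining = vector_len;
//   do {
//     slice       = gather8b(base, idx_base, offset, mask);  // 8 bytes or 4 shorts -> 64 bits
//     dst        |= permute(slice, perm_index);              // slice lands two dword lanes higher each pass
//     perm_index -= 2;
//     idx_base   += is_byte ? 8 : 4;                         // 32 >> (esize - 1) bytes of 32-bit indices
//     remaining  -= is_byte ? 8 : 4;                         //  8 >> (esize - 1) elements per pass
//   } while (remaining != 0);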
1705 
1706 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1707   switch(typ) {
1708     case T_INT:
1709       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1710       break;
1711     case T_FLOAT:
1712       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1713       break;
1714     case T_LONG:
1715       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1716       break;
1717     case T_DOUBLE:
1718       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1719       break;
1720     default:
1721       assert(false,"Should not reach here.");
1722       break;
1723   }
1724 }
1725 
1726 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1727   switch(typ) {
1728     case T_INT:
1729       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1730       break;
1731     case T_FLOAT:
1732       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1733       break;
1734     case T_LONG:
1735       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1736       break;
1737     case T_DOUBLE:
1738       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1739       break;
1740     default:
1741       assert(false,"Should not reach here.");
1742       break;
1743   }
1744 }
1745 
1746 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1747   switch(typ) {
1748     case T_INT:
1749       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1750       break;
1751     case T_FLOAT:
1752       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1753       break;
1754     case T_LONG:
1755       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1756       break;
1757     case T_DOUBLE:
1758       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1759       break;
1760     default:
1761       assert(false,"Should not reach here.");
1762       break;
1763   }
1764 }
1765 
1766 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1767   if (vlen_in_bytes <= 16) {
1768     pxor (dst, dst);
1769     psubb(dst, src);
1770     switch (elem_bt) {
1771       case T_BYTE:   /* nothing to do */ break;
1772       case T_SHORT:  pmovsxbw(dst, dst); break;
1773       case T_INT:    pmovsxbd(dst, dst); break;
1774       case T_FLOAT:  pmovsxbd(dst, dst); break;
1775       case T_LONG:   pmovsxbq(dst, dst); break;
1776       case T_DOUBLE: pmovsxbq(dst, dst); break;
1777 
1778       default: assert(false, "%s", type2name(elem_bt));
1779     }
1780   } else {
1781     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1782     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1783 
1784     vpxor (dst, dst, dst, vlen_enc);
1785     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1786 
1787     switch (elem_bt) {
1788       case T_BYTE:   /* nothing to do */            break;
1789       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1790       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1791       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1792       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1793       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1794 
1795       default: assert(false, "%s", type2name(elem_bt));
1796     }
1797   }
1798 }
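// Illustrative sketch (comment only, not compiled): the boolean source holds 0 or 1
// per byte, so pxor/psubb computes 0 - src, mapping 1 to 0xFF and leaving 0 as 0x00;
// the pmovsx then sign-extends that all-ones/all-zeros byte to the element width:
//
//   int8_t  b    = src_byte;          // 0 or 1
//   int8_t  m8   = (int8_t)(0 - b);   // 0x00 or 0xFF
//   int64_t mask = (int64_t)m8;       // widened to 16/32/64 bits as required by elem_bt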
1799 
1800 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1801   if (novlbwdq) {
1802     vpmovsxbd(xtmp, src, vlen_enc);
1803     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1804             Assembler::eq, true, vlen_enc, noreg);
1805   } else {
1806     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1807     vpsubb(xtmp, xtmp, src, vlen_enc);
1808     evpmovb2m(dst, xtmp, vlen_enc);
1809   }
1810 }
1811 
1812 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1813   switch (vlen_in_bytes) {
1814     case 4:  movdl(dst, src);   break;
1815     case 8:  movq(dst, src);    break;
1816     case 16: movdqu(dst, src);  break;
1817     case 32: vmovdqu(dst, src); break;
1818     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1819     default: ShouldNotReachHere();
1820   }
1821 }
1822 
1823 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1824   assert(rscratch != noreg || always_reachable(src), "missing");
1825 
1826   if (reachable(src)) {
1827     load_vector(dst, as_Address(src), vlen_in_bytes);
1828   } else {
1829     lea(rscratch, src);
1830     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1831   }
1832 }
1833 
1834 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1835   int vlen_enc = vector_length_encoding(vlen);
1836   if (VM_Version::supports_avx()) {
1837     if (bt == T_LONG) {
1838       if (VM_Version::supports_avx2()) {
1839         vpbroadcastq(dst, src, vlen_enc);
1840       } else {
1841         vmovddup(dst, src, vlen_enc);
1842       }
1843     } else if (bt == T_DOUBLE) {
1844       if (vlen_enc != Assembler::AVX_128bit) {
1845         vbroadcastsd(dst, src, vlen_enc, noreg);
1846       } else {
1847         vmovddup(dst, src, vlen_enc);
1848       }
1849     } else {
1850       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1851         vpbroadcastd(dst, src, vlen_enc);
1852       } else {
1853         vbroadcastss(dst, src, vlen_enc);
1854       }
1855     }
1856   } else if (VM_Version::supports_sse3()) {
1857     movddup(dst, src);
1858   } else {
1859     movq(dst, src);
1860     if (vlen == 16) {
1861       punpcklqdq(dst, dst);
1862     }
1863   }
1864 }
1865 
1866 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1867   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two consecutive types is 64 bytes.
1868   int offset = exact_log2(type2aelembytes(bt)) << 6;
1869   if (is_floating_point_type(bt)) {
1870     offset += 128;
1871   }
1872   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1873   load_vector(dst, addr, vlen_in_bytes);
1874 }
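// Illustrative worked example (comment only): T_BYTE -> exact_log2(1) << 6 == 0,
// T_SHORT -> 64, T_INT -> 128, T_LONG -> 192; the floating point types add a further
// 128 to the offset of the same-sized integral type, giving T_FLOAT -> 256 and
// T_DOUBLE -> 320, which matches the B/S/I/L/F/D layout with 64 bytes per type.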
1875 
1876 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1877 
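// Illustrative sketch (comment only, not compiled; helper names are pseudocode):
// the reduceNB/NS/NI/NL helpers below all follow the same halving pattern:
//
//   while (n > 1) {                        // n = number of lanes
//     hi  = upper_half(vec);               // vextracti64x4 / vextracti128 / pshufd
//     vec = op(lower_half(vec), hi);       // reduce_operation_128/256
//     n  /= 2;
//   }
//   result = op(lane0(vec), src1);         // fold in the incoming scalar value
//
// (The Op_AddReductionVI paths use phaddd/phaddw to combine adjacent lanes instead
// of an explicit shuffle, but the halving structure is the same.)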
1878 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1879   int vector_len = Assembler::AVX_128bit;
1880 
1881   switch (opcode) {
1882     case Op_AndReductionV:  pand(dst, src); break;
1883     case Op_OrReductionV:   por (dst, src); break;
1884     case Op_XorReductionV:  pxor(dst, src); break;
1885     case Op_MinReductionV:
1886       switch (typ) {
1887         case T_BYTE:        pminsb(dst, src); break;
1888         case T_SHORT:       pminsw(dst, src); break;
1889         case T_INT:         pminsd(dst, src); break;
1890         case T_LONG:        assert(UseAVX > 2, "required");
1891                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1892         default:            assert(false, "wrong type");
1893       }
1894       break;
1895     case Op_MaxReductionV:
1896       switch (typ) {
1897         case T_BYTE:        pmaxsb(dst, src); break;
1898         case T_SHORT:       pmaxsw(dst, src); break;
1899         case T_INT:         pmaxsd(dst, src); break;
1900         case T_LONG:        assert(UseAVX > 2, "required");
1901                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1902         default:            assert(false, "wrong type");
1903       }
1904       break;
1905     case Op_AddReductionVF: addss(dst, src); break;
1906     case Op_AddReductionVD: addsd(dst, src); break;
1907     case Op_AddReductionVI:
1908       switch (typ) {
1909         case T_BYTE:        paddb(dst, src); break;
1910         case T_SHORT:       paddw(dst, src); break;
1911         case T_INT:         paddd(dst, src); break;
1912         default:            assert(false, "wrong type");
1913       }
1914       break;
1915     case Op_AddReductionVL: paddq(dst, src); break;
1916     case Op_MulReductionVF: mulss(dst, src); break;
1917     case Op_MulReductionVD: mulsd(dst, src); break;
1918     case Op_MulReductionVI:
1919       switch (typ) {
1920         case T_SHORT:       pmullw(dst, src); break;
1921         case T_INT:         pmulld(dst, src); break;
1922         default:            assert(false, "wrong type");
1923       }
1924       break;
1925     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1926                             evpmullq(dst, dst, src, vector_len); break;
1927     default:                assert(false, "wrong opcode");
1928   }
1929 }
1930 
1931 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1932   switch (opcode) {
1933     case Op_AddReductionVF: addps(dst, src); break;
1934     case Op_AddReductionVD: addpd(dst, src); break;
1935     case Op_MulReductionVF: mulps(dst, src); break;
1936     case Op_MulReductionVD: mulpd(dst, src); break;
1937     default:                assert(false, "%s", NodeClassNames[opcode]);
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1942   int vector_len = Assembler::AVX_256bit;
1943 
1944   switch (opcode) {
1945     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1946     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1947     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1948     case Op_MinReductionV:
1949       switch (typ) {
1950         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1951         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1952         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1953         case T_LONG:        assert(UseAVX > 2, "required");
1954                             vpminsq(dst, src1, src2, vector_len); break;
1955         default:            assert(false, "wrong type");
1956       }
1957       break;
1958     case Op_MaxReductionV:
1959       switch (typ) {
1960         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1961         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1962         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1963         case T_LONG:        assert(UseAVX > 2, "required");
1964                             vpmaxsq(dst, src1, src2, vector_len); break;
1965         default:            assert(false, "wrong type");
1966       }
1967       break;
1968     case Op_AddReductionVI:
1969       switch (typ) {
1970         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1971         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1972         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1973         default:            assert(false, "wrong type");
1974       }
1975       break;
1976     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1977     case Op_MulReductionVI:
1978       switch (typ) {
1979         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1980         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1981         default:            assert(false, "wrong type");
1982       }
1983       break;
1984     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1985     default:                assert(false, "wrong opcode");
1986   }
1987 }
1988 
1989 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1990   int vector_len = Assembler::AVX_256bit;
1991 
1992   switch (opcode) {
1993     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1994     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1995     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1996     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1997     default:                assert(false, "%s", NodeClassNames[opcode]);
1998   }
1999 }
2000 
2001 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2002                                   XMMRegister dst, XMMRegister src,
2003                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2004   switch (opcode) {
2005     case Op_AddReductionVF:
2006     case Op_MulReductionVF:
2007       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2008       break;
2009 
2010     case Op_AddReductionVD:
2011     case Op_MulReductionVD:
2012       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2013       break;
2014 
2015     default: assert(false, "wrong opcode");
2016   }
2017 }
2018 
2019 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
2020                                             XMMRegister dst, XMMRegister src,
2021                                             XMMRegister vtmp1, XMMRegister vtmp2) {
2022   switch (opcode) {
2023     case Op_AddReductionVF:
2024     case Op_MulReductionVF:
2025       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2026       break;
2027 
2028     case Op_AddReductionVD:
2029     case Op_MulReductionVD:
2030       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2031       break;
2032 
2033     default: assert(false, "%s", NodeClassNames[opcode]);
2034   }
2035 }
2036 
2037 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2038                              Register dst, Register src1, XMMRegister src2,
2039                              XMMRegister vtmp1, XMMRegister vtmp2) {
2040   switch (vlen) {
2041     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2042     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2043     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2044     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2045 
2046     default: assert(false, "wrong vector length");
2047   }
2048 }
2049 
2050 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2051                              Register dst, Register src1, XMMRegister src2,
2052                              XMMRegister vtmp1, XMMRegister vtmp2) {
2053   switch (vlen) {
2054     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2055     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2056     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2057     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2058 
2059     default: assert(false, "wrong vector length");
2060   }
2061 }
2062 
2063 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2064                              Register dst, Register src1, XMMRegister src2,
2065                              XMMRegister vtmp1, XMMRegister vtmp2) {
2066   switch (vlen) {
2067     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2068     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2069     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2070     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2071 
2072     default: assert(false, "wrong vector length");
2073   }
2074 }
2075 
2076 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2077                              Register dst, Register src1, XMMRegister src2,
2078                              XMMRegister vtmp1, XMMRegister vtmp2) {
2079   switch (vlen) {
2080     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2081     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2082     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2083     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2084 
2085     default: assert(false, "wrong vector length");
2086   }
2087 }
2088 
2089 #ifdef _LP64
2090 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2091                              Register dst, Register src1, XMMRegister src2,
2092                              XMMRegister vtmp1, XMMRegister vtmp2) {
2093   switch (vlen) {
2094     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2095     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2096     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2097 
2098     default: assert(false, "wrong vector length");
2099   }
2100 }
2101 #endif // _LP64
2102 
2103 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2104   switch (vlen) {
2105     case 2:
2106       assert(vtmp2 == xnoreg, "");
2107       reduce2F(opcode, dst, src, vtmp1);
2108       break;
2109     case 4:
2110       assert(vtmp2 == xnoreg, "");
2111       reduce4F(opcode, dst, src, vtmp1);
2112       break;
2113     case 8:
2114       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2115       break;
2116     case 16:
2117       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2118       break;
2119     default: assert(false, "wrong vector length");
2120   }
2121 }
2122 
2123 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2124   switch (vlen) {
2125     case 2:
2126       assert(vtmp2 == xnoreg, "");
2127       reduce2D(opcode, dst, src, vtmp1);
2128       break;
2129     case 4:
2130       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2131       break;
2132     case 8:
2133       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2134       break;
2135     default: assert(false, "wrong vector length");
2136   }
2137 }
2138 
2139 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   switch (vlen) {
2141     case 2:
2142       assert(vtmp1 == xnoreg, "");
2143       assert(vtmp2 == xnoreg, "");
2144       unorderedReduce2F(opcode, dst, src);
2145       break;
2146     case 4:
2147       assert(vtmp2 == xnoreg, "");
2148       unorderedReduce4F(opcode, dst, src, vtmp1);
2149       break;
2150     case 8:
2151       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2152       break;
2153     case 16:
2154       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2155       break;
2156     default: assert(false, "wrong vector length");
2157   }
2158 }
2159 
2160 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2161   switch (vlen) {
2162     case 2:
2163       assert(vtmp1 == xnoreg, "");
2164       assert(vtmp2 == xnoreg, "");
2165       unorderedReduce2D(opcode, dst, src);
2166       break;
2167     case 4:
2168       assert(vtmp2 == xnoreg, "");
2169       unorderedReduce4D(opcode, dst, src, vtmp1);
2170       break;
2171     case 8:
2172       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2173       break;
2174     default: assert(false, "wrong vector length");
2175   }
2176 }
2177 
2178 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2179   if (opcode == Op_AddReductionVI) {
2180     if (vtmp1 != src2) {
2181       movdqu(vtmp1, src2);
2182     }
2183     phaddd(vtmp1, vtmp1);
2184   } else {
2185     pshufd(vtmp1, src2, 0x1);
2186     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2187   }
2188   movdl(vtmp2, src1);
2189   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2190   movdl(dst, vtmp1);
2191 }
2192 
2193 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2194   if (opcode == Op_AddReductionVI) {
2195     if (vtmp1 != src2) {
2196       movdqu(vtmp1, src2);
2197     }
2198     phaddd(vtmp1, src2);
2199     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2200   } else {
2201     pshufd(vtmp2, src2, 0xE);
2202     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2203     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2204   }
2205 }
2206 
2207 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2208   if (opcode == Op_AddReductionVI) {
2209     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2210     vextracti128_high(vtmp2, vtmp1);
2211     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2212     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2213   } else {
2214     vextracti128_high(vtmp1, src2);
2215     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2216     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2217   }
2218 }
2219 
2220 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2221   vextracti64x4_high(vtmp2, src2);
2222   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2223   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2224 }
2225 
2226 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2227   pshufd(vtmp2, src2, 0x1);
2228   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2229   movdqu(vtmp1, vtmp2);
2230   psrldq(vtmp1, 2);
2231   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2232   movdqu(vtmp2, vtmp1);
2233   psrldq(vtmp2, 1);
2234   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2235   movdl(vtmp2, src1);
2236   pmovsxbd(vtmp1, vtmp1);
2237   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2238   pextrb(dst, vtmp1, 0x0);
2239   movsbl(dst, dst);
2240 }
2241 
2242 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2243   pshufd(vtmp1, src2, 0xE);
2244   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2245   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2246 }
2247 
2248 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2249   vextracti128_high(vtmp2, src2);
2250   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2251   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2252 }
2253 
2254 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255   vextracti64x4_high(vtmp1, src2);
2256   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2257   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2258 }
2259 
2260 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2261   pmovsxbw(vtmp2, src2);
2262   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2263 }
2264 
2265 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2266   if (UseAVX > 1) {
2267     int vector_len = Assembler::AVX_256bit;
2268     vpmovsxbw(vtmp1, src2, vector_len);
2269     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2270   } else {
2271     pmovsxbw(vtmp2, src2);
2272     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2273     pshufd(vtmp2, src2, 0x1);
2274     pmovsxbw(vtmp2, src2);
2275     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2276   }
2277 }
2278 
2279 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2280   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2281     int vector_len = Assembler::AVX_512bit;
2282     vpmovsxbw(vtmp1, src2, vector_len);
2283     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2284   } else {
2285     assert(UseAVX >= 2,"Should not reach here.");
2286     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2287     vextracti128_high(vtmp2, src2);
2288     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2289   }
2290 }
2291 
2292 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2293   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2294   vextracti64x4_high(vtmp2, src2);
2295   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2296 }
2297 
2298 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2299   if (opcode == Op_AddReductionVI) {
2300     if (vtmp1 != src2) {
2301       movdqu(vtmp1, src2);
2302     }
2303     phaddw(vtmp1, vtmp1);
2304     phaddw(vtmp1, vtmp1);
2305   } else {
2306     pshufd(vtmp2, src2, 0x1);
2307     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2308     movdqu(vtmp1, vtmp2);
2309     psrldq(vtmp1, 2);
2310     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2311   }
2312   movdl(vtmp2, src1);
2313   pmovsxwd(vtmp1, vtmp1);
2314   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2315   pextrw(dst, vtmp1, 0x0);
2316   movswl(dst, dst);
2317 }
2318 
2319 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   if (opcode == Op_AddReductionVI) {
2321     if (vtmp1 != src2) {
2322       movdqu(vtmp1, src2);
2323     }
2324     phaddw(vtmp1, src2);
2325   } else {
2326     pshufd(vtmp1, src2, 0xE);
2327     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2328   }
2329   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2330 }
2331 
2332 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2333   if (opcode == Op_AddReductionVI) {
2334     int vector_len = Assembler::AVX_256bit;
2335     vphaddw(vtmp2, src2, src2, vector_len);
2336     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2337   } else {
2338     vextracti128_high(vtmp2, src2);
2339     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2340   }
2341   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2342 }
2343 
2344 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2345   int vector_len = Assembler::AVX_256bit;
2346   vextracti64x4_high(vtmp1, src2);
2347   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2348   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2349 }
2350 
2351 #ifdef _LP64
2352 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2353   pshufd(vtmp2, src2, 0xE);
2354   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2355   movdq(vtmp1, src1);
2356   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2357   movdq(dst, vtmp1);
2358 }
2359 
2360 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2361   vextracti128_high(vtmp1, src2);
2362   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2363   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2364 }
2365 
2366 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2367   vextracti64x4_high(vtmp2, src2);
2368   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2369   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2370 }
2371 
2372 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2373   mov64(temp, -1L);
2374   bzhiq(temp, temp, len);
2375   kmovql(dst, temp);
2376 }
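// Illustrative sketch (comment only, not compiled): for len in [0, 64) the bzhiq
// above yields the low-bit mask
//
//   uint64_t mask = (len == 0) ? 0 : (~0ULL >> (64 - len));   // == (1ULL << len) - 1
//
// which is then moved into the opmask register with kmovql.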
2377 #endif // _LP64
2378 
2379 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2380   reduce_operation_128(T_FLOAT, opcode, dst, src);
2381   pshufd(vtmp, src, 0x1);
2382   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2383 }
2384 
2385 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2386   reduce2F(opcode, dst, src, vtmp);
2387   pshufd(vtmp, src, 0x2);
2388   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2389   pshufd(vtmp, src, 0x3);
2390   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2391 }
2392 
2393 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2394   reduce4F(opcode, dst, src, vtmp2);
2395   vextractf128_high(vtmp2, src);
2396   reduce4F(opcode, dst, vtmp2, vtmp1);
2397 }
2398 
2399 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2400   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2401   vextracti64x4_high(vtmp1, src);
2402   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2403 }
2404 
2405 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2406   pshufd(dst, src, 0x1);
2407   reduce_operation_128(T_FLOAT, opcode, dst, src);
2408 }
2409 
2410 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2411   pshufd(vtmp, src, 0xE);
2412   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2413   unorderedReduce2F(opcode, dst, vtmp);
2414 }
2415 
2416 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2417   vextractf128_high(vtmp1, src);
2418   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2419   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2420 }
2421 
2422 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2423   vextractf64x4_high(vtmp2, src);
2424   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2425   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2426 }
2427 
2428 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2429   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2430   pshufd(vtmp, src, 0xE);
2431   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2432 }
2433 
2434 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2435   reduce2D(opcode, dst, src, vtmp2);
2436   vextractf128_high(vtmp2, src);
2437   reduce2D(opcode, dst, vtmp2, vtmp1);
2438 }
2439 
2440 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2441   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2442   vextracti64x4_high(vtmp1, src);
2443   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2444 }
2445 
2446 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2447   pshufd(dst, src, 0xE);
2448   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2449 }
2450 
2451 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2452   vextractf128_high(vtmp, src);
2453   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2454   unorderedReduce2D(opcode, dst, vtmp);
2455 }
2456 
2457 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2458   vextractf64x4_high(vtmp2, src);
2459   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2460   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2461 }
2462 
2463 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2464   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2465 }
2466 
2467 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2468   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2469 }
2470 
2471 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2472   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2473 }
2474 
2475 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2476                                  int vec_enc) {
2477   switch(elem_bt) {
2478     case T_INT:
2479     case T_FLOAT:
2480       vmaskmovps(dst, src, mask, vec_enc);
2481       break;
2482     case T_LONG:
2483     case T_DOUBLE:
2484       vmaskmovpd(dst, src, mask, vec_enc);
2485       break;
2486     default:
2487       fatal("Unsupported type %s", type2name(elem_bt));
2488       break;
2489   }
2490 }
2491 
2492 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2493                                  int vec_enc) {
2494   switch(elem_bt) {
2495     case T_INT:
2496     case T_FLOAT:
2497       vmaskmovps(dst, src, mask, vec_enc);
2498       break;
2499     case T_LONG:
2500     case T_DOUBLE:
2501       vmaskmovpd(dst, src, mask, vec_enc);
2502       break;
2503     default:
2504       fatal("Unsupported type %s", type2name(elem_bt));
2505       break;
2506   }
2507 }
2508 
2509 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2510                                           XMMRegister dst, XMMRegister src,
2511                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2512                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2513   const int permconst[] = {1, 14};
2514   XMMRegister wsrc = src;
2515   XMMRegister wdst = xmm_0;
2516   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2517 
2518   int vlen_enc = Assembler::AVX_128bit;
2519   if (vlen == 16) {
2520     vlen_enc = Assembler::AVX_256bit;
2521   }
2522 
2523   for (int i = log2(vlen) - 1; i >=0; i--) {
2524     if (i == 0 && !is_dst_valid) {
2525       wdst = dst;
2526     }
2527     if (i == 3) {
2528       vextracti64x4_high(wtmp, wsrc);
2529     } else if (i == 2) {
2530       vextracti128_high(wtmp, wsrc);
2531     } else { // i = [0,1]
2532       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2533     }
2534     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2535     wsrc = wdst;
2536     vlen_enc = Assembler::AVX_128bit;
2537   }
2538   if (is_dst_valid) {
2539     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2540   }
2541 }
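// Illustrative sketch (comment only, not compiled; helper names are pseudocode):
// reduceFloatMinMax performs log2(vlen) halving steps, each bringing the upper half
// of the live elements next to the lower half before a NaN/-0.0 aware combine:
//
//   for (int i = log2(vlen) - 1; i >= 0; i--) {
//     wtmp = (i >= 2) ? extract_upper_half(wsrc)        // vextracti64x4 / vextracti128
//                     : in_lane_shuffle(wsrc, permconst[i]);
//     wdst = minmax(wtmp, wsrc);                        // vminmax_fp
//     wsrc = wdst;
//   }
//   if (is_dst_valid) dst = minmax(wdst, dst);          // fold in the incoming value
//
// reduceDoubleMinMax below follows the same scheme with double-sized lanes.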
2542 
2543 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2544                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2545                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2546   XMMRegister wsrc = src;
2547   XMMRegister wdst = xmm_0;
2548   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2549   int vlen_enc = Assembler::AVX_128bit;
2550   if (vlen == 8) {
2551     vlen_enc = Assembler::AVX_256bit;
2552   }
2553   for (int i = log2(vlen) - 1; i >=0; i--) {
2554     if (i == 0 && !is_dst_valid) {
2555       wdst = dst;
2556     }
2557     if (i == 1) {
2558       vextracti128_high(wtmp, wsrc);
2559     } else if (i == 2) {
2560       vextracti64x4_high(wtmp, wsrc);
2561     } else {
2562       assert(i == 0, "%d", i);
2563       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2564     }
2565     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2566     wsrc = wdst;
2567     vlen_enc = Assembler::AVX_128bit;
2568   }
2569   if (is_dst_valid) {
2570     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2571   }
2572 }
2573 
2574 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2575   switch (bt) {
2576     case T_BYTE:  pextrb(dst, src, idx); break;
2577     case T_SHORT: pextrw(dst, src, idx); break;
2578     case T_INT:   pextrd(dst, src, idx); break;
2579     case T_LONG:  pextrq(dst, src, idx); break;
2580 
2581     default:
2582       assert(false,"Should not reach here.");
2583       break;
2584   }
2585 }
2586 
2587 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2588   int esize =  type2aelembytes(typ);
2589   int elem_per_lane = 16/esize;
2590   int lane = elemindex / elem_per_lane;
2591   int eindex = elemindex % elem_per_lane;
2592 
2593   if (lane >= 2) {
2594     assert(UseAVX > 2, "required");
2595     vextractf32x4(dst, src, lane & 3);
2596     return dst;
2597   } else if (lane > 0) {
2598     assert(UseAVX > 0, "required");
2599     vextractf128(dst, src, lane);
2600     return dst;
2601   } else {
2602     return src;
2603   }
2604 }
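// Illustrative worked example (comment only): for T_SHORT (esize == 2) there are
// 8 elements per 128-bit lane, so elemindex 11 maps to lane 1, eindex 3; lane 0 is
// read from src directly, lane 1 needs AVX (vextractf128), and lanes 2-3 need
// AVX-512 (vextractf32x4).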
2605 
2606 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2607   if (typ == T_BYTE) {
2608     movsbl(dst, dst);
2609   } else if (typ == T_SHORT) {
2610     movswl(dst, dst);
2611   }
2612 }
2613 
2614 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2615   int esize =  type2aelembytes(typ);
2616   int elem_per_lane = 16/esize;
2617   int eindex = elemindex % elem_per_lane;
2618   assert(is_integral_type(typ),"required");
2619 
2620   if (eindex == 0) {
2621     if (typ == T_LONG) {
2622       movq(dst, src);
2623     } else {
2624       movdl(dst, src);
2625       movsxl(typ, dst);
2626     }
2627   } else {
2628     extract(typ, dst, src, eindex);
2629     movsxl(typ, dst);
2630   }
2631 }
2632 
2633 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2634   int esize =  type2aelembytes(typ);
2635   int elem_per_lane = 16/esize;
2636   int eindex = elemindex % elem_per_lane;
2637   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2638 
2639   if (eindex == 0) {
2640     movq(dst, src);
2641   } else {
2642     if (typ == T_FLOAT) {
2643       if (UseAVX == 0) {
2644         movdqu(dst, src);
2645         shufps(dst, dst, eindex);
2646       } else {
2647         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2648       }
2649     } else {
2650       if (UseAVX == 0) {
2651         movdqu(dst, src);
2652         psrldq(dst, eindex*esize);
2653       } else {
2654         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2655       }
2656       movq(dst, dst);
2657     }
2658   }
2659   // Zero upper bits
2660   if (typ == T_FLOAT) {
2661     if (UseAVX == 0) {
2662       assert(vtmp != xnoreg, "required.");
2663       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2664       pand(dst, vtmp);
2665     } else {
2666       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2667     }
2668   }
2669 }
2670 
2671 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2672   switch(typ) {
2673     case T_BYTE:
2674     case T_BOOLEAN:
2675       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2676       break;
2677     case T_SHORT:
2678     case T_CHAR:
2679       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2680       break;
2681     case T_INT:
2682     case T_FLOAT:
2683       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2684       break;
2685     case T_LONG:
2686     case T_DOUBLE:
2687       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2688       break;
2689     default:
2690       assert(false,"Should not reach here.");
2691       break;
2692   }
2693 }
2694 
2695 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2696   assert(rscratch != noreg || always_reachable(src2), "missing");
2697 
2698   switch(typ) {
2699     case T_BOOLEAN:
2700     case T_BYTE:
2701       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2702       break;
2703     case T_CHAR:
2704     case T_SHORT:
2705       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2706       break;
2707     case T_INT:
2708     case T_FLOAT:
2709       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2710       break;
2711     case T_LONG:
2712     case T_DOUBLE:
2713       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2714       break;
2715     default:
2716       assert(false,"Should not reach here.");
2717       break;
2718   }
2719 }
2720 
2721 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2722   switch(typ) {
2723     case T_BYTE:
2724       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2725       break;
2726     case T_SHORT:
2727       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2728       break;
2729     case T_INT:
2730     case T_FLOAT:
2731       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2732       break;
2733     case T_LONG:
2734     case T_DOUBLE:
2735       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2736       break;
2737     default:
2738       assert(false,"Should not reach here.");
2739       break;
2740   }
2741 }
2742 
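// Set the condition flags by testing src1 against src2 (ptest/vptest, or
// vtestps for 4- and 8-byte elements). Vectors shorter than 16 bytes are
// first widened by duplicating their low part into vtmp.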
2743 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2744   assert(vlen_in_bytes <= 32, "");
2745   int esize = type2aelembytes(bt);
2746   if (vlen_in_bytes == 32) {
2747     assert(vtmp == xnoreg, "required.");
2748     if (esize >= 4) {
2749       vtestps(src1, src2, AVX_256bit);
2750     } else {
2751       vptest(src1, src2, AVX_256bit);
2752     }
2753     return;
2754   }
2755   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2758     assert(vtmp != xnoreg, "required");
2759     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2760     pshufd(vtmp, src1, shuffle_imm);
2761   } else {
2762     assert(vtmp == xnoreg, "required");
2763     vtmp = src1;
2764   }
2765   if (esize >= 4 && VM_Version::supports_avx()) {
2766     vtestps(vtmp, src2, AVX_128bit);
2767   } else {
2768     ptest(vtmp, src2);
2769   }
2770 }
2771 
2772 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2773 #ifdef ASSERT
2774   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2775   bool is_bw_supported = VM_Version::supports_avx512bw();
2776   if (is_bw && !is_bw_supported) {
2777     assert(vlen_enc != Assembler::AVX_512bit, "required");
2778     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2779            "XMM register should be 0-15");
2780   }
2781 #endif // ASSERT
2782   switch (elem_bt) {
2783     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2784     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2785     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2786     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2787     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2788     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2789     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2790   }
2791 }
2792 
2793 #ifdef _LP64
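// Broadcast a GPR value to every lane of dst. The EVEX GPR-source broadcasts
// are used when AVX-512 (with BW/VL where needed) is available; otherwise the
// value goes through an XMM move followed by the AVX2 broadcast forms.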
2794 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2795   assert(UseAVX >= 2, "required");
2796   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2797   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2798   if ((UseAVX > 2) &&
2799       (!is_bw || VM_Version::supports_avx512bw()) &&
2800       (!is_vl || VM_Version::supports_avx512vl())) {
2801     switch (elem_bt) {
2802       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2803       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2804       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2805       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2806       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2807     }
2808   } else {
2809     assert(vlen_enc != Assembler::AVX_512bit, "required");
2810     assert((dst->encoding() < 16),"XMM register should be 0-15");
2811     switch (elem_bt) {
2812       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2813       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2814       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2815       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2816       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2817       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2818       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2819     }
2820   }
2821 }
2822 #endif
2823 
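// Sign-extending conversion from byte vector elements to the given wider
// element type; float/double destinations are first widened to int and then
// converted with vcvtdq2ps/vcvtdq2pd.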
2824 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2825   switch (to_elem_bt) {
2826     case T_SHORT:
2827       vpmovsxbw(dst, src, vlen_enc);
2828       break;
2829     case T_INT:
2830       vpmovsxbd(dst, src, vlen_enc);
2831       break;
2832     case T_FLOAT:
2833       vpmovsxbd(dst, src, vlen_enc);
2834       vcvtdq2ps(dst, dst, vlen_enc);
2835       break;
2836     case T_LONG:
2837       vpmovsxbq(dst, src, vlen_enc);
2838       break;
2839     case T_DOUBLE: {
2840       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2841       vpmovsxbd(dst, src, mid_vlen_enc);
2842       vcvtdq2pd(dst, dst, vlen_enc);
2843       break;
2844     }
2845     default:
2846       fatal("Unsupported type %s", type2name(to_elem_bt));
2847       break;
2848   }
2849 }
2850 
2851 //-------------------------------------------------------------------------------------------
2852 
2853 // IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
2855 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2856                                          Register cnt1, Register cnt2,
2857                                          int int_cnt2,  Register result,
2858                                          XMMRegister vec, Register tmp,
2859                                          int ae) {
2860   ShortBranchVerifier sbv(this);
2861   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2862   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2863 
2864   // This method uses the pcmpestri instruction with bound registers
2865   //   inputs:
2866   //     xmm - substring
2867   //     rax - substring length (elements count)
2868   //     mem - scanned string
2869   //     rdx - string length (elements count)
2870   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2871   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2872   //   outputs:
2873   //     rcx - matched index in string
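  //   (imm8 bits [3:2] = 11 select the "equal ordered" aggregation, i.e.
  //   substring search; bits [1:0] select unsigned bytes vs. unsigned words.)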
2874   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2875   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2876   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2877   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2878   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2879 
2880   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2881         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2882         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2883 
2884   // Note, inline_string_indexOf() generates checks:
2885   // if (substr.count > string.count) return -1;
2886   // if (substr.count == 0) return 0;
2887   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2888 
2889   // Load substring.
2890   if (ae == StrIntrinsicNode::UL) {
2891     pmovzxbw(vec, Address(str2, 0));
2892   } else {
2893     movdqu(vec, Address(str2, 0));
2894   }
2895   movl(cnt2, int_cnt2);
2896   movptr(result, str1); // string addr
2897 
2898   if (int_cnt2 > stride) {
2899     jmpb(SCAN_TO_SUBSTR);
2900 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2903     bind(RELOAD_SUBSTR);
2904     if (ae == StrIntrinsicNode::UL) {
2905       pmovzxbw(vec, Address(str2, 0));
2906     } else {
2907       movdqu(vec, Address(str2, 0));
2908     }
2909     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2910 
2911     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2919     subl(cnt1, cnt2);
2920     addl(cnt1, int_cnt2);
2921     movl(cnt2, int_cnt2); // Now restore cnt2
2922 
2923     decrementl(cnt1);     // Shift to next element
2924     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2926 
2927     addptr(result, (1<<scale1));
2928 
2929   } // (int_cnt2 > 8)
2930 
2931   // Scan string for start of substr in 16-byte vectors
2932   bind(SCAN_TO_SUBSTR);
2933   pcmpestri(vec, Address(result, 0), mode);
2934   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2935   subl(cnt1, stride);
2936   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2937   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2939   addptr(result, 16);
2940   jmpb(SCAN_TO_SUBSTR);
2941 
2942   // Found a potential substr
2943   bind(FOUND_CANDIDATE);
2944   // Matched whole vector if first element matched (tmp(rcx) == 0).
2945   if (int_cnt2 == stride) {
2946     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2947   } else { // int_cnt2 > 8
2948     jccb(Assembler::overflow, FOUND_SUBSTR);
2949   }
2950   // After pcmpestri tmp(rcx) contains matched element index
2951   // Compute start addr of substr
2952   lea(result, Address(result, tmp, scale1));
2953 
2954   // Make sure string is still long enough
2955   subl(cnt1, tmp);
2956   cmpl(cnt1, cnt2);
2957   if (int_cnt2 == stride) {
2958     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2959   } else { // int_cnt2 > 8
2960     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2961   }
  // Left less than substring.
2963 
2964   bind(RET_NOT_FOUND);
2965   movl(result, -1);
2966   jmp(EXIT);
2967 
2968   if (int_cnt2 > stride) {
2969     // This code is optimized for the case when whole substring
2970     // is matched if its head is matched.
2971     bind(MATCH_SUBSTR_HEAD);
2972     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2974     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2975 
2976     Label CONT_SCAN_SUBSTR;
2977     // Compare the rest of substring (> 8 chars).
2978     bind(FOUND_SUBSTR);
2979     // First 8 chars are already matched.
2980     negptr(cnt2);
2981     addptr(cnt2, stride);
2982 
2983     bind(SCAN_SUBSTR);
2984     subl(cnt1, stride);
2985     cmpl(cnt2, -stride); // Do not read beyond substring
2986     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back up strings to avoid reading beyond substring:
2988     // cnt1 = cnt1 - cnt2 + 8
2989     addl(cnt1, cnt2); // cnt2 is negative
2990     addl(cnt1, stride);
2991     movl(cnt2, stride); negptr(cnt2);
2992     bind(CONT_SCAN_SUBSTR);
2993     if (int_cnt2 < (int)G) {
2994       int tail_off1 = int_cnt2<<scale1;
2995       int tail_off2 = int_cnt2<<scale2;
2996       if (ae == StrIntrinsicNode::UL) {
2997         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2998       } else {
2999         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3000       }
3001       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3002     } else {
3003       // calculate index in register to avoid integer overflow (int_cnt2*2)
3004       movl(tmp, int_cnt2);
3005       addptr(tmp, cnt2);
3006       if (ae == StrIntrinsicNode::UL) {
3007         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3008       } else {
3009         movdqu(vec, Address(str2, tmp, scale2, 0));
3010       }
3011       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3012     }
    // Need to reload string pointers if we did not match the whole vector
3014     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3015     addptr(cnt2, stride);
3016     jcc(Assembler::negative, SCAN_SUBSTR);
3017     // Fall through if found full substring
3018 
3019   } // (int_cnt2 > 8)
3020 
3021   bind(RET_FOUND);
3022   // Found result if we matched full small substring.
3023   // Compute substr offset
3024   subptr(result, str1);
3025   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3026     shrl(result, 1); // index
3027   }
3028   bind(EXIT);
3029 
3030 } // string_indexofC8
3031 
// Small strings are loaded through the stack if they cross a page boundary.
3033 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3034                                        Register cnt1, Register cnt2,
3035                                        int int_cnt2,  Register result,
3036                                        XMMRegister vec, Register tmp,
3037                                        int ae) {
3038   ShortBranchVerifier sbv(this);
3039   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3040   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3041 
3042   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
3046   //
3047   // Note, inline_string_indexOf() generates checks:
3048   // if (substr.count > string.count) return -1;
3049   // if (substr.count == 0) return 0;
3050   //
3051   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3052   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3053   // This method uses the pcmpestri instruction with bound registers
3054   //   inputs:
3055   //     xmm - substring
3056   //     rax - substring length (elements count)
3057   //     mem - scanned string
3058   //     rdx - string length (elements count)
3059   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3060   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3061   //   outputs:
3062   //     rcx - matched index in string
3063   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3064   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3065   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3066   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3067 
3068   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3069         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3070         FOUND_CANDIDATE;
3071 
3072   { //========================================================
3073     // We don't know where these strings are located
3074     // and we can't read beyond them. Load them through stack.
3075     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3076 
3077     movptr(tmp, rsp); // save old SP
3078 
3079     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3080       if (int_cnt2 == (1>>scale2)) { // One byte
3081         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3082         load_unsigned_byte(result, Address(str2, 0));
3083         movdl(vec, result); // move 32 bits
3084       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3085         // Not enough header space in 32-bit VM: 12+3 = 15.
3086         movl(result, Address(str2, -1));
3087         shrl(result, 8);
3088         movdl(vec, result); // move 32 bits
3089       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3090         load_unsigned_short(result, Address(str2, 0));
3091         movdl(vec, result); // move 32 bits
3092       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3093         movdl(vec, Address(str2, 0)); // move 32 bits
3094       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3095         movq(vec, Address(str2, 0));  // move 64 bits
3096       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3097         // Array header size is 12 bytes in 32-bit VM
3098         // + 6 bytes for 3 chars == 18 bytes,
3099         // enough space to load vec and shift.
3100         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3101         if (ae == StrIntrinsicNode::UL) {
3102           int tail_off = int_cnt2-8;
3103           pmovzxbw(vec, Address(str2, tail_off));
3104           psrldq(vec, -2*tail_off);
3105         }
3106         else {
3107           int tail_off = int_cnt2*(1<<scale2);
3108           movdqu(vec, Address(str2, tail_off-16));
3109           psrldq(vec, 16-tail_off);
3110         }
3111       }
3112     } else { // not constant substring
3113       cmpl(cnt2, stride);
3114       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3115 
      // We can read beyond the string if str+16 does not cross a page boundary
3117       // since heaps are aligned and mapped by pages.
3118       assert(os::vm_page_size() < (int)G, "default page should be small");
3119       movl(result, str2); // We need only low 32 bits
3120       andl(result, ((int)os::vm_page_size()-1));
3121       cmpl(result, ((int)os::vm_page_size()-16));
3122       jccb(Assembler::belowEqual, CHECK_STR);
3123 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3125       subptr(rsp, 16);
3126       int stk_offset = wordSize-(1<<scale2);
3127       push(cnt2);
3128 
3129       bind(COPY_SUBSTR);
3130       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3131         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3132         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3133       } else if (ae == StrIntrinsicNode::UU) {
3134         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3135         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3136       }
3137       decrement(cnt2);
3138       jccb(Assembler::notZero, COPY_SUBSTR);
3139 
3140       pop(cnt2);
3141       movptr(str2, rsp);  // New substring address
3142     } // non constant
3143 
3144     bind(CHECK_STR);
3145     cmpl(cnt1, stride);
3146     jccb(Assembler::aboveEqual, BIG_STRINGS);
3147 
3148     // Check cross page boundary.
3149     movl(result, str1); // We need only low 32 bits
3150     andl(result, ((int)os::vm_page_size()-1));
3151     cmpl(result, ((int)os::vm_page_size()-16));
3152     jccb(Assembler::belowEqual, BIG_STRINGS);
3153 
3154     subptr(rsp, 16);
3155     int stk_offset = -(1<<scale1);
3156     if (int_cnt2 < 0) { // not constant
3157       push(cnt2);
3158       stk_offset += wordSize;
3159     }
3160     movl(cnt2, cnt1);
3161 
3162     bind(COPY_STR);
3163     if (ae == StrIntrinsicNode::LL) {
3164       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3165       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3166     } else {
3167       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3168       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3169     }
3170     decrement(cnt2);
3171     jccb(Assembler::notZero, COPY_STR);
3172 
3173     if (int_cnt2 < 0) { // not constant
3174       pop(cnt2);
3175     }
3176     movptr(str1, rsp);  // New string address
3177 
3178     bind(BIG_STRINGS);
3179     // Load substring.
3180     if (int_cnt2 < 0) { // -1
3181       if (ae == StrIntrinsicNode::UL) {
3182         pmovzxbw(vec, Address(str2, 0));
3183       } else {
3184         movdqu(vec, Address(str2, 0));
3185       }
3186       push(cnt2);       // substr count
3187       push(str2);       // substr addr
3188       push(str1);       // string addr
3189     } else {
3190       // Small (< 8 chars) constant substrings are loaded already.
3191       movl(cnt2, int_cnt2);
3192     }
3193     push(tmp);  // original SP
3194 
3195   } // Finished loading
3196 
3197   //========================================================
3198   // Start search
3199   //
3200 
3201   movptr(result, str1); // string addr
3202 
  if (int_cnt2 < 0) {  // Only for non-constant substring
3204     jmpb(SCAN_TO_SUBSTR);
3205 
3206     // SP saved at sp+0
3207     // String saved at sp+1*wordSize
3208     // Substr saved at sp+2*wordSize
3209     // Substr count saved at sp+3*wordSize
3210 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3213     bind(RELOAD_SUBSTR);
3214     movptr(str2, Address(rsp, 2*wordSize));
3215     movl(cnt2, Address(rsp, 3*wordSize));
3216     if (ae == StrIntrinsicNode::UL) {
3217       pmovzxbw(vec, Address(str2, 0));
3218     } else {
3219       movdqu(vec, Address(str2, 0));
3220     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3224     subptr(str1, result); // Restore counter
3225     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3226       shrl(str1, 1);
3227     }
3228     addl(cnt1, str1);
3229     decrementl(cnt1);   // Shift to next element
3230     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3232 
3233     addptr(result, (1<<scale1));
3234   } // non constant
3235 
3236   // Scan string for start of substr in 16-byte vectors
3237   bind(SCAN_TO_SUBSTR);
3238   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3239   pcmpestri(vec, Address(result, 0), mode);
3240   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3241   subl(cnt1, stride);
3242   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3243   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3245   addptr(result, 16);
3246 
3247   bind(ADJUST_STR);
3248   cmpl(cnt1, stride); // Do not read beyond string
3249   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back up the string to avoid reading beyond the string.
3251   lea(result, Address(result, cnt1, scale1, -16));
3252   movl(cnt1, stride);
3253   jmpb(SCAN_TO_SUBSTR);
3254 
3255   // Found a potential substr
3256   bind(FOUND_CANDIDATE);
3257   // After pcmpestri tmp(rcx) contains matched element index
3258 
3259   // Make sure string is still long enough
3260   subl(cnt1, tmp);
3261   cmpl(cnt1, cnt2);
3262   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3264 
3265   bind(RET_NOT_FOUND);
3266   movl(result, -1);
3267   jmp(CLEANUP);
3268 
3269   bind(FOUND_SUBSTR);
3270   // Compute start addr of substr
3271   lea(result, Address(result, tmp, scale1));
3272   if (int_cnt2 > 0) { // Constant substring
3273     // Repeat search for small substring (< 8 chars)
3274     // from new point without reloading substring.
3275     // Have to check that we don't read beyond string.
3276     cmpl(tmp, stride-int_cnt2);
3277     jccb(Assembler::greater, ADJUST_STR);
3278     // Fall through if matched whole substring.
3279   } else { // non constant
3280     assert(int_cnt2 == -1, "should be != 0");
3281 
3282     addl(tmp, cnt2);
3283     // Found result if we matched whole substring.
3284     cmpl(tmp, stride);
3285     jcc(Assembler::lessEqual, RET_FOUND);
3286 
3287     // Repeat search for small substring (<= 8 chars)
3288     // from new point 'str1' without reloading substring.
3289     cmpl(cnt2, stride);
3290     // Have to check that we don't read beyond string.
3291     jccb(Assembler::lessEqual, ADJUST_STR);
3292 
3293     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3294     // Compare the rest of substring (> 8 chars).
3295     movptr(str1, result);
3296 
3297     cmpl(tmp, cnt2);
3298     // First 8 chars are already matched.
3299     jccb(Assembler::equal, CHECK_NEXT);
3300 
3301     bind(SCAN_SUBSTR);
3302     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
3304     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3305 
3306     bind(CHECK_NEXT);
3307     subl(cnt2, stride);
3308     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3309     addptr(str1, 16);
3310     if (ae == StrIntrinsicNode::UL) {
3311       addptr(str2, 8);
3312     } else {
3313       addptr(str2, 16);
3314     }
3315     subl(cnt1, stride);
3316     cmpl(cnt2, stride); // Do not read beyond substring
3317     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back up strings to avoid reading beyond substring.
3319 
3320     if (ae == StrIntrinsicNode::UL) {
3321       lea(str2, Address(str2, cnt2, scale2, -8));
3322       lea(str1, Address(str1, cnt2, scale1, -16));
3323     } else {
3324       lea(str2, Address(str2, cnt2, scale2, -16));
3325       lea(str1, Address(str1, cnt2, scale1, -16));
3326     }
3327     subl(cnt1, cnt2);
3328     movl(cnt2, stride);
3329     addl(cnt1, stride);
3330     bind(CONT_SCAN_SUBSTR);
3331     if (ae == StrIntrinsicNode::UL) {
3332       pmovzxbw(vec, Address(str2, 0));
3333     } else {
3334       movdqu(vec, Address(str2, 0));
3335     }
3336     jmp(SCAN_SUBSTR);
3337 
3338     bind(RET_FOUND_LONG);
3339     movptr(str1, Address(rsp, wordSize));
3340   } // non constant
3341 
3342   bind(RET_FOUND);
3343   // Compute substr offset
3344   subptr(result, str1);
3345   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3346     shrl(result, 1); // index
3347   }
3348   bind(CLEANUP);
3349   pop(rsp); // restore SP
3350 
3351 } // string_indexof
3352 
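// Find the index of the first UTF-16 char in str1[0..cnt1) equal to ch.
// Roughly equivalent to:
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;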
3353 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3354                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3355   ShortBranchVerifier sbv(this);
3356   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3357 
3358   int stride = 8;
3359 
3360   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3361         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3362         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3363         FOUND_SEQ_CHAR, DONE_LABEL;
3364 
3365   movptr(result, str1);
3366   if (UseAVX >= 2) {
3367     cmpl(cnt1, stride);
3368     jcc(Assembler::less, SCAN_TO_CHAR);
3369     cmpl(cnt1, 2*stride);
3370     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3371     movdl(vec1, ch);
3372     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3373     vpxor(vec2, vec2);
3374     movl(tmp, cnt1);
3375     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3376     andl(cnt1,0x0000000F);  //tail count (in chars)
3377 
3378     bind(SCAN_TO_16_CHAR_LOOP);
3379     vmovdqu(vec3, Address(result, 0));
3380     vpcmpeqw(vec3, vec3, vec1, 1);
3381     vptest(vec2, vec3);
3382     jcc(Assembler::carryClear, FOUND_CHAR);
3383     addptr(result, 32);
3384     subl(tmp, 2*stride);
3385     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3386     jmp(SCAN_TO_8_CHAR);
3387     bind(SCAN_TO_8_CHAR_INIT);
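    // Broadcast ch to all eight 16-bit lanes of vec1: pshuflw/pshufd with an
    // immediate of 0 replicate the low word; vec2 is kept zero for the ptest
    // below.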
3388     movdl(vec1, ch);
3389     pshuflw(vec1, vec1, 0x00);
3390     pshufd(vec1, vec1, 0);
3391     pxor(vec2, vec2);
3392   }
3393   bind(SCAN_TO_8_CHAR);
3394   cmpl(cnt1, stride);
3395   jcc(Assembler::less, SCAN_TO_CHAR);
3396   if (UseAVX < 2) {
3397     movdl(vec1, ch);
3398     pshuflw(vec1, vec1, 0x00);
3399     pshufd(vec1, vec1, 0);
3400     pxor(vec2, vec2);
3401   }
3402   movl(tmp, cnt1);
3403   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3404   andl(cnt1,0x00000007);  //tail count (in chars)
3405 
3406   bind(SCAN_TO_8_CHAR_LOOP);
3407   movdqu(vec3, Address(result, 0));
3408   pcmpeqw(vec3, vec1);
3409   ptest(vec2, vec3);
3410   jcc(Assembler::carryClear, FOUND_CHAR);
3411   addptr(result, 16);
3412   subl(tmp, stride);
3413   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3414   bind(SCAN_TO_CHAR);
3415   testl(cnt1, cnt1);
3416   jcc(Assembler::zero, RET_NOT_FOUND);
3417   bind(SCAN_TO_CHAR_LOOP);
3418   load_unsigned_short(tmp, Address(result, 0));
3419   cmpl(ch, tmp);
3420   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3421   addptr(result, 2);
3422   subl(cnt1, 1);
3423   jccb(Assembler::zero, RET_NOT_FOUND);
3424   jmp(SCAN_TO_CHAR_LOOP);
3425 
3426   bind(RET_NOT_FOUND);
3427   movl(result, -1);
3428   jmpb(DONE_LABEL);
3429 
3430   bind(FOUND_CHAR);
3431   if (UseAVX >= 2) {
3432     vpmovmskb(tmp, vec3);
3433   } else {
3434     pmovmskb(tmp, vec3);
3435   }
3436   bsfl(ch, tmp);
3437   addptr(result, ch);
3438 
3439   bind(FOUND_SEQ_CHAR);
3440   subptr(result, str1);
3441   shrl(result, 1);
3442 
3443   bind(DONE_LABEL);
3444 } // string_indexof_char
3445 
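// Latin-1 (byte) variant of string_indexof_char: returns the index of the
// first byte in str1[0..cnt1) equal to ch, or -1 if there is none.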
3446 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3447                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3448   ShortBranchVerifier sbv(this);
3449   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3450 
3451   int stride = 16;
3452 
3453   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3454         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3455         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3456         FOUND_SEQ_CHAR, DONE_LABEL;
3457 
3458   movptr(result, str1);
3459   if (UseAVX >= 2) {
3460     cmpl(cnt1, stride);
3461     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3462     cmpl(cnt1, stride*2);
3463     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3464     movdl(vec1, ch);
3465     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3466     vpxor(vec2, vec2);
3467     movl(tmp, cnt1);
3468     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3469     andl(cnt1,0x0000001F);  //tail count (in chars)
3470 
3471     bind(SCAN_TO_32_CHAR_LOOP);
3472     vmovdqu(vec3, Address(result, 0));
3473     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3474     vptest(vec2, vec3);
3475     jcc(Assembler::carryClear, FOUND_CHAR);
3476     addptr(result, 32);
3477     subl(tmp, stride*2);
3478     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3479     jmp(SCAN_TO_16_CHAR);
3480 
3481     bind(SCAN_TO_16_CHAR_INIT);
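    // Broadcast the low byte of ch to all 16 byte lanes of vec1: pshufb with
    // the all-zero shuffle mask in vec2 selects byte 0 for every lane.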
3482     movdl(vec1, ch);
3483     pxor(vec2, vec2);
3484     pshufb(vec1, vec2);
3485   }
3486 
3487   bind(SCAN_TO_16_CHAR);
3488   cmpl(cnt1, stride);
3489   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3490   if (UseAVX < 2) {
3491     movdl(vec1, ch);
3492     pxor(vec2, vec2);
3493     pshufb(vec1, vec2);
3494   }
3495   movl(tmp, cnt1);
3496   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3497   andl(cnt1,0x0000000F);  //tail count (in bytes)
3498 
3499   bind(SCAN_TO_16_CHAR_LOOP);
3500   movdqu(vec3, Address(result, 0));
3501   pcmpeqb(vec3, vec1);
3502   ptest(vec2, vec3);
3503   jcc(Assembler::carryClear, FOUND_CHAR);
3504   addptr(result, 16);
3505   subl(tmp, stride);
3506   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3507 
3508   bind(SCAN_TO_CHAR_INIT);
3509   testl(cnt1, cnt1);
3510   jcc(Assembler::zero, RET_NOT_FOUND);
3511   bind(SCAN_TO_CHAR_LOOP);
3512   load_unsigned_byte(tmp, Address(result, 0));
3513   cmpl(ch, tmp);
3514   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3515   addptr(result, 1);
3516   subl(cnt1, 1);
3517   jccb(Assembler::zero, RET_NOT_FOUND);
3518   jmp(SCAN_TO_CHAR_LOOP);
3519 
3520   bind(RET_NOT_FOUND);
3521   movl(result, -1);
3522   jmpb(DONE_LABEL);
3523 
3524   bind(FOUND_CHAR);
3525   if (UseAVX >= 2) {
3526     vpmovmskb(tmp, vec3);
3527   } else {
3528     pmovmskb(tmp, vec3);
3529   }
3530   bsfl(ch, tmp);
3531   addptr(result, ch);
3532 
3533   bind(FOUND_SEQ_CHAR);
3534   subptr(result, str1);
3535 
3536   bind(DONE_LABEL);
3537 } // stringL_indexof_char
3538 
3539 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3540   switch (eltype) {
3541   case T_BOOLEAN: return sizeof(jboolean);
3542   case T_BYTE:  return sizeof(jbyte);
3543   case T_SHORT: return sizeof(jshort);
3544   case T_CHAR:  return sizeof(jchar);
3545   case T_INT:   return sizeof(jint);
3546   default:
3547     ShouldNotReachHere();
3548     return -1;
3549   }
3550 }
3551 
3552 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3553   switch (eltype) {
3554   // T_BOOLEAN used as surrogate for unsigned byte
3555   case T_BOOLEAN: movzbl(dst, src);   break;
3556   case T_BYTE:    movsbl(dst, src);   break;
3557   case T_SHORT:   movswl(dst, src);   break;
3558   case T_CHAR:    movzwl(dst, src);   break;
3559   case T_INT:     movl(dst, src);     break;
3560   default:
3561     ShouldNotReachHere();
3562   }
3563 }
3564 
3565 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3566   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3567 }
3568 
3569 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3570   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3571 }
3572 
3573 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3574   const int vlen = Assembler::AVX_256bit;
3575   switch (eltype) {
3576   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3577   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3578   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3579   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3580   case T_INT:
3581     // do nothing
3582     break;
3583   default:
3584     ShouldNotReachHere();
3585   }
3586 }
3587 
3588 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3589                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3590                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3591                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3592                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3593                                         BasicType eltype) {
3594   ShortBranchVerifier sbv(this);
3595   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3596   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3597   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3598 
3599   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3600         SHORT_UNROLLED_LOOP_EXIT,
3601         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3602         UNROLLED_VECTOR_LOOP_BEGIN,
3603         END;
3604   switch (eltype) {
3605   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3606   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3607   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3608   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3609   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3610   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3611   }
3612 
  // "Renaming" for readability of the code
3614   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3615                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3616                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3617 
3618   const int elsize = arrays_hashcode_elsize(eltype);
3619 
3620   /*
3621     if (cnt1 >= 2) {
3622       if (cnt1 >= 32) {
3623         UNROLLED VECTOR LOOP
3624       }
3625       UNROLLED SCALAR LOOP
3626     }
3627     SINGLE SCALAR
3628    */
3629 
3630   cmpl(cnt1, 32);
3631   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3632 
3633   // cnt1 >= 32 && generate_vectorized_loop
3634   xorl(index, index);
3635 
3636   // vresult = IntVector.zero(I256);
3637   for (int idx = 0; idx < 4; idx++) {
3638     vpxor(vresult[idx], vresult[idx]);
3639   }
3640   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3641   Register bound = tmp2;
3642   Register next = tmp3;
3643   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3644   movl(next, Address(tmp2, 0));
3645   movdl(vnext, next);
3646   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3647 
3648   // index = 0;
3649   // bound = cnt1 & ~(32 - 1);
3650   movl(bound, cnt1);
3651   andl(bound, ~(32 - 1));
3652   // for (; index < bound; index += 32) {
3653   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3654   // result *= next;
3655   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching.
3658   for (int idx = 0; idx < 4; idx++) {
3659     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3660   }
3661   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3662   for (int idx = 0; idx < 4; idx++) {
3663     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3664     arrays_hashcode_elvcast(vtmp[idx], eltype);
3665     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3666   }
3667   // index += 32;
3668   addl(index, 32);
3669   // index < bound;
3670   cmpl(index, bound);
3671   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3672   // }
3673 
3674   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3675   subl(cnt1, bound);
3676   // release bound
3677 
3678   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3679   for (int idx = 0; idx < 4; idx++) {
3680     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3681     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3682     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3683   }
3684   // result += vresult.reduceLanes(ADD);
3685   for (int idx = 0; idx < 4; idx++) {
3686     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3687   }
3688 
3689   // } else if (cnt1 < 32) {
3690 
3691   bind(SHORT_UNROLLED_BEGIN);
3692   // int i = 1;
3693   movl(index, 1);
3694   cmpl(index, cnt1);
3695   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3696 
3697   // for (; i < cnt1 ; i += 2) {
3698   bind(SHORT_UNROLLED_LOOP_BEGIN);
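  // Two elements per iteration (Horner step):
  //   result = result*31*31 + ary1[i-1]*31 + ary1[i]
  // with 961 == 31*31 and (x << 5) - x == x*31.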
3699   movl(tmp3, 961);
3700   imull(result, tmp3);
3701   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3702   movl(tmp3, tmp2);
3703   shll(tmp3, 5);
3704   subl(tmp3, tmp2);
3705   addl(result, tmp3);
3706   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3707   addl(result, tmp3);
3708   addl(index, 2);
3709   cmpl(index, cnt1);
3710   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3711 
3712   // }
3713   // if (i >= cnt1) {
3714   bind(SHORT_UNROLLED_LOOP_EXIT);
3715   jccb(Assembler::greater, END);
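  // One element remains (i == cnt1): result = result*31 + ary1[i-1],
  // again using (x << 5) - x == x*31.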
3716   movl(tmp2, result);
3717   shll(result, 5);
3718   subl(result, tmp2);
3719   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3720   addl(result, tmp3);
3721   // }
3722   bind(END);
3723 
3724   BLOCK_COMMENT("} // arrays_hashcode");
3725 
3726 } // arrays_hashcode
3727 
// Helper for string_compare: load the next element of each string
// (zero-extended), using per-string scales to handle mixed encodings.
3729 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3730                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3731                                            Address::ScaleFactor scale2, Register index, int ae) {
3732   if (ae == StrIntrinsicNode::LL) {
3733     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3734     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3735   } else if (ae == StrIntrinsicNode::UU) {
3736     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3737     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3738   } else {
3739     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3740     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3741   }
3742 }
3743 
3744 // Compare strings, used for char[] and byte[].
3745 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3746                                        Register cnt1, Register cnt2, Register result,
3747                                        XMMRegister vec1, int ae, KRegister mask) {
3748   ShortBranchVerifier sbv(this);
3749   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3750   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3751   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3752   int stride2x2 = 0x40;
3753   Address::ScaleFactor scale = Address::no_scale;
3754   Address::ScaleFactor scale1 = Address::no_scale;
3755   Address::ScaleFactor scale2 = Address::no_scale;
3756 
3757   if (ae != StrIntrinsicNode::LL) {
3758     stride2x2 = 0x20;
3759   }
3760 
3761   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3762     shrl(cnt2, 1);
3763   }
3764   // Compute the minimum of the string lengths and the
3765   // difference of the string lengths (stack).
3766   // Do the conditional move stuff
3767   movl(result, cnt1);
3768   subl(cnt1, cnt2);
3769   push(cnt1);
3770   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3771 
3772   // Is the minimum length zero?
3773   testl(cnt2, cnt2);
3774   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3775   if (ae == StrIntrinsicNode::LL) {
3776     // Load first bytes
3777     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3778     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3779   } else if (ae == StrIntrinsicNode::UU) {
3780     // Load first characters
3781     load_unsigned_short(result, Address(str1, 0));
3782     load_unsigned_short(cnt1, Address(str2, 0));
3783   } else {
3784     load_unsigned_byte(result, Address(str1, 0));
3785     load_unsigned_short(cnt1, Address(str2, 0));
3786   }
3787   subl(result, cnt1);
3788   jcc(Assembler::notZero,  POP_LABEL);
3789 
3790   if (ae == StrIntrinsicNode::UU) {
3791     // Divide length by 2 to get number of chars
3792     shrl(cnt2, 1);
3793   }
3794   cmpl(cnt2, 1);
3795   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3796 
3797   // Check if the strings start at the same location and setup scale and stride
3798   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3799     cmpptr(str1, str2);
3800     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3801     if (ae == StrIntrinsicNode::LL) {
3802       scale = Address::times_1;
3803       stride = 16;
3804     } else {
3805       scale = Address::times_2;
3806       stride = 8;
3807     }
3808   } else {
3809     scale1 = Address::times_1;
3810     scale2 = Address::times_2;
3811     // scale not used
3812     stride = 8;
3813   }
3814 
3815   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3816     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3817     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3818     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3819     Label COMPARE_TAIL_LONG;
3820     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3821 
3822     int pcmpmask = 0x19;
3823     if (ae == StrIntrinsicNode::LL) {
3824       pcmpmask &= ~0x01;
3825     }
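    // pcmpmask 0x19: "equal each" (string compare) aggregation with negated
    // result, on unsigned words; clearing bit 0 above switches to unsigned
    // bytes for the LL case.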
3826 
    // Set up to compare 16-char (32-byte) vectors;
    // start from the first character again because it has an aligned address.
3829     if (ae == StrIntrinsicNode::LL) {
3830       stride2 = 32;
3831     } else {
3832       stride2 = 16;
3833     }
3834     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3835       adr_stride = stride << scale;
3836     } else {
3837       adr_stride1 = 8;  //stride << scale1;
3838       adr_stride2 = 16; //stride << scale2;
3839     }
3840 
3841     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3843     movl(result, cnt2);
3844     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3845     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3846 
    // Fast path: compare the first two 8-char vectors.
3848     bind(COMPARE_16_CHARS);
3849     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3850       movdqu(vec1, Address(str1, 0));
3851     } else {
3852       pmovzxbw(vec1, Address(str1, 0));
3853     }
3854     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3855     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3856 
3857     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3858       movdqu(vec1, Address(str1, adr_stride));
3859       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3860     } else {
3861       pmovzxbw(vec1, Address(str1, adr_stride1));
3862       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3863     }
3864     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3865     addl(cnt1, stride);
3866 
3867     // Compare the characters at index in cnt1
3868     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3869     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3870     subl(result, cnt2);
3871     jmp(POP_LABEL);
3872 
3873     // Setup the registers to start vector comparison loop
3874     bind(COMPARE_WIDE_VECTORS);
3875     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3876       lea(str1, Address(str1, result, scale));
3877       lea(str2, Address(str2, result, scale));
3878     } else {
3879       lea(str1, Address(str1, result, scale1));
3880       lea(str2, Address(str2, result, scale2));
3881     }
3882     subl(result, stride2);
3883     subl(cnt2, stride2);
3884     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3885     negptr(result);
3886 
    //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3888     bind(COMPARE_WIDE_VECTORS_LOOP);
3889 
3890 #ifdef _LP64
3891     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3892       cmpl(cnt2, stride2x2);
3893       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3894       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3895       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3896 
3897       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3898       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3899         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise it has some 0 bits
3901       } else {
3902         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise it has some 0 bits
3904       }
3905       kortestql(mask, mask);
3906       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3907       addptr(result, stride2x2);  // update since we already compared at this addr
3908       subl(cnt2, stride2x2);      // and sub the size too
3909       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3910 
3911       vpxor(vec1, vec1);
3912       jmpb(COMPARE_WIDE_TAIL);
3913     }//if (VM_Version::supports_avx512vlbw())
3914 #endif // _LP64
3915 
3916 
3917     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       vmovdqu(vec1, Address(str1, result, scale));
3920       vpxor(vec1, Address(str2, result, scale));
3921     } else {
3922       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3923       vpxor(vec1, Address(str2, result, scale2));
3924     }
3925     vptest(vec1, vec1);
3926     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3927     addptr(result, stride2);
3928     subl(cnt2, stride2);
3929     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3930     // clean upper bits of YMM registers
3931     vpxor(vec1, vec1);
3932 
3933     // compare wide vectors tail
3934     bind(COMPARE_WIDE_TAIL);
3935     testptr(result, result);
3936     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3937 
3938     movl(result, stride2);
3939     movl(cnt2, result);
3940     negptr(result);
3941     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3942 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3944     bind(VECTOR_NOT_EQUAL);
3945     // clean upper bits of YMM registers
3946     vpxor(vec1, vec1);
3947     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3948       lea(str1, Address(str1, result, scale));
3949       lea(str2, Address(str2, result, scale));
3950     } else {
3951       lea(str1, Address(str1, result, scale1));
3952       lea(str2, Address(str2, result, scale2));
3953     }
3954     jmp(COMPARE_16_CHARS);
3955 
    // Compare tail chars, length between 1 and 15 chars
3957     bind(COMPARE_TAIL_LONG);
3958     movl(cnt2, result);
3959     cmpl(cnt2, stride);
3960     jcc(Assembler::less, COMPARE_SMALL_STR);
3961 
3962     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3963       movdqu(vec1, Address(str1, 0));
3964     } else {
3965       pmovzxbw(vec1, Address(str1, 0));
3966     }
3967     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3968     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3969     subptr(cnt2, stride);
3970     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3971     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3972       lea(str1, Address(str1, result, scale));
3973       lea(str2, Address(str2, result, scale));
3974     } else {
3975       lea(str1, Address(str1, result, scale1));
3976       lea(str2, Address(str2, result, scale2));
3977     }
3978     negptr(cnt2);
3979     jmpb(WHILE_HEAD_LABEL);
3980 
3981     bind(COMPARE_SMALL_STR);
3982   } else if (UseSSE42Intrinsics) {
3983     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3984     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors;
    // start from the first character again because it has an aligned address.
3987     movl(result, cnt2);
3988     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3989     if (ae == StrIntrinsicNode::LL) {
3990       pcmpmask &= ~0x01;
3991     }
3992     jcc(Assembler::zero, COMPARE_TAIL);
3993     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3994       lea(str1, Address(str1, result, scale));
3995       lea(str2, Address(str2, result, scale));
3996     } else {
3997       lea(str1, Address(str1, result, scale1));
3998       lea(str2, Address(str2, result, scale2));
3999     }
4000     negptr(result);
4001 
4002     // pcmpestri
4003     //   inputs:
4004     //     vec1- substring
4005     //     rax - negative string length (elements count)
4006     //     mem - scanned string
4007     //     rdx - string length (elements count)
4008     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4009     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4010     //   outputs:
4011     //     rcx - first mismatched element index
4012     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4013 
4014     bind(COMPARE_WIDE_VECTORS);
4015     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4016       movdqu(vec1, Address(str1, result, scale));
4017       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4018     } else {
4019       pmovzxbw(vec1, Address(str1, result, scale1));
4020       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4021     }
4022     // After pcmpestri cnt1(rcx) contains mismatched element index
4023 
4024     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4025     addptr(result, stride);
4026     subptr(cnt2, stride);
4027     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4028 
4029     // compare wide vectors tail
4030     testptr(result, result);
4031     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4032 
4033     movl(cnt2, stride);
4034     movl(result, stride);
4035     negptr(result);
4036     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4037       movdqu(vec1, Address(str1, result, scale));
4038       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4039     } else {
4040       pmovzxbw(vec1, Address(str1, result, scale1));
4041       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4042     }
4043     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4044 
4045     // Mismatched characters in the vectors
4046     bind(VECTOR_NOT_EQUAL);
4047     addptr(cnt1, result);
4048     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4049     subl(result, cnt2);
4050     jmpb(POP_LABEL);
4051 
4052     bind(COMPARE_TAIL); // limit is zero
4053     movl(cnt2, result);
4054     // Fallthru to tail compare
4055   }
4056   // Shift str2 and str1 to the end of the arrays, negate min
4057   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4058     lea(str1, Address(str1, cnt2, scale));
4059     lea(str2, Address(str2, cnt2, scale));
4060   } else {
4061     lea(str1, Address(str1, cnt2, scale1));
4062     lea(str2, Address(str2, cnt2, scale2));
4063   }
4064   decrementl(cnt2);  // first character was compared already
4065   negptr(cnt2);
4066 
4067   // Compare the rest of the elements
4068   bind(WHILE_HEAD_LABEL);
4069   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4070   subl(result, cnt1);
4071   jccb(Assembler::notZero, POP_LABEL);
4072   increment(cnt2);
4073   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4074 
4075   // Strings are equal up to min length.  Return the length difference.
4076   bind(LENGTH_DIFF_LABEL);
4077   pop(result);
4078   if (ae == StrIntrinsicNode::UU) {
4079     // Divide diff by 2 to get number of chars
4080     sarl(result, 1);
4081   }
4082   jmpb(DONE_LABEL);
4083 
4084 #ifdef _LP64
4085   if (VM_Version::supports_avx512vlbw()) {
4086 
4087     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4088 
4089     kmovql(cnt1, mask);
4090     notq(cnt1);
4091     bsfq(cnt2, cnt1);
4092     if (ae != StrIntrinsicNode::LL) {
4093       // Divide diff by 2 to get number of chars
4094       sarl(cnt2, 1);
4095     }
4096     addq(result, cnt2);
4097     if (ae == StrIntrinsicNode::LL) {
4098       load_unsigned_byte(cnt1, Address(str2, result));
4099       load_unsigned_byte(result, Address(str1, result));
4100     } else if (ae == StrIntrinsicNode::UU) {
4101       load_unsigned_short(cnt1, Address(str2, result, scale));
4102       load_unsigned_short(result, Address(str1, result, scale));
4103     } else {
4104       load_unsigned_short(cnt1, Address(str2, result, scale2));
4105       load_unsigned_byte(result, Address(str1, result, scale1));
4106     }
4107     subl(result, cnt1);
4108     jmpb(POP_LABEL);
4109   }//if (VM_Version::supports_avx512vlbw())
4110 #endif // _LP64
4111 
4112   // Discard the stored length difference
4113   bind(POP_LABEL);
4114   pop(cnt1);
4115 
4116   // That's it
4117   bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
4119     negl(result);
4120   }
4121 
4122 }
4123 
4124 // Search for Non-ASCII character (Negative byte value) in a byte array,
4125 // return the index of the first such character, otherwise the length
4126 // of the array segment searched.
4127 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4128 //   @IntrinsicCandidate
4129 //   public static int countPositives(byte[] ba, int off, int len) {
4130 //     for (int i = off; i < off + len; i++) {
4131 //       if (ba[i] < 0) {
4132 //         return i - off;
4133 //       }
4134 //     }
4135 //     return len;
4136 //   }
4137 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4138   Register result, Register tmp1,
4139   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4140   // rsi: byte array
4141   // rcx: len
4142   // rax: result
4143   ShortBranchVerifier sbv(this);
4144   assert_different_registers(ary1, len, result, tmp1);
4145   assert_different_registers(vec1, vec2);
4146   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4147 
4148   movl(result, len); // copy
4149   // len == 0
4150   testl(len, len);
4151   jcc(Assembler::zero, DONE);
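       // Strategy: scan with the widest available vectors (64-byte AVX-512, 32-byte AVX2
       // or 16-byte SSE4.2 blocks) looking for a set sign bit, then finish the remainder
       // with 4-, 2- and 1-byte probes. result starts as len and is lowered to the index
       // of the first negative byte if one is found.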
4152 
4153   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4154     VM_Version::supports_avx512vlbw() &&
4155     VM_Version::supports_bmi2()) {
4156 
4157     Label test_64_loop, test_tail, BREAK_LOOP;
4158     movl(tmp1, len);
4159     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4160 
4161     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4162     andl(len,  0xffffffc0); // vector count (in chars)
4163     jccb(Assembler::zero, test_tail);
4164 
4165     lea(ary1, Address(ary1, len, Address::times_1));
4166     negptr(len);
4167 
4168     bind(test_64_loop);
4169     // Check whether our 64 elements of size byte contain negatives
4170     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4171     kortestql(mask1, mask1);
4172     jcc(Assembler::notZero, BREAK_LOOP);
4173 
4174     addptr(len, 64);
4175     jccb(Assembler::notZero, test_64_loop);
4176 
4177     bind(test_tail);
4178     // bail out when there is nothing to be done
4179     testl(tmp1, -1);
4180     jcc(Assembler::zero, DONE);
4181 
4182 
4183     // check the tail for absence of negatives
4184     // ~(~0 << len) applied up to two times (for 32-bit scenario)
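         // For example, tmp1 == 5 gives ~(~0 << 5) == 0x1F, selecting the five tail bytes.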
4185 #ifdef _LP64
4186     {
4187       Register tmp3_aliased = len;
4188       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4189       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4190       notq(tmp3_aliased);
4191       kmovql(mask2, tmp3_aliased);
4192     }
4193 #else
4194     Label k_init;
4195     jmp(k_init);
4196 
4197     // We cannot read 64 bits from a general purpose register, so we move the
4198     // data required to compose 64 ones into the instruction stream.
4199     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
4200     // compare target against the tail count contained in the tmp1 register.
4201     // The result is a k register holding tmp1 consecutive 1-bits, counting
4202     // from the least significant bit.
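         // For example, with tmp1 == 3 the code below broadcasts 3 and compares it against
         // {0, 1, 2, ..., 63}; 3 > {0, 1, 2}, so mask2 gets its low three bits set.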
4203     address tmp = pc();
4204     emit_int64(0x0706050403020100);
4205     emit_int64(0x0F0E0D0C0B0A0908);
4206     emit_int64(0x1716151413121110);
4207     emit_int64(0x1F1E1D1C1B1A1918);
4208     emit_int64(0x2726252423222120);
4209     emit_int64(0x2F2E2D2C2B2A2928);
4210     emit_int64(0x3736353433323130);
4211     emit_int64(0x3F3E3D3C3B3A3938);
4212 
4213     bind(k_init);
4214     lea(len, InternalAddress(tmp));
4215     // create mask to test for negative byte inside a vector
4216     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4217     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4218 
4219 #endif
4220     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4221     ktestq(mask1, mask2);
4222     jcc(Assembler::zero, DONE);
4223 
4224     // do a full check for negative bytes in the tail
4225     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4226                      // ary1 already pointing to the right place
4227     jmpb(TAIL_START);
4228 
4229     bind(BREAK_LOOP);
4230     // At least one byte in the last 64 byte block was negative.
4231     // Set up to look at the last 64 bytes as if they were a tail
4232     lea(ary1, Address(ary1, len, Address::times_1));
4233     addptr(result, len);
4234     // Ignore the very last byte: if all others are positive,
4235     // it must be negative, so we can skip right to the 2+1 byte
4236     // end comparison at this point
4237     orl(result, 63);
4238     movl(len, 63);
4239     // Fallthru to tail compare
4240   } else {
4241 
4242     if (UseAVX >= 2 && UseSSE >= 2) {
4243       // With AVX2, use 32-byte vector compare
4244       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4245 
4246       // Compare 32-byte vectors
4247       testl(len, 0xffffffe0);   // vector count (in bytes)
4248       jccb(Assembler::zero, TAIL_START);
4249 
4250       andl(len, 0xffffffe0);
4251       lea(ary1, Address(ary1, len, Address::times_1));
4252       negptr(len);
4253 
4254       movl(tmp1, 0x80808080);   // create mask to test for negative bytes (sign bit set) in vector
4255       movdl(vec2, tmp1);
4256       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4257 
4258       bind(COMPARE_WIDE_VECTORS);
4259       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4260       vptest(vec1, vec2);
4261       jccb(Assembler::notZero, BREAK_LOOP);
4262       addptr(len, 32);
4263       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4264 
4265       testl(result, 0x0000001f);   // any bytes remaining?
4266       jcc(Assembler::zero, DONE);
4267 
4268       // Quick test using the already prepared vector mask
4269       movl(len, result);
4270       andl(len, 0x0000001f);
4271       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4272       vptest(vec1, vec2);
4273       jcc(Assembler::zero, DONE);
4274       // There are zeros, jump to the tail to determine exactly where
4275       jmpb(TAIL_START);
4276 
4277       bind(BREAK_LOOP);
4278       // At least one byte in the last 32-byte vector is negative.
4279       // Set up to look at the last 32 bytes as if they were a tail
4280       lea(ary1, Address(ary1, len, Address::times_1));
4281       addptr(result, len);
4282       // Ignore the very last byte: if all others are positive,
4283       // it must be negative, so we can skip right to the 2+1 byte
4284       // end comparison at this point
4285       orl(result, 31);
4286       movl(len, 31);
4287       // Fallthru to tail compare
4288     } else if (UseSSE42Intrinsics) {
4289       // With SSE4.2, use double quad vector compare
4290       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4291 
4292       // Compare 16-byte vectors
4293       testl(len, 0xfffffff0);   // vector count (in bytes)
4294       jcc(Assembler::zero, TAIL_START);
4295 
4296       andl(len, 0xfffffff0);
4297       lea(ary1, Address(ary1, len, Address::times_1));
4298       negptr(len);
4299 
4300       movl(tmp1, 0x80808080);
4301       movdl(vec2, tmp1);
4302       pshufd(vec2, vec2, 0);
4303 
4304       bind(COMPARE_WIDE_VECTORS);
4305       movdqu(vec1, Address(ary1, len, Address::times_1));
4306       ptest(vec1, vec2);
4307       jccb(Assembler::notZero, BREAK_LOOP);
4308       addptr(len, 16);
4309       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4310 
4311       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4312       jcc(Assembler::zero, DONE);
4313 
4314       // Quick test using the already prepared vector mask
4315       movl(len, result);
4316       andl(len, 0x0000000f);   // tail count (in bytes)
4317       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4318       ptest(vec1, vec2);
4319       jcc(Assembler::zero, DONE);
4320       jmpb(TAIL_START);
4321 
4322       bind(BREAK_LOOP);
4323       // At least one byte in the last 16-byte vector is negative.
4324       // Set up and look at the last 16 bytes as if they were a tail
4325       lea(ary1, Address(ary1, len, Address::times_1));
4326       addptr(result, len);
4327       // Ignore the very last byte: if all others are positive,
4328       // it must be negative, so we can skip right to the 2+1 byte
4329       // end comparison at this point
4330       orl(result, 15);
4331       movl(len, 15);
4332       // Fallthru to tail compare
4333     }
4334   }
4335 
4336   bind(TAIL_START);
4337   // Compare 4-byte vectors
4338   andl(len, 0xfffffffc); // vector count (in bytes)
4339   jccb(Assembler::zero, COMPARE_CHAR);
4340 
4341   lea(ary1, Address(ary1, len, Address::times_1));
4342   negptr(len);
4343 
4344   bind(COMPARE_VECTORS);
4345   movl(tmp1, Address(ary1, len, Address::times_1));
4346   andl(tmp1, 0x80808080);
4347   jccb(Assembler::notZero, TAIL_ADJUST);
4348   addptr(len, 4);
4349   jccb(Assembler::notZero, COMPARE_VECTORS);
4350 
4351   // Compare trailing char (final 2-3 bytes), if any
4352   bind(COMPARE_CHAR);
4353 
4354   testl(result, 0x2);   // tail  char
4355   jccb(Assembler::zero, COMPARE_BYTE);
4356   load_unsigned_short(tmp1, Address(ary1, 0));
4357   andl(tmp1, 0x00008080);
4358   jccb(Assembler::notZero, CHAR_ADJUST);
4359   lea(ary1, Address(ary1, 2));
4360 
4361   bind(COMPARE_BYTE);
4362   testl(result, 0x1);   // tail  byte
4363   jccb(Assembler::zero, DONE);
4364   load_unsigned_byte(tmp1, Address(ary1, 0));
4365   testl(tmp1, 0x00000080);
4366   jccb(Assembler::zero, DONE);
4367   subptr(result, 1);
4368   jmpb(DONE);
4369 
4370   bind(TAIL_ADJUST);
4371   // there are negative bits in the last 4 byte block.
4372   // Adjust result and check the next three bytes
4373   addptr(result, len);
4374   orl(result, 3);
4375   lea(ary1, Address(ary1, len, Address::times_1));
4376   jmpb(COMPARE_CHAR);
4377 
4378   bind(CHAR_ADJUST);
4379   // We are looking at a char + optional byte tail, and found that one
4380   // of the bytes in the char is negative. Adjust the result, check the
4381   // first byte and readjust if needed.
4382   andl(result, 0xfffffffc);
4383   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4384   jccb(Assembler::notZero, DONE);
4385   addptr(result, 1);
4386 
4387   // That's it
4388   bind(DONE);
4389   if (UseAVX >= 2 && UseSSE >= 2) {
4390     // clean upper bits of YMM registers
4391     vpxor(vec1, vec1);
4392     vpxor(vec2, vec2);
4393   }
4394 }
4395 
4396 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4397 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4398                                       Register limit, Register result, Register chr,
4399                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4400                                       KRegister mask, bool expand_ary2) {
4401   // for expand_ary2, limit is the (smaller) size of the second array.
4402   ShortBranchVerifier sbv(this);
4403   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4404 
4405   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4406          "Expansion only implemented for AVX2");
4407 
4408   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4409   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4410 
4411   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4412   int scaleIncr = expand_ary2 ? 8 : 16;
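       // With expand_ary2, limit and result count ary2 bytes: ary1 is addressed with a
       // times_2 scale and each ary2 byte is zero-extended (vpmovzxbw) into a 16-bit lane
       // before the comparison.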
4413 
4414   if (is_array_equ) {
4415     // Check the input args
4416     cmpoop(ary1, ary2);
4417     jcc(Assembler::equal, TRUE_LABEL);
4418 
4419     // Need additional checks for arrays_equals.
4420     testptr(ary1, ary1);
4421     jcc(Assembler::zero, FALSE_LABEL);
4422     testptr(ary2, ary2);
4423     jcc(Assembler::zero, FALSE_LABEL);
4424 
4425     // Check the lengths
4426     movl(limit, Address(ary1, length_offset));
4427     cmpl(limit, Address(ary2, length_offset));
4428     jcc(Assembler::notEqual, FALSE_LABEL);
4429   }
4430 
4431   // count == 0
4432   testl(limit, limit);
4433   jcc(Assembler::zero, TRUE_LABEL);
4434 
4435   if (is_array_equ) {
4436     // Load array address
4437     lea(ary1, Address(ary1, base_offset));
4438     lea(ary2, Address(ary2, base_offset));
4439   }
4440 
4441   if (is_array_equ && is_char) {
4442     // arrays_equals on a char[]: convert the element count to a byte count
4443     shll(limit, 1);      // byte count != 0 (limit was already checked non-zero)
4444   }
4445   movl(result, limit); // copy
4446 
4447   if (UseAVX >= 2) {
4448     // With AVX2, use 32-byte vector compare
4449     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4450 
4451     // Compare 32-byte vectors
4452     if (expand_ary2) {
4453       andl(result, 0x0000000f);  //   tail count (in bytes)
4454       andl(limit, 0xfffffff0);   // vector count (in bytes)
4455       jcc(Assembler::zero, COMPARE_TAIL);
4456     } else {
4457       andl(result, 0x0000001f);  //   tail count (in bytes)
4458       andl(limit, 0xffffffe0);   // vector count (in bytes)
4459       jcc(Assembler::zero, COMPARE_TAIL_16);
4460     }
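         // From here limit holds the vector-sized byte count and result the remaining
         // tail byte count computed above.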
4461 
4462     lea(ary1, Address(ary1, limit, scaleFactor));
4463     lea(ary2, Address(ary2, limit, Address::times_1));
4464     negptr(limit);
4465 
4466 #ifdef _LP64
4467     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4468       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4469 
4470       cmpl(limit, -64);
4471       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4472 
4473       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4474 
4475       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4476       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4477       kortestql(mask, mask);
4478       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4479       addptr(limit, 64);  // update since we already compared at this addr
4480       cmpl(limit, -64);
4481       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4482 
4483       // At this point we may still need to compare -limit+result bytes.
4484       // We could execute the next two instructions and just continue via the non-wide path:
4485       //  cmpl(limit, 0);
4486       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4487       // But since we stopped at the points ary{1,2}+limit which are
4488       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4489       // (|limit| <= 32 and result < 32),
4490       // we may just compare the last 64 bytes.
4491       //
4492       addptr(result, -64);   // it is safe, because we just came from this area
4493       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4494       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4495       kortestql(mask, mask);
4496       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4497 
4498       jmp(TRUE_LABEL);
4499 
4500       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4501 
4502     }//if (VM_Version::supports_avx512vlbw())
4503 #endif //_LP64
4504     bind(COMPARE_WIDE_VECTORS);
4505     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4506     if (expand_ary2) {
4507       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4508     } else {
4509       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4510     }
4511     vpxor(vec1, vec2);
4512 
4513     vptest(vec1, vec1);
4514     jcc(Assembler::notZero, FALSE_LABEL);
4515     addptr(limit, scaleIncr * 2);
4516     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4517 
4518     testl(result, result);
4519     jcc(Assembler::zero, TRUE_LABEL);
4520 
4521     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4522     if (expand_ary2) {
4523       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4524     } else {
4525       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4526     }
4527     vpxor(vec1, vec2);
4528 
4529     vptest(vec1, vec1);
4530     jcc(Assembler::notZero, FALSE_LABEL);
4531     jmp(TRUE_LABEL);
4532 
4533     bind(COMPARE_TAIL_16); // limit is zero
4534     movl(limit, result);
4535 
4536     // Compare 16-byte chunks
4537     andl(result, 0x0000000f);  //   tail count (in bytes)
4538     andl(limit, 0xfffffff0);   // vector count (in bytes)
4539     jcc(Assembler::zero, COMPARE_TAIL);
4540 
4541     lea(ary1, Address(ary1, limit, scaleFactor));
4542     lea(ary2, Address(ary2, limit, Address::times_1));
4543     negptr(limit);
4544 
4545     bind(COMPARE_WIDE_VECTORS_16);
4546     movdqu(vec1, Address(ary1, limit, scaleFactor));
4547     if (expand_ary2) {
4548       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4549     } else {
4550       movdqu(vec2, Address(ary2, limit, Address::times_1));
4551     }
4552     pxor(vec1, vec2);
4553 
4554     ptest(vec1, vec1);
4555     jcc(Assembler::notZero, FALSE_LABEL);
4556     addptr(limit, scaleIncr);
4557     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4558 
4559     bind(COMPARE_TAIL); // limit is zero
4560     movl(limit, result);
4561     // Fallthru to tail compare
4562   } else if (UseSSE42Intrinsics) {
4563     // With SSE4.2, use double quad vector compare
4564     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4565 
4566     // Compare 16-byte vectors
4567     andl(result, 0x0000000f);  //   tail count (in bytes)
4568     andl(limit, 0xfffffff0);   // vector count (in bytes)
4569     jcc(Assembler::zero, COMPARE_TAIL);
4570 
4571     lea(ary1, Address(ary1, limit, Address::times_1));
4572     lea(ary2, Address(ary2, limit, Address::times_1));
4573     negptr(limit);
4574 
4575     bind(COMPARE_WIDE_VECTORS);
4576     movdqu(vec1, Address(ary1, limit, Address::times_1));
4577     movdqu(vec2, Address(ary2, limit, Address::times_1));
4578     pxor(vec1, vec2);
4579 
4580     ptest(vec1, vec1);
4581     jcc(Assembler::notZero, FALSE_LABEL);
4582     addptr(limit, 16);
4583     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4584 
4585     testl(result, result);
4586     jcc(Assembler::zero, TRUE_LABEL);
4587 
4588     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4589     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4590     pxor(vec1, vec2);
4591 
4592     ptest(vec1, vec1);
4593     jccb(Assembler::notZero, FALSE_LABEL);
4594     jmpb(TRUE_LABEL);
4595 
4596     bind(COMPARE_TAIL); // limit is zero
4597     movl(limit, result);
4598     // Fallthru to tail compare
4599   }
4600 
4601   // Compare 4-byte vectors
4602   if (expand_ary2) {
4603     testl(result, result);
4604     jccb(Assembler::zero, TRUE_LABEL);
4605   } else {
4606     andl(limit, 0xfffffffc); // vector count (in bytes)
4607     jccb(Assembler::zero, COMPARE_CHAR);
4608   }
4609 
4610   lea(ary1, Address(ary1, limit, scaleFactor));
4611   lea(ary2, Address(ary2, limit, Address::times_1));
4612   negptr(limit);
4613 
4614   bind(COMPARE_VECTORS);
4615   if (expand_ary2) {
4616     // There are no "vector" operations for bytes to shorts
4617     movzbl(chr, Address(ary2, limit, Address::times_1));
4618     cmpw(Address(ary1, limit, Address::times_2), chr);
4619     jccb(Assembler::notEqual, FALSE_LABEL);
4620     addptr(limit, 1);
4621     jcc(Assembler::notZero, COMPARE_VECTORS);
4622     jmp(TRUE_LABEL);
4623   } else {
4624     movl(chr, Address(ary1, limit, Address::times_1));
4625     cmpl(chr, Address(ary2, limit, Address::times_1));
4626     jccb(Assembler::notEqual, FALSE_LABEL);
4627     addptr(limit, 4);
4628     jcc(Assembler::notZero, COMPARE_VECTORS);
4629   }
4630 
4631   // Compare trailing char (final 2 bytes), if any
4632   bind(COMPARE_CHAR);
4633   testl(result, 0x2);   // tail  char
4634   jccb(Assembler::zero, COMPARE_BYTE);
4635   load_unsigned_short(chr, Address(ary1, 0));
4636   load_unsigned_short(limit, Address(ary2, 0));
4637   cmpl(chr, limit);
4638   jccb(Assembler::notEqual, FALSE_LABEL);
4639 
4640   if (is_array_equ && is_char) {
4641     bind(COMPARE_BYTE);
4642   } else {
4643     lea(ary1, Address(ary1, 2));
4644     lea(ary2, Address(ary2, 2));
4645 
4646     bind(COMPARE_BYTE);
4647     testl(result, 0x1);   // tail  byte
4648     jccb(Assembler::zero, TRUE_LABEL);
4649     load_unsigned_byte(chr, Address(ary1, 0));
4650     load_unsigned_byte(limit, Address(ary2, 0));
4651     cmpl(chr, limit);
4652     jccb(Assembler::notEqual, FALSE_LABEL);
4653   }
4654   bind(TRUE_LABEL);
4655   movl(result, 1);   // return true
4656   jmpb(DONE);
4657 
4658   bind(FALSE_LABEL);
4659   xorl(result, result); // return false
4660 
4661   // That's it
4662   bind(DONE);
4663   if (UseAVX >= 2) {
4664     // clean upper bits of YMM registers
4665     vpxor(vec1, vec1);
4666     vpxor(vec2, vec2);
4667   }
4668 }
4669 
4670 #ifdef _LP64
4671 
4672 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4673 #define __ masm.
4674   Register dst = stub.data<0>();
4675   XMMRegister src = stub.data<1>();
4676   address target = stub.data<2>();
4677   __ bind(stub.entry());
4678   __ subptr(rsp, 8);
4679   __ movdbl(Address(rsp), src);
4680   __ call(RuntimeAddress(target));
4681   __ pop(dst);
4682   __ jmp(stub.continuation());
4683 #undef __
4684 }
4685 
4686 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4687   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4688   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4689 
4690   address slowpath_target;
4691   if (dst_bt == T_INT) {
4692     if (src_bt == T_FLOAT) {
4693       cvttss2sil(dst, src);
4694       cmpl(dst, 0x80000000);
4695       slowpath_target = StubRoutines::x86::f2i_fixup();
4696     } else {
4697       cvttsd2sil(dst, src);
4698       cmpl(dst, 0x80000000);
4699       slowpath_target = StubRoutines::x86::d2i_fixup();
4700     }
4701   } else {
4702     if (src_bt == T_FLOAT) {
4703       cvttss2siq(dst, src);
4704       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4705       slowpath_target = StubRoutines::x86::f2l_fixup();
4706     } else {
4707       cvttsd2siq(dst, src);
4708       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4709       slowpath_target = StubRoutines::x86::d2l_fixup();
4710     }
4711   }
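       // cvttss2si/cvttsd2si write the 'integer indefinite' value (0x80000000 for int,
       // 0x8000000000000000 for long) when the source is NaN or out of range, so an equal
       // compare above means the conversion needs the slow-path fixup.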
4712 
4713   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4714   jcc(Assembler::equal, stub->entry());
4715   bind(stub->continuation());
4716 }
4717 
4718 #endif // _LP64
4719 
4720 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4721                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4722   switch(ideal_opc) {
4723     case Op_LShiftVS:
4724       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4725     case Op_LShiftVI:
4726       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4727     case Op_LShiftVL:
4728       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4729     case Op_RShiftVS:
4730       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4731     case Op_RShiftVI:
4732       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4733     case Op_RShiftVL:
4734       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4735     case Op_URShiftVS:
4736       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4737     case Op_URShiftVI:
4738       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4739     case Op_URShiftVL:
4740       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4741     case Op_RotateRightV:
4742       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4743     case Op_RotateLeftV:
4744       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4745     default:
4746       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4747       break;
4748   }
4749 }
4750 
4751 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4752                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4753   if (is_unsigned) {
4754     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4755   } else {
4756     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4757   }
4758 }
4759 
4760 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4761                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4762   switch (elem_bt) {
4763     case T_BYTE:
4764       if (ideal_opc == Op_SaturatingAddV) {
4765         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4766       } else {
4767         assert(ideal_opc == Op_SaturatingSubV, "");
4768         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4769       }
4770       break;
4771     case T_SHORT:
4772       if (ideal_opc == Op_SaturatingAddV) {
4773         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4774       } else {
4775         assert(ideal_opc == Op_SaturatingSubV, "");
4776         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4777       }
4778       break;
4779     default:
4780       fatal("Unsupported type %s", type2name(elem_bt));
4781       break;
4782   }
4783 }
4784 
4785 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4786                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4787   switch (elem_bt) {
4788     case T_BYTE:
4789       if (ideal_opc == Op_SaturatingAddV) {
4790         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4791       } else {
4792         assert(ideal_opc == Op_SaturatingSubV, "");
4793         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4794       }
4795       break;
4796     case T_SHORT:
4797       if (ideal_opc == Op_SaturatingAddV) {
4798         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4799       } else {
4800         assert(ideal_opc == Op_SaturatingSubV, "");
4801         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4802       }
4803       break;
4804     default:
4805       fatal("Unsupported type %s", type2name(elem_bt));
4806       break;
4807   }
4808 }
4809 
4810 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4811                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4812   if (is_unsigned) {
4813     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4814   } else {
4815     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4816   }
4817 }
4818 
4819 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4820                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4821   switch (elem_bt) {
4822     case T_BYTE:
4823       if (ideal_opc == Op_SaturatingAddV) {
4824         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4825       } else {
4826         assert(ideal_opc == Op_SaturatingSubV, "");
4827         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4828       }
4829       break;
4830     case T_SHORT:
4831       if (ideal_opc == Op_SaturatingAddV) {
4832         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4833       } else {
4834         assert(ideal_opc == Op_SaturatingSubV, "");
4835         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4836       }
4837       break;
4838     default:
4839       fatal("Unsupported type %s", type2name(elem_bt));
4840       break;
4841   }
4842 }
4843 
4844 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4845                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4846   switch (elem_bt) {
4847     case T_BYTE:
4848       if (ideal_opc == Op_SaturatingAddV) {
4849         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4850       } else {
4851         assert(ideal_opc == Op_SaturatingSubV, "");
4852         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4853       }
4854       break;
4855     case T_SHORT:
4856       if (ideal_opc == Op_SaturatingAddV) {
4857         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4858       } else {
4859         assert(ideal_opc == Op_SaturatingSubV, "");
4860         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4861       }
4862       break;
4863     default:
4864       fatal("Unsupported type %s", type2name(elem_bt));
4865       break;
4866   }
4867 }
4868 
4869 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4870                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4871                                     bool is_varshift) {
4872   switch (ideal_opc) {
4873     case Op_AddVB:
4874       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_AddVS:
4876       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_AddVI:
4878       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_AddVL:
4880       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_AddVF:
4882       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_AddVD:
4884       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4885     case Op_SubVB:
4886       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4887     case Op_SubVS:
4888       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4889     case Op_SubVI:
4890       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4891     case Op_SubVL:
4892       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4893     case Op_SubVF:
4894       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4895     case Op_SubVD:
4896       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4897     case Op_MulVS:
4898       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4899     case Op_MulVI:
4900       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4901     case Op_MulVL:
4902       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4903     case Op_MulVF:
4904       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4905     case Op_MulVD:
4906       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4907     case Op_DivVF:
4908       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4909     case Op_DivVD:
4910       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4911     case Op_SqrtVF:
4912       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4913     case Op_SqrtVD:
4914       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4915     case Op_AbsVB:
4916       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4917     case Op_AbsVS:
4918       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4919     case Op_AbsVI:
4920       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4921     case Op_AbsVL:
4922       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4923     case Op_FmaVF:
4924       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4925     case Op_FmaVD:
4926       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4927     case Op_VectorRearrange:
4928       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4929     case Op_LShiftVS:
4930       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4931     case Op_LShiftVI:
4932       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4933     case Op_LShiftVL:
4934       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4935     case Op_RShiftVS:
4936       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4937     case Op_RShiftVI:
4938       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4939     case Op_RShiftVL:
4940       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4941     case Op_URShiftVS:
4942       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4943     case Op_URShiftVI:
4944       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4945     case Op_URShiftVL:
4946       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4947     case Op_RotateLeftV:
4948       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4949     case Op_RotateRightV:
4950       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4951     case Op_MaxV:
4952       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4953     case Op_MinV:
4954       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4955     case Op_UMinV:
4956       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4957     case Op_UMaxV:
4958       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4959     case Op_XorV:
4960       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4961     case Op_OrV:
4962       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4963     case Op_AndV:
4964       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4965     default:
4966       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4967       break;
4968   }
4969 }
4970 
4971 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4972                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4973   switch (ideal_opc) {
4974     case Op_AddVB:
4975       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4976     case Op_AddVS:
4977       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4978     case Op_AddVI:
4979       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4980     case Op_AddVL:
4981       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4982     case Op_AddVF:
4983       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4984     case Op_AddVD:
4985       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4986     case Op_SubVB:
4987       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4988     case Op_SubVS:
4989       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4990     case Op_SubVI:
4991       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4992     case Op_SubVL:
4993       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4994     case Op_SubVF:
4995       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4996     case Op_SubVD:
4997       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4998     case Op_MulVS:
4999       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
5000     case Op_MulVI:
5001       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
5002     case Op_MulVL:
5003       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
5004     case Op_MulVF:
5005       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
5006     case Op_MulVD:
5007       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
5008     case Op_DivVF:
5009       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
5010     case Op_DivVD:
5011       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
5012     case Op_FmaVF:
5013       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
5014     case Op_FmaVD:
5015       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
5016     case Op_MaxV:
5017       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5018     case Op_MinV:
5019       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5020     case Op_UMaxV:
5021       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5022     case Op_UMinV:
5023       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5024     case Op_XorV:
5025       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5026     case Op_OrV:
5027       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5028     case Op_AndV:
5029       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5030     default:
5031       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5032       break;
5033   }
5034 }
5035 
5036 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5037                                   KRegister src1, KRegister src2) {
5038   BasicType etype = T_ILLEGAL;
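       // Pick the k-register operation width from the mask length: masks of up to 8 lanes
       // use the byte form of the k-op, 16 lanes the word form, 32 the doubleword form and
       // 64 the quadword form.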
5039   switch(mask_len) {
5040     case 2:
5041     case 4:
5042     case 8:  etype = T_BYTE; break;
5043     case 16: etype = T_SHORT; break;
5044     case 32: etype = T_INT; break;
5045     case 64: etype = T_LONG; break;
5046     default: fatal("Unsupported type"); break;
5047   }
5048   assert(etype != T_ILLEGAL, "");
5049   switch(ideal_opc) {
5050     case Op_AndVMask:
5051       kand(etype, dst, src1, src2); break;
5052     case Op_OrVMask:
5053       kor(etype, dst, src1, src2); break;
5054     case Op_XorVMask:
5055       kxor(etype, dst, src1, src2); break;
5056     default:
5057       fatal("Unsupported masked operation"); break;
5058   }
5059 }
5060 
5061 /*
5062  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5063  * If src is NaN, the result is 0.
5064  * If src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5065  * the result is equal to the value of Integer.MIN_VALUE.
5066  * If src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5067  * the result is equal to the value of Integer.MAX_VALUE.
5068  */
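     // The cast instruction itself produces 0x80000000 for each of these special inputs;
     // the routine below patches such lanes to the values required above.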
5069 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5070                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5071                                                                    Register rscratch, AddressLiteral float_sign_flip,
5072                                                                    int vec_enc) {
5073   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5074   Label done;
5075   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5076   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5077   vptest(xtmp2, xtmp2, vec_enc);
5078   jccb(Assembler::equal, done);
5079 
5080   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5081   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5082 
5083   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5084   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5085   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5086 
5087   // Recompute the mask for the remaining special values.
5088   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5089   // Extract SRC values corresponding to TRUE mask lanes.
5090   vpand(xtmp4, xtmp2, src, vec_enc);
5091   // Flip the mask bits so that the MSBs of the MASK lanes corresponding to +ve special
5092   // values are set.
5093   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5094 
5095   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5096   bind(done);
5097 }
5098 
5099 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5100                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5101                                                                     Register rscratch, AddressLiteral float_sign_flip,
5102                                                                     int vec_enc) {
5103   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5104   Label done;
5105   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5106   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5107   kortestwl(ktmp1, ktmp1);
5108   jccb(Assembler::equal, done);
5109 
5110   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5111   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5112   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5113 
5114   kxorwl(ktmp1, ktmp1, ktmp2);
5115   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
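       // vpternlogd with immediate 0x11 is a bitwise NOT of xtmp1: it turns the sign-flip
       // constant (Integer.MIN_VALUE) into Integer.MAX_VALUE for the lanes selected next.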
5116   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5117   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5118   bind(done);
5119 }
5120 
5121 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5122                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5123                                                                      Register rscratch, AddressLiteral double_sign_flip,
5124                                                                      int vec_enc) {
5125   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5126 
5127   Label done;
5128   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5129   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5130   kortestwl(ktmp1, ktmp1);
5131   jccb(Assembler::equal, done);
5132 
5133   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5134   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5135   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5136 
5137   kxorwl(ktmp1, ktmp1, ktmp2);
5138   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5139   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5140   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5141   bind(done);
5142 }
5143 
5144 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5145                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5146                                                                      Register rscratch, AddressLiteral float_sign_flip,
5147                                                                      int vec_enc) {
5148   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5149   Label done;
5150   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5151   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5152   kortestwl(ktmp1, ktmp1);
5153   jccb(Assembler::equal, done);
5154 
5155   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5156   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5157   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5158 
5159   kxorwl(ktmp1, ktmp1, ktmp2);
5160   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5161   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5162   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5163   bind(done);
5164 }
5165 
5166 /*
5167  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5168  * If src is NaN, the result is 0.
5169  * If src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5170  * the result is equal to the value of Long.MIN_VALUE.
5171  * If src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5172  * the result is equal to the value of Long.MAX_VALUE.
5173  */
5174 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5175                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5176                                                                       Register rscratch, AddressLiteral double_sign_flip,
5177                                                                       int vec_enc) {
5178   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5179 
5180   Label done;
5181   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5182   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5183   kortestwl(ktmp1, ktmp1);
5184   jccb(Assembler::equal, done);
5185 
5186   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5187   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5188   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5189 
5190   kxorwl(ktmp1, ktmp1, ktmp2);
5191   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5192   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5193   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5194   bind(done);
5195 }
5196 
5197 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5198                                                              XMMRegister xtmp, int index, int vec_enc) {
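       // The index immediate follows the vshufps encoding: 0x88 selects doublewords {0, 2}
       // from each source (the low doubleword of every quadword lane), 0x44 selects {0, 1}.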
5199   assert(vec_enc < Assembler::AVX_512bit, "");
5200   if (vec_enc == Assembler::AVX_256bit) {
5201     vextractf128_high(xtmp, src);
5202     vshufps(dst, src, xtmp, index, vec_enc);
5203   } else {
5204     vshufps(dst, src, zero, index, vec_enc);
5205   }
5206 }
5207 
5208 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5209                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5210                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5211   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5212 
5213   Label done;
5214   // Compare the destination lanes with the float_sign_flip
5215   // value to get a mask of all special values.
5216   movdqu(xtmp1, float_sign_flip, rscratch);
5217   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5218   ptest(xtmp2, xtmp2);
5219   jccb(Assembler::equal, done);
5220 
5221   // Flip float_sign_flip to get max integer value.
5222   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5223   pxor(xtmp1, xtmp4);
5224 
5225   // Set destination lanes corresponding to unordered source lanes to zero.
5226   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5227   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5228 
5229   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5230   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5231   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5232 
5233   // Recompute the mask for the remaining special values.
5234   pxor(xtmp2, xtmp3);
5235   // Extract mask corresponding to non-negative source lanes.
5236   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5237 
5238   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5239   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5240   pand(xtmp3, xtmp2);
5241 
5242   // Replace destination lanes holding the special value (0x80000000) with max int
5243   // if the corresponding source lane holds a +ve value.
5244   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5245   bind(done);
5246 }
5247 
5248 
5249 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5250                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5251   switch(to_elem_bt) {
5252     case T_SHORT:
5253       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5254       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5255       vpackusdw(dst, dst, zero, vec_enc);
5256       if (vec_enc == Assembler::AVX_256bit) {
5257         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5258       }
5259       break;
5260     case  T_BYTE:
5261       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5262       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5263       vpackusdw(dst, dst, zero, vec_enc);
5264       if (vec_enc == Assembler::AVX_256bit) {
5265         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5266       }
5267       vpackuswb(dst, dst, zero, vec_enc);
5268       break;
5269     default: assert(false, "%s", type2name(to_elem_bt));
5270   }
5271 }
5272 
5273 /*
5274  * Algorithm for vector D2L and F2I conversions:
5275  * a) Perform the vector D2L/F2I cast.
5276  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
5277  *    That value signifies that the source could be any of the special floating point
5278  *    values (NaN, -Inf, Inf, Max, -Min).
5279  * c) Set the destination to zero if the source is a NaN value.
5280  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5281  */
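     // For example (F2I, illustrative): src = {1.5f, NaN, +Inf, -Inf} casts to
     // {1, 0x80000000, 0x80000000, 0x80000000}; step (c) zeroes the NaN lane and step (d)
     // rewrites the +Inf lane to MaxInt, leaving {1, 0, 0x7fffffff, 0x80000000}.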
5282 
5283 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5284                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5285                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5286   int to_elem_sz = type2aelembytes(to_elem_bt);
5287   assert(to_elem_sz <= 4, "");
5288   vcvttps2dq(dst, src, vec_enc);
5289   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5290   if (to_elem_sz < 4) {
5291     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5292     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5293   }
5294 }
5295 
5296 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5297                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5298                                             Register rscratch, int vec_enc) {
5299   int to_elem_sz = type2aelembytes(to_elem_bt);
5300   assert(to_elem_sz <= 4, "");
5301   vcvttps2dq(dst, src, vec_enc);
5302   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5303   switch(to_elem_bt) {
5304     case T_INT:
5305       break;
5306     case T_SHORT:
5307       evpmovdw(dst, dst, vec_enc);
5308       break;
5309     case T_BYTE:
5310       evpmovdb(dst, dst, vec_enc);
5311       break;
5312     default: assert(false, "%s", type2name(to_elem_bt));
5313   }
5314 }
5315 
5316 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5317                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5318                                             Register rscratch, int vec_enc) {
5319   evcvttps2qq(dst, src, vec_enc);
5320   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5321 }
5322 
5323 // Handling for downcasting from double to integer or sub-word types on AVX2.
5324 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5325                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5326                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5327   int to_elem_sz = type2aelembytes(to_elem_bt);
5328   assert(to_elem_sz < 8, "");
5329   vcvttpd2dq(dst, src, vec_enc);
5330   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5331                                               float_sign_flip, vec_enc);
5332   if (to_elem_sz < 4) {
5333     // xtmp4 holds all zero lanes.
5334     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5335   }
5336 }
5337 
5338 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5339                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5340                                             KRegister ktmp2, AddressLiteral sign_flip,
5341                                             Register rscratch, int vec_enc) {
5342   if (VM_Version::supports_avx512dq()) {
5343     evcvttpd2qq(dst, src, vec_enc);
5344     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5345     switch(to_elem_bt) {
5346       case T_LONG:
5347         break;
5348       case T_INT:
5349         evpmovsqd(dst, dst, vec_enc);
5350         break;
5351       case T_SHORT:
5352         evpmovsqd(dst, dst, vec_enc);
5353         evpmovdw(dst, dst, vec_enc);
5354         break;
5355       case T_BYTE:
5356         evpmovsqd(dst, dst, vec_enc);
5357         evpmovdb(dst, dst, vec_enc);
5358         break;
5359       default: assert(false, "%s", type2name(to_elem_bt));
5360     }
5361   } else {
5362     assert(type2aelembytes(to_elem_bt) <= 4, "");
5363     vcvttpd2dq(dst, src, vec_enc);
5364     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5365     switch(to_elem_bt) {
5366       case T_INT:
5367         break;
5368       case T_SHORT:
5369         evpmovdw(dst, dst, vec_enc);
5370         break;
5371       case T_BYTE:
5372         evpmovdb(dst, dst, vec_enc);
5373         break;
5374       default: assert(false, "%s", type2name(to_elem_bt));
5375     }
5376   }
5377 }
5378 
5379 #ifdef _LP64
5380 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5381                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5382                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5383   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
5384   // and restore the original MXCSR.RC mode after that.
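       // floor(x + 0.5) implements round-half-up: e.g. 2.5 + 0.5 = 3.0 -> 3 and
       // -2.5 + 0.5 = -2.0 -> -2; the convert produces the floor because MXCSR.RC is
       // set to round toward -inf.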
5385   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5386 
5387   mov64(tmp, julong_cast(0.5L));
5388   evpbroadcastq(xtmp1, tmp, vec_enc);
5389   vaddpd(xtmp1, src , xtmp1, vec_enc);
5390   evcvtpd2qq(dst, xtmp1, vec_enc);
5391   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5392                                                 double_sign_flip, vec_enc);
5393 
5394   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5395 }
5396 
5397 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5398                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5399                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5400   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
5401   // and restore the original MXCSR.RC mode after that.
5402   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5403 
5404   movl(tmp, jint_cast(0.5));
5405   movq(xtmp1, tmp);
5406   vbroadcastss(xtmp1, xtmp1, vec_enc);
5407   vaddps(xtmp1, src , xtmp1, vec_enc);
5408   vcvtps2dq(dst, xtmp1, vec_enc);
5409   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5410                                               float_sign_flip, vec_enc);
5411 
5412   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5413 }
5414 
5415 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5416                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5417                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5418   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
5419   // and restore the original MXCSR.RC mode after that.
5420   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5421 
5422   movl(tmp, jint_cast(0.5));
5423   movq(xtmp1, tmp);
5424   vbroadcastss(xtmp1, xtmp1, vec_enc);
5425   vaddps(xtmp1, src, xtmp1, vec_enc);
5426   vcvtps2dq(dst, xtmp1, vec_enc);
5427   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5428 
5429   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5430 }
5431 #endif // _LP64
5432 
5433 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5434                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5435   switch (from_elem_bt) {
5436     case T_BYTE:
5437       switch (to_elem_bt) {
5438         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5439         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5440         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5441         default: ShouldNotReachHere();
5442       }
5443       break;
5444     case T_SHORT:
5445       switch (to_elem_bt) {
5446         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5447         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5448         default: ShouldNotReachHere();
5449       }
5450       break;
5451     case T_INT:
5452       assert(to_elem_bt == T_LONG, "");
5453       vpmovzxdq(dst, src, vlen_enc);
5454       break;
5455     default:
5456       ShouldNotReachHere();
5457   }
5458 }
5459 
5460 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5461                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5462   switch (from_elem_bt) {
5463     case T_BYTE:
5464       switch (to_elem_bt) {
5465         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5466         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5467         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5468         default: ShouldNotReachHere();
5469       }
5470       break;
5471     case T_SHORT:
5472       switch (to_elem_bt) {
5473         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5474         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5475         default: ShouldNotReachHere();
5476       }
5477       break;
5478     case T_INT:
5479       assert(to_elem_bt == T_LONG, "");
5480       vpmovsxdq(dst, src, vlen_enc);
5481       break;
5482     default:
5483       ShouldNotReachHere();
5484   }
5485 }
5486 
5487 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5488                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5489   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5490   assert(vlen_enc != AVX_512bit, "");
5491 
5492   int dst_bt_size = type2aelembytes(dst_bt);
5493   int src_bt_size = type2aelembytes(src_bt);
5494   if (dst_bt_size > src_bt_size) {
5495     switch (dst_bt_size / src_bt_size) {
5496       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5497       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5498       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5499       default: ShouldNotReachHere();
5500     }
5501   } else {
5502     assert(dst_bt_size < src_bt_size, "");
5503     switch (src_bt_size / dst_bt_size) {
5504       case 2: {
5505         if (vlen_enc == AVX_128bit) {
5506           vpacksswb(dst, src, src, vlen_enc);
5507         } else {
5508           vpacksswb(dst, src, src, vlen_enc);
5509           vpermq(dst, dst, 0x08, vlen_enc);
5510         }
5511         break;
5512       }
5513       case 4: {
5514         if (vlen_enc == AVX_128bit) {
5515           vpackssdw(dst, src, src, vlen_enc);
5516           vpacksswb(dst, dst, dst, vlen_enc);
5517         } else {
5518           vpackssdw(dst, src, src, vlen_enc);
5519           vpermq(dst, dst, 0x08, vlen_enc);
5520           vpacksswb(dst, dst, dst, AVX_128bit);
5521         }
5522         break;
5523       }
5524       case 8: {
5525         if (vlen_enc == AVX_128bit) {
5526           vpshufd(dst, src, 0x08, vlen_enc);
5527           vpackssdw(dst, dst, dst, vlen_enc);
5528           vpacksswb(dst, dst, dst, vlen_enc);
5529         } else {
5530           vpshufd(dst, src, 0x08, vlen_enc);
5531           vpermq(dst, dst, 0x08, vlen_enc);
5532           vpackssdw(dst, dst, dst, AVX_128bit);
5533           vpacksswb(dst, dst, dst, AVX_128bit);
5534         }
5535         break;
5536       }
5537       default: ShouldNotReachHere();
5538     }
5539   }
5540 }
5541 
5542 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5543                                    bool merge, BasicType bt, int vlen_enc) {
5544   if (bt == T_INT) {
5545     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5546   } else {
5547     assert(bt == T_LONG, "");
5548     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5549   }
5550 }
5551 
5552 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5553                                    bool merge, BasicType bt, int vlen_enc) {
5554   if (bt == T_INT) {
5555     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5556   } else {
5557     assert(bt == T_LONG, "");
5558     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5559   }
5560 }
5561 
5562 #ifdef _LP64
5563 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5564                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5565                                                int vec_enc) {
5566   int index = 0;
5567   int vindex = 0;
5568   mov64(rtmp1, 0x0101010101010101L);
5569   pdepq(rtmp1, src, rtmp1);
5570   if (mask_len > 8) {
5571     movq(rtmp2, src);
5572     vpxor(xtmp, xtmp, xtmp, vec_enc);
5573     movq(xtmp, rtmp1);
5574   }
5575   movq(dst, rtmp1);
5576 
5577   mask_len -= 8;
5578   while (mask_len > 0) {
5579     assert((mask_len & 0x7) == 0, "mask length must be a multiple of 8");
5580     index++;
5581     if ((index % 2) == 0) {
5582       pxor(xtmp, xtmp);
5583     }
5584     mov64(rtmp1, 0x0101010101010101L);
5585     shrq(rtmp2, 8);
5586     pdepq(rtmp1, rtmp2, rtmp1);
5587     pinsrq(xtmp, rtmp1, index % 2);
5588     vindex = index / 2;
5589     if (vindex) {
5590       // Write the entire 16 byte vector when both 64 bit
5591       // lanes are updated, to avoid redundant instructions.
5592       if (index % 2) {
5593         vinsertf128(dst, dst, xtmp, vindex);
5594       }
5595     } else {
5596       vmovdqu(dst, xtmp);
5597     }
5598     mask_len -= 8;
5599   }
5600 }
5601 
5602 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5603   switch(opc) {
5604     case Op_VectorMaskTrueCount:
5605       popcntq(dst, tmp);
5606       break;
5607     case Op_VectorMaskLastTrue:
5608       if (VM_Version::supports_lzcnt()) {
5609         lzcntq(tmp, tmp);
5610         movl(dst, 63);
5611         subl(dst, tmp);
5612       } else {
5613         movl(dst, -1);
5614         bsrq(tmp, tmp);
5615         cmov32(Assembler::notZero, dst, tmp);
5616       }
5617       break;
5618     case Op_VectorMaskFirstTrue:
5619       if (VM_Version::supports_bmi1()) {
5620         if (masklen < 32) {
5621           orl(tmp, 1 << masklen);
5622           tzcntl(dst, tmp);
5623         } else if (masklen == 32) {
5624           tzcntl(dst, tmp);
5625         } else {
5626           assert(masklen == 64, "");
5627           tzcntq(dst, tmp);
5628         }
5629       } else {
5630         if (masklen < 32) {
5631           orl(tmp, 1 << masklen);
5632           bsfl(dst, tmp);
5633         } else {
5634           assert(masklen == 32 || masklen == 64, "");
5635           movl(dst, masklen);
5636           if (masklen == 32)  {
5637             bsfl(tmp, tmp);
5638           } else {
5639             bsfq(tmp, tmp);
5640           }
5641           cmov32(Assembler::notZero, dst, tmp);
5642         }
5643       }
5644       break;
5645     case Op_VectorMaskToLong:
5646       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5647       break;
5648     default: assert(false, "Unhandled mask operation");
5649   }
5650 }
5651 
5652 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5653                                               int masklen, int masksize, int vec_enc) {
5654   assert(VM_Version::supports_popcnt(), "");
5655 
5656   if (VM_Version::supports_avx512bw()) {
5657     kmovql(tmp, mask);
5658   } else {
5659     assert(masklen <= 16, "");
5660     kmovwl(tmp, mask);
5661   }
5662 
5663   // A mask generated from partial vector comparison/replicate/mask manipulation
5664   // operations needs to be clipped to the valid mask length.
5665   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5666     andq(tmp, (1 << masklen) - 1);
5667   }
5668 
5669   vector_mask_operation_helper(opc, dst, tmp, masklen);
5670 }
5671 
5672 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5673                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5674   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5675          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5676   assert(VM_Version::supports_popcnt(), "");
5677 
5678   bool need_clip = false;
5679   switch(bt) {
5680     case T_BOOLEAN:
5681       // While masks of other types contain lane values of 0 or -1, boolean masks contain lane values of 0 or 1
5682       vpxor(xtmp, xtmp, xtmp, vec_enc);
5683       vpsubb(xtmp, xtmp, mask, vec_enc);
5684       vpmovmskb(tmp, xtmp, vec_enc);
5685       need_clip = masklen < 16;
5686       break;
5687     case T_BYTE:
5688       vpmovmskb(tmp, mask, vec_enc);
5689       need_clip = masklen < 16;
5690       break;
5691     case T_SHORT:
5692       vpacksswb(xtmp, mask, mask, vec_enc);
5693       if (masklen >= 16) {
5694         vpermpd(xtmp, xtmp, 8, vec_enc);
5695       }
5696       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5697       need_clip = masklen < 16;
5698       break;
5699     case T_INT:
5700     case T_FLOAT:
5701       vmovmskps(tmp, mask, vec_enc);
5702       need_clip = masklen < 4;
5703       break;
5704     case T_LONG:
5705     case T_DOUBLE:
5706       vmovmskpd(tmp, mask, vec_enc);
5707       need_clip = masklen < 2;
5708       break;
5709     default: assert(false, "Unhandled type, %s", type2name(bt));
5710   }
5711 
5712   // A mask generated from partial vector comparison/replicate/mask manipulation
5713   // operations needs to be clipped to the valid mask length.
5714   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5715     // need_clip implies masklen < 32
5716     andq(tmp, (1 << masklen) - 1);
5717   }
5718 
5719   vector_mask_operation_helper(opc, dst, tmp, masklen);
5720 }
5721 
5722 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5723                                              Register rtmp2, int mask_len) {
5724   kmov(rtmp1, src);
5725   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5726   mov64(rtmp2, -1L);
5727   pextq(rtmp2, rtmp2, rtmp1);
5728   kmov(dst, rtmp2);
5729 }
5730 
5731 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5732                                                     XMMRegister mask, Register rtmp, Register rscratch,
5733                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5734                                                     int vec_enc) {
5735   assert(type2aelembytes(bt) >= 4, "");
5736   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5737   address compress_perm_table = nullptr;
5738   address expand_perm_table = nullptr;
5739   if (type2aelembytes(bt) == 8) {
5740     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5741     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5742     vmovmskpd(rtmp, mask, vec_enc);
5743   } else {
5744     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5745     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5746     vmovmskps(rtmp, mask, vec_enc);
5747   }
5748   shlq(rtmp, 5); // for 32 byte permute row.
5749   if (opcode == Op_CompressV) {
5750     lea(rscratch, ExternalAddress(compress_perm_table));
5751   } else {
5752     lea(rscratch, ExternalAddress(expand_perm_table));
5753   }
5754   addptr(rtmp, rscratch);
5755   vmovdqu(permv, Address(rtmp));
5756   vpermps(dst, permv, src, Assembler::AVX_256bit);
5757   vpxor(xtmp, xtmp, xtmp, vec_enc);
5758   // Blend the result with a zero vector using the permute mask: each column entry
5759   // in a permute table row contains either a valid permute index or a -1 (default)
5760   // value, so the row can also serve as a blending mask after compressing/expanding
5761   // the source vector lanes.
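       // For example, compressing an 8-lane vector with mask 0b00000101 uses a permute
       // row of the form [0, 2, -1, -1, -1, -1, -1, -1] (assuming the table layout
       // described above): vpermps gathers lanes 0 and 2 into the two lowest positions,
       // and the -1 entries, whose sign bit is set, select the zero vector during blending.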
5762   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5763 }
5764 
5765 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5766                                                bool merge, BasicType bt, int vec_enc) {
5767   if (opcode == Op_CompressV) {
5768     switch(bt) {
5769     case T_BYTE:
5770       evpcompressb(dst, mask, src, merge, vec_enc);
5771       break;
5772     case T_CHAR:
5773     case T_SHORT:
5774       evpcompressw(dst, mask, src, merge, vec_enc);
5775       break;
5776     case T_INT:
5777       evpcompressd(dst, mask, src, merge, vec_enc);
5778       break;
5779     case T_FLOAT:
5780       evcompressps(dst, mask, src, merge, vec_enc);
5781       break;
5782     case T_LONG:
5783       evpcompressq(dst, mask, src, merge, vec_enc);
5784       break;
5785     case T_DOUBLE:
5786       evcompresspd(dst, mask, src, merge, vec_enc);
5787       break;
5788     default:
5789       fatal("Unsupported type %s", type2name(bt));
5790       break;
5791     }
5792   } else {
5793     assert(opcode == Op_ExpandV, "");
5794     switch(bt) {
5795     case T_BYTE:
5796       evpexpandb(dst, mask, src, merge, vec_enc);
5797       break;
5798     case T_CHAR:
5799     case T_SHORT:
5800       evpexpandw(dst, mask, src, merge, vec_enc);
5801       break;
5802     case T_INT:
5803       evpexpandd(dst, mask, src, merge, vec_enc);
5804       break;
5805     case T_FLOAT:
5806       evexpandps(dst, mask, src, merge, vec_enc);
5807       break;
5808     case T_LONG:
5809       evpexpandq(dst, mask, src, merge, vec_enc);
5810       break;
5811     case T_DOUBLE:
5812       evexpandpd(dst, mask, src, merge, vec_enc);
5813       break;
5814     default:
5815       fatal("Unsupported type %s", type2name(bt));
5816       break;
5817     }
5818   }
5819 }
5820 #endif
5821 
5822 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5823                                            KRegister ktmp1, int vec_enc) {
5824   if (opcode == Op_SignumVD) {
5825     vsubpd(dst, zero, one, vec_enc);
5826     // dst = (src < 0) ? -1 : 1
5827     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5828     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5829     // if src is NaN, -0.0 or 0.0, return src.
5830     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5831     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5832   } else {
5833     assert(opcode == Op_SignumVF, "");
5834     vsubps(dst, zero, one, vec_enc);
5835     // dst = (src < 0) ? -1 : 1
5836     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5837     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5838     // if src is NaN, -0.0 or 0.0, return src.
5839     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5840     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5841   }
5842 }
5843 
5844 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5845                                           XMMRegister xtmp1, int vec_enc) {
5846   if (opcode == Op_SignumVD) {
5847     vsubpd(dst, zero, one, vec_enc);
5848     // dst = (src < 0) ? -1 : 1
5849     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5850     // if src is NaN, -0.0 or 0.0, return src.
5851     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5852     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5853   } else {
5854     assert(opcode == Op_SignumVF, "");
5855     vsubps(dst, zero, one, vec_enc);
5856     // dst = (src < 0) ? -1 : 1
5857     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5858     // if src is NaN, -0.0 or 0.0, return src.
5859     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5860     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5861   }
5862 }
5863 
5864 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5865   if (VM_Version::supports_avx512bw()) {
5866     if (mask_len > 32) {
5867       kmovql(dst, src);
5868     } else {
5869       kmovdl(dst, src);
5870       if (mask_len != 32) {
5871         kshiftrdl(dst, dst, 32 - mask_len);
5872       }
5873     }
5874   } else {
5875     assert(mask_len <= 16, "");
5876     kmovwl(dst, src);
5877     if (mask_len != 16) {
5878       kshiftrwl(dst, dst, 16 - mask_len);
5879     }
5880   }
5881 }
5882 
5883 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5884   int lane_size = type2aelembytes(bt);
5885   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5886   if ((is_LP64 || lane_size < 8) &&
5887       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5888        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5889     movptr(rtmp, imm32);
5890     switch(lane_size) {
5891       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5892       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5893       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5894       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5895       default: fatal("Unsupported lane size %d", lane_size);
5896       break;
5897     }
5898   } else {
5899     movptr(rtmp, imm32);
5900     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5901     switch(lane_size) {
5902       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5903       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5904       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5905       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5906       default: fatal("Unsupported lane size %d", lane_size);
5907       break;
5908     }
5909   }
5910 }
5911 
5912 //
5913 // The following is a lookup table based popcount computation algorithm:
5914 //       Index   Bit set count
5915 //     [ 0000 ->   0,
5916 //       0001 ->   1,
5917 //       0010 ->   1,
5918 //       0011 ->   2,
5919 //       0100 ->   1,
5920 //       0101 ->   2,
5921 //       0110 ->   2,
5922 //       0111 ->   3,
5923 //       1000 ->   1,
5924 //       1001 ->   2,
5925 //       1010 ->   2,
5926 //       1011 ->   3,
5927 //       1100 ->   2,
5928 //       1101 ->   3,
     //       1110 ->   3,
5929 //       1111 ->   4 ]
5930 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5931 //     shuffle indices for lookup table access.
5932 //  b. Right shift each byte of vector lane by 4 positions.
5933 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5934 //     shuffle indices for lookup table access.
5935 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5936 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5937 //     count of all the bytes of a quadword.
5938 //  f. Perform step e. for upper 128bit vector lane.
5939 //  g. Pack the bitset count of quadwords back to double word.
5940 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
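     //
     //  A scalar sketch of the same lookup table approach (illustrative only, not part
     //  of the generated code):
     //
     //    static const uint8_t NIBBLE_POPCNT_LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
     //    uint8_t popcount_byte(uint8_t b) {
     //      return NIBBLE_POPCNT_LUT[b & 0x0F] + NIBBLE_POPCNT_LUT[b >> 4]; // steps a. - d.
     //    }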
5941 
5942 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5943                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5944   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5945   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5946   vpsrlw(dst, src, 4, vec_enc);
5947   vpand(dst, dst, xtmp1, vec_enc);
5948   vpand(xtmp1, src, xtmp1, vec_enc);
5949   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5950   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5951   vpshufb(dst, xtmp2, dst, vec_enc);
5952   vpaddb(dst, dst, xtmp1, vec_enc);
5953 }
5954 
5955 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5956                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5957   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5958   // Following code is as per steps e,f,g and h of above algorithm.
5959   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5960   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5961   vpsadbw(dst, dst, xtmp2, vec_enc);
5962   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5963   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5964   vpackuswb(dst, xtmp1, dst, vec_enc);
5965 }
5966 
5967 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5968                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5969   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5970   // Add the popcount of upper and lower bytes of word.
5971   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5972   vpsrlw(dst, xtmp1, 8, vec_enc);
5973   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5974   vpaddw(dst, dst, xtmp1, vec_enc);
5975 }
5976 
5977 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5978                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5979   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5980   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5981   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5982 }
5983 
5984 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5985                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5986   switch(bt) {
5987     case T_LONG:
5988       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5989       break;
5990     case T_INT:
5991       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5992       break;
5993     case T_CHAR:
5994     case T_SHORT:
5995       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5996       break;
5997     case T_BYTE:
5998     case T_BOOLEAN:
5999       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
6000       break;
6001     default:
6002       fatal("Unsupported type %s", type2name(bt));
6003       break;
6004   }
6005 }
6006 
6007 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6008                                                       KRegister mask, bool merge, int vec_enc) {
6009   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6010   switch(bt) {
6011     case T_LONG:
6012       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6013       evpopcntq(dst, mask, src, merge, vec_enc);
6014       break;
6015     case T_INT:
6016       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6017       evpopcntd(dst, mask, src, merge, vec_enc);
6018       break;
6019     case T_CHAR:
6020     case T_SHORT:
6021       assert(VM_Version::supports_avx512_bitalg(), "");
6022       evpopcntw(dst, mask, src, merge, vec_enc);
6023       break;
6024     case T_BYTE:
6025     case T_BOOLEAN:
6026       assert(VM_Version::supports_avx512_bitalg(), "");
6027       evpopcntb(dst, mask, src, merge, vec_enc);
6028       break;
6029     default:
6030       fatal("Unsupported type %s", type2name(bt));
6031       break;
6032   }
6033 }
6034 
6035 #ifndef _LP64
6036 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
6037   assert(VM_Version::supports_avx512bw(), "");
6038   kmovdl(tmp, src);
6039   kunpckdql(dst, tmp, tmp);
6040 }
6041 #endif
6042 
6043 // The bit reversal algorithm first reverses the bits of each byte, followed by
6044 // a byte level reversal for multi-byte primitive types (short/int/long).
6045 // The algorithm performs a lookup table access to get the reverse bit sequence
6046 // corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
6047 // is obtained by swapping the reverse bit sequences of the upper and lower
6048 // nibbles of the byte.
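     // A scalar sketch of the byte-level lookup table step (illustrative only, not part
     // of the generated code):
     //
     //   // NIBBLE_REVERSE_LUT[i] holds the bit-reversed value of the 4 bit index i.
     //   static const uint8_t NIBBLE_REVERSE_LUT[16] =
     //     {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE, 0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
     //   uint8_t reverse_byte_bits(uint8_t b) {
     //     return (uint8_t)((NIBBLE_REVERSE_LUT[b & 0x0F] << 4) | NIBBLE_REVERSE_LUT[b >> 4]);
     //   }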
6049 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6050                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6051   if (VM_Version::supports_avx512vlbw()) {
6052 
6053     // Get the reverse bit sequence of lower nibble of each byte.
6054     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6055     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6056     evpandq(dst, xtmp2, src, vec_enc);
6057     vpshufb(dst, xtmp1, dst, vec_enc);
6058     vpsllq(dst, dst, 4, vec_enc);
6059 
6060     // Get the reverse bit sequence of upper nibble of each byte.
6061     vpandn(xtmp2, xtmp2, src, vec_enc);
6062     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6063     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6064 
6065     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6066     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6067     evporq(xtmp2, dst, xtmp2, vec_enc);
6068     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6069 
6070   } else if (vec_enc == Assembler::AVX_512bit) {
6071     // Shift based bit reversal.
6072     assert(bt == T_LONG || bt == T_INT, "");
6073 
6074     // Swap lower and upper nibble of each byte.
6075     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6076 
6077     // Swap two least and most significant bits of each nibble.
6078     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6079 
6080     // Swap adjacent pair of bits.
6081     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6082     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6083 
6084     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6085     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6086   } else {
6087     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6088     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6089 
6090     // Get the reverse bit sequence of lower nibble of each byte.
6091     vpand(dst, xtmp2, src, vec_enc);
6092     vpshufb(dst, xtmp1, dst, vec_enc);
6093     vpsllq(dst, dst, 4, vec_enc);
6094 
6095     // Get the reverse bit sequence of upper nibble of each byte.
6096     vpandn(xtmp2, xtmp2, src, vec_enc);
6097     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6098     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6099 
6100     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
6101     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6102     vpor(xtmp2, dst, xtmp2, vec_enc);
6103     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6104   }
6105 }
6106 
6107 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6108                                                 XMMRegister xtmp, Register rscratch) {
6109   assert(VM_Version::supports_gfni(), "");
6110   assert(rscratch != noreg || always_reachable(mask), "missing");
6111 
6112   // Galois field instruction based bit reversal based on following algorithm.
6113   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6114   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6115   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6116   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6117 }
6118 
6119 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6120                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6121   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6122   evpandq(dst, xtmp1, src, vec_enc);
6123   vpsllq(dst, dst, nbits, vec_enc);
6124   vpandn(xtmp1, xtmp1, src, vec_enc);
6125   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6126   evporq(dst, dst, xtmp1, vec_enc);
6127 }
6128 
6129 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6130                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6131   // Shift based bit reversal.
6132   assert(VM_Version::supports_evex(), "");
6133   switch(bt) {
6134     case T_LONG:
6135       // Swap upper and lower double word of each quad word.
6136       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6137       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6138       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6139       break;
6140     case T_INT:
6141       // Swap upper and lower word of each double word.
6142       evprord(xtmp1, k0, src, 16, true, vec_enc);
6143       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6144       break;
6145     case T_CHAR:
6146     case T_SHORT:
6147       // Swap upper and lower byte of each word.
6148       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6149       break;
6150     case T_BYTE:
6151       evmovdquq(dst, k0, src, true, vec_enc);
6152       break;
6153     default:
6154       fatal("Unsupported type %s", type2name(bt));
6155       break;
6156   }
6157 }
6158 
6159 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6160   if (bt == T_BYTE) {
6161     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6162       evmovdquq(dst, k0, src, true, vec_enc);
6163     } else {
6164       vmovdqu(dst, src);
6165     }
6166     return;
6167   }
6168   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6169   // pre-computed shuffle indices.
6170   switch(bt) {
6171     case T_LONG:
6172       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6173       break;
6174     case T_INT:
6175       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6176       break;
6177     case T_CHAR:
6178     case T_SHORT:
6179       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6180       break;
6181     default:
6182       fatal("Unsupported type %s", type2name(bt));
6183       break;
6184   }
6185   vpshufb(dst, src, dst, vec_enc);
6186 }
6187 
6188 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6189                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6190                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6191   assert(is_integral_type(bt), "");
6192   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6193   assert(VM_Version::supports_avx512cd(), "");
6194   switch(bt) {
6195     case T_LONG:
6196       evplzcntq(dst, ktmp, src, merge, vec_enc);
6197       break;
6198     case T_INT:
6199       evplzcntd(dst, ktmp, src, merge, vec_enc);
6200       break;
6201     case T_SHORT:
6202       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6203       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6204       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6205       vpunpckhwd(dst, xtmp1, src, vec_enc);
6206       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6207       vpackusdw(dst, xtmp2, dst, vec_enc);
6208       break;
6209     case T_BYTE:
6210       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6211       // accessing the lookup table.
6212       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6213       // accessing the lookup table.
6214       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
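           // For example, for the byte 0x0C the 4 MSB bits are zero, so the result is
           // T2 + T1 = 4 + 0 = 4 = CLZ(0x0C); for 0x1C the MSB nibble is non-zero, so
           // the result is just T2 = 3 = CLZ(0x1C).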
6215       assert(VM_Version::supports_avx512bw(), "");
6216       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6217       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6218       vpand(xtmp2, dst, src, vec_enc);
6219       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6220       vpsrlw(xtmp3, src, 4, vec_enc);
6221       vpand(xtmp3, dst, xtmp3, vec_enc);
6222       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6223       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6224       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6225       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6226       break;
6227     default:
6228       fatal("Unsupported type %s", type2name(bt));
6229       break;
6230   }
6231 }
6232 
6233 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6234                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6235   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6236   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6237   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6238   // accessing the lookup table.
6239   vpand(dst, xtmp2, src, vec_enc);
6240   vpshufb(dst, xtmp1, dst, vec_enc);
6241   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6242   // accessing the lookup table.
6243   vpsrlw(xtmp3, src, 4, vec_enc);
6244   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6245   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6246   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6247   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6248   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6249   vpaddb(dst, dst, xtmp2, vec_enc);
6250   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6251 }
6252 
6253 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6254                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6255   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6256   // Add zero counts of lower byte and upper byte of a word if
6257   // upper byte holds a zero value.
6258   vpsrlw(xtmp3, src, 8, vec_enc);
6259   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6260   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6261   vpsllw(xtmp2, dst, 8, vec_enc);
6262   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6263   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6264   vpsrlw(dst, dst, 8, vec_enc);
6265 }
6266 
6267 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6268                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6269   // Since the IEEE 754 floating point format represents the mantissa in normalized
6270   // 1.x form, the biased exponent can be used to compute the leading zero count as
6271   // per the following formula:
6272   // LZCNT = 31 - (biased_exp - 127)
6273   // Special handling has been introduced for Zero, Max_Int and -ve source values.
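       // Worked example: src = 16 converts to 1.0 x 2^4 with biased exponent 131, so
       // LZCNT = 31 - (131 - 127) = 27, which matches lzcnt(16).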
6274 
6275   // Broadcast 0xFF
6276   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6277   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6278 
6279   // Extract biased exponent.
6280   vcvtdq2ps(dst, src, vec_enc);
6281   vpsrld(dst, dst, 23, vec_enc);
6282   vpand(dst, dst, xtmp1, vec_enc);
6283 
6284   // Broadcast 127.
6285   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6286   // Exponent = biased_exp - 127
6287   vpsubd(dst, dst, xtmp1, vec_enc);
6288 
6289   // Exponent = Exponent  + 1
6290   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6291   vpaddd(dst, dst, xtmp3, vec_enc);
6292 
6293   // Replace -ve exponent with zero, exponent is -ve when src
6294   // lane contains a zero value.
6295   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6296   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6297 
6298   // Rematerialize broadcast 32.
6299   vpslld(xtmp1, xtmp3, 5, vec_enc);
6300   // Exponent is 32 if corresponding source lane contains max_int value.
6301   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6302   // LZCNT = 32 - exponent
6303   vpsubd(dst, xtmp1, dst, vec_enc);
6304 
6305   // Replace LZCNT with a value 1 if corresponding source lane
6306   // contains max_int value.
6307   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6308 
6309   // Replace the computed LZCNT with 0 if the source lane value is less than zero.
6310   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6311   vblendvps(dst, dst, xtmp2, src, vec_enc);
6312 }
6313 
6314 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6315                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6316   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6317   // Add zero counts of lower word and upper word of a double word if
6318   // upper word holds a zero value.
6319   vpsrld(xtmp3, src, 16, vec_enc);
6320   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6321   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6322   vpslld(xtmp2, dst, 16, vec_enc);
6323   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6324   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6325   vpsrld(dst, dst, 16, vec_enc);
6326   // Add zero counts of lower doubleword and upper doubleword of a
6327   // quadword if upper doubleword holds a zero value.
6328   vpsrlq(xtmp3, src, 32, vec_enc);
6329   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6330   vpsllq(xtmp2, dst, 32, vec_enc);
6331   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6332   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6333   vpsrlq(dst, dst, 32, vec_enc);
6334 }
6335 
6336 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6337                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6338                                                        Register rtmp, int vec_enc) {
6339   assert(is_integral_type(bt), "unexpected type");
6340   assert(vec_enc < Assembler::AVX_512bit, "");
6341   switch(bt) {
6342     case T_LONG:
6343       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6344       break;
6345     case T_INT:
6346       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6347       break;
6348     case T_SHORT:
6349       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6350       break;
6351     case T_BYTE:
6352       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6353       break;
6354     default:
6355       fatal("Unsupported type %s", type2name(bt));
6356       break;
6357   }
6358 }
6359 
6360 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6361   switch(bt) {
6362     case T_BYTE:
6363       vpsubb(dst, src1, src2, vec_enc);
6364       break;
6365     case T_SHORT:
6366       vpsubw(dst, src1, src2, vec_enc);
6367       break;
6368     case T_INT:
6369       vpsubd(dst, src1, src2, vec_enc);
6370       break;
6371     case T_LONG:
6372       vpsubq(dst, src1, src2, vec_enc);
6373       break;
6374     default:
6375       fatal("Unsupported type %s", type2name(bt));
6376       break;
6377   }
6378 }
6379 
6380 // Trailing zero count computation is based on the leading zero count operation as per
6381 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
6382 // a direct vector instruction to compute the leading zero count.
6383 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
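     // For example, with 8 bit lanes and x = 0b00010100: (x - 1) & ~x = 0b00010011 & 0b11101011
     // = 0b00000011, CLZ(0b00000011) = 6 and CTZ = 8 - 6 = 2.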
6384 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6385                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6386                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6387   assert(is_integral_type(bt), "");
6388   // xtmp = -1
6389   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6390   // xtmp = xtmp + src
6391   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6392   // xtmp = xtmp & ~src
6393   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6394   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6395   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6396   vpsub(bt, dst, xtmp4, dst, vec_enc);
6397 }
6398 
6399 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
6400 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
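     // For example, with 8 bit lanes and x = 0b00010100: x | -x = 0b00010100 | 0b11101100
     // = 0b11111100, POPC(0b11111100) = 6 and CTZ = 8 - 6 = 2.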
6401 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6402                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6403   assert(is_integral_type(bt), "");
6404   // xtmp = 0
6405   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6406   // xtmp = 0 - src
6407   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6408   // xtmp = xtmp | src
6409   vpor(xtmp3, xtmp3, src, vec_enc);
6410   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6411   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6412   vpsub(bt, dst, xtmp1, dst, vec_enc);
6413 }
6414 
6415 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6416   Label done;
6417   Label neg_divisor_fastpath;
6418   cmpl(divisor, 0);
6419   jccb(Assembler::less, neg_divisor_fastpath);
6420   xorl(rdx, rdx);
6421   divl(divisor);
6422   jmpb(done);
6423   bind(neg_divisor_fastpath);
6424   // Fastpath for divisor < 0:
6425   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6426   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
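       // Worked example (values viewed as unsigned): dividend = 0xF0000000, divisor = 0x80000000.
       // dividend - divisor = 0x70000000, dividend & ~0x70000000 = 0x80000000, and
       // 0x80000000 >>> 31 = 1, the expected unsigned quotient. With the divisor's sign bit
       // set, the unsigned quotient can only be 0 or 1.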
6427   movl(rdx, rax);
6428   subl(rdx, divisor);
6429   if (VM_Version::supports_bmi1()) {
6430     andnl(rax, rdx, rax);
6431   } else {
6432     notl(rdx);
6433     andl(rax, rdx);
6434   }
6435   shrl(rax, 31);
6436   bind(done);
6437 }
6438 
6439 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6440   Label done;
6441   Label neg_divisor_fastpath;
6442   cmpl(divisor, 0);
6443   jccb(Assembler::less, neg_divisor_fastpath);
6444   xorl(rdx, rdx);
6445   divl(divisor);
6446   jmpb(done);
6447   bind(neg_divisor_fastpath);
6448   // Fastpath when divisor < 0:
6449   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6450   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
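       // Worked example (values viewed as unsigned): dividend = 0xF0000000, divisor = 0x80000000.
       // dividend & ~(dividend - divisor) = 0x80000000; arithmetic shift right by 31 gives
       // 0xFFFFFFFF, AND with the divisor gives 0x80000000, and dividend - 0x80000000 =
       // 0x70000000, the expected unsigned remainder.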
6451   movl(rdx, rax);
6452   subl(rax, divisor);
6453   if (VM_Version::supports_bmi1()) {
6454     andnl(rax, rax, rdx);
6455   } else {
6456     notl(rax);
6457     andl(rax, rdx);
6458   }
6459   sarl(rax, 31);
6460   andl(rax, divisor);
6461   subl(rdx, rax);
6462   bind(done);
6463 }
6464 
6465 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6466   Label done;
6467   Label neg_divisor_fastpath;
6468 
6469   cmpl(divisor, 0);
6470   jccb(Assembler::less, neg_divisor_fastpath);
6471   xorl(rdx, rdx);
6472   divl(divisor);
6473   jmpb(done);
6474   bind(neg_divisor_fastpath);
6475   // Fastpath for divisor < 0:
6476   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6477   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6478   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6479   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6480   movl(rdx, rax);
6481   subl(rax, divisor);
6482   if (VM_Version::supports_bmi1()) {
6483     andnl(rax, rax, rdx);
6484   } else {
6485     notl(rax);
6486     andl(rax, rdx);
6487   }
6488   movl(tmp, rax);
6489   shrl(rax, 31); // quotient
6490   sarl(tmp, 31);
6491   andl(tmp, divisor);
6492   subl(rdx, tmp); // remainder
6493   bind(done);
6494 }
6495 
6496 #ifdef _LP64
6497 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6498                                  XMMRegister xtmp2, Register rtmp) {
6499   if (VM_Version::supports_gfni()) {
6500     // Galois field instruction based bit reversal based on following algorithm.
6501     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6502     mov64(rtmp, 0x8040201008040201L);
6503     movq(xtmp1, src);
6504     movq(xtmp2, rtmp);
6505     gf2p8affineqb(xtmp1, xtmp2, 0);
6506     movq(dst, xtmp1);
6507   } else {
6508     // Swap even and odd numbered bits.
6509     movl(rtmp, src);
6510     andl(rtmp, 0x55555555);
6511     shll(rtmp, 1);
6512     movl(dst, src);
6513     andl(dst, 0xAAAAAAAA);
6514     shrl(dst, 1);
6515     orl(dst, rtmp);
6516 
6517     // Swap LSB and MSB 2 bits of each nibble.
6518     movl(rtmp, dst);
6519     andl(rtmp, 0x33333333);
6520     shll(rtmp, 2);
6521     andl(dst, 0xCCCCCCCC);
6522     shrl(dst, 2);
6523     orl(dst, rtmp);
6524 
6525     // Swap LSB and MSB 4 bits of each byte.
6526     movl(rtmp, dst);
6527     andl(rtmp, 0x0F0F0F0F);
6528     shll(rtmp, 4);
6529     andl(dst, 0xF0F0F0F0);
6530     shrl(dst, 4);
6531     orl(dst, rtmp);
6532   }
6533   bswapl(dst);
6534 }
6535 
6536 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6537                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6538   if (VM_Version::supports_gfni()) {
6539     // Galois field instruction based bit reversal based on following algorithm.
6540     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6541     mov64(rtmp1, 0x8040201008040201L);
6542     movq(xtmp1, src);
6543     movq(xtmp2, rtmp1);
6544     gf2p8affineqb(xtmp1, xtmp2, 0);
6545     movq(dst, xtmp1);
6546   } else {
6547     // Swap even and odd numbered bits.
6548     movq(rtmp1, src);
6549     mov64(rtmp2, 0x5555555555555555L);
6550     andq(rtmp1, rtmp2);
6551     shlq(rtmp1, 1);
6552     movq(dst, src);
6553     notq(rtmp2);
6554     andq(dst, rtmp2);
6555     shrq(dst, 1);
6556     orq(dst, rtmp1);
6557 
6558     // Swap LSB and MSB 2 bits of each nibble.
6559     movq(rtmp1, dst);
6560     mov64(rtmp2, 0x3333333333333333L);
6561     andq(rtmp1, rtmp2);
6562     shlq(rtmp1, 2);
6563     notq(rtmp2);
6564     andq(dst, rtmp2);
6565     shrq(dst, 2);
6566     orq(dst, rtmp1);
6567 
6568     // Swap LSB and MSB 4 bits of each byte.
6569     movq(rtmp1, dst);
6570     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6571     andq(rtmp1, rtmp2);
6572     shlq(rtmp1, 4);
6573     notq(rtmp2);
6574     andq(dst, rtmp2);
6575     shrq(dst, 4);
6576     orq(dst, rtmp1);
6577   }
6578   bswapq(dst);
6579 }
6580 
6581 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6582   Label done;
6583   Label neg_divisor_fastpath;
6584   cmpq(divisor, 0);
6585   jccb(Assembler::less, neg_divisor_fastpath);
6586   xorl(rdx, rdx);
6587   divq(divisor);
6588   jmpb(done);
6589   bind(neg_divisor_fastpath);
6590   // Fastpath for divisor < 0:
6591   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6592   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6593   movq(rdx, rax);
6594   subq(rdx, divisor);
6595   if (VM_Version::supports_bmi1()) {
6596     andnq(rax, rdx, rax);
6597   } else {
6598     notq(rdx);
6599     andq(rax, rdx);
6600   }
6601   shrq(rax, 63);
6602   bind(done);
6603 }
6604 
6605 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6606   Label done;
6607   Label neg_divisor_fastpath;
6608   cmpq(divisor, 0);
6609   jccb(Assembler::less, neg_divisor_fastpath);
6610   xorq(rdx, rdx);
6611   divq(divisor);
6612   jmp(done);
6613   bind(neg_divisor_fastpath);
6614   // Fastpath when divisor < 0:
6615   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6616   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6617   movq(rdx, rax);
6618   subq(rax, divisor);
6619   if (VM_Version::supports_bmi1()) {
6620     andnq(rax, rax, rdx);
6621   } else {
6622     notq(rax);
6623     andq(rax, rdx);
6624   }
6625   sarq(rax, 63);
6626   andq(rax, divisor);
6627   subq(rdx, rax);
6628   bind(done);
6629 }
6630 
6631 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6632   Label done;
6633   Label neg_divisor_fastpath;
6634   cmpq(divisor, 0);
6635   jccb(Assembler::less, neg_divisor_fastpath);
6636   xorq(rdx, rdx);
6637   divq(divisor);
6638   jmp(done);
6639   bind(neg_divisor_fastpath);
6640   // Fastpath for divisor < 0:
6641   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6642   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6643   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6644   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6645   movq(rdx, rax);
6646   subq(rax, divisor);
6647   if (VM_Version::supports_bmi1()) {
6648     andnq(rax, rax, rdx);
6649   } else {
6650     notq(rax);
6651     andq(rax, rdx);
6652   }
6653   movq(tmp, rax);
6654   shrq(rax, 63); // quotient
6655   sarq(tmp, 63);
6656   andq(tmp, divisor);
6657   subq(rdx, tmp); // remainder
6658   bind(done);
6659 }
6660 #endif
6661 
6662 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6663                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6664                                         int vlen_enc) {
6665   assert(VM_Version::supports_avx512bw(), "");
6666   // Byte shuffles are in-lane operations and the indices are determined using the
6667   // lower 4 bits of each shuffle lane, thus all shuffle indices are effectively
6668   // normalized to the index range 0-15. As a result, shuffle indices that are equal
6669   // modulo 16 address the same relative byte position within a 128 bit lane, e.g.
6670   // shuffle indices 16, 32 and 48 all address relative position 0 of whichever
6671   // 128 bit lane has been broadcast below.
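       // For example, shuffle index 37 (0x25) satisfies 32 <= 37 < 48, so it picks from the
       // broadcast of the third 128 bit lane, and its low nibble 5 selects byte 5 of that
       // lane, i.e. source byte 37 overall.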
6672   movl(rtmp, 16);
6673   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6674 
6675   // Compute a mask for the shuffle vector by comparing the indices against INDEX < 16,
6676   // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes
6677   // using the original shuffle indices, and move the shuffled lanes corresponding to a
6678   // true mask into the destination vector.
6679   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6680   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6681   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6682 
6683   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6684   // and broadcasting second 128 bit lane.
6685   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6686   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6687   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6688   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6689   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6690 
6691   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6692   // and broadcasting third 128 bit lane.
6693   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6694   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6695   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6696   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6697   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6698 
6699   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6700   // and broadcasting the fourth 128 bit lane.
6701   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6702   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6703   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6704   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6705   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6706 }
6707 
6708 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6709                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6710   if (vlen_enc == AVX_128bit) {
6711     vpermilps(dst, src, shuffle, vlen_enc);
6712   } else if (bt == T_INT) {
6713     vpermd(dst, shuffle, src, vlen_enc);
6714   } else {
6715     assert(bt == T_FLOAT, "");
6716     vpermps(dst, shuffle, src, vlen_enc);
6717   }
6718 }
6719 
6720 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6721   switch(elem_bt) {
6722     case T_BYTE:
6723       if (ideal_opc == Op_SaturatingAddV) {
6724         vpaddsb(dst, src1, src2, vlen_enc);
6725       } else {
6726         assert(ideal_opc == Op_SaturatingSubV, "");
6727         vpsubsb(dst, src1, src2, vlen_enc);
6728       }
6729       break;
6730     case T_SHORT:
6731       if (ideal_opc == Op_SaturatingAddV) {
6732         vpaddsw(dst, src1, src2, vlen_enc);
6733       } else {
6734         assert(ideal_opc == Op_SaturatingSubV, "");
6735         vpsubsw(dst, src1, src2, vlen_enc);
6736       }
6737       break;
6738     default:
6739       fatal("Unsupported type %s", type2name(elem_bt));
6740       break;
6741   }
6742 }
6743 
6744 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6745   switch(elem_bt) {
6746     case T_BYTE:
6747       if (ideal_opc == Op_SaturatingAddV) {
6748         vpaddusb(dst, src1, src2, vlen_enc);
6749       } else {
6750         assert(ideal_opc == Op_SaturatingSubV, "");
6751         vpsubusb(dst, src1, src2, vlen_enc);
6752       }
6753       break;
6754     case T_SHORT:
6755       if (ideal_opc == Op_SaturatingAddV) {
6756         vpaddusw(dst, src1, src2, vlen_enc);
6757       } else {
6758         assert(ideal_opc == Op_SaturatingSubV, "");
6759         vpsubusw(dst, src1, src2, vlen_enc);
6760       }
6761       break;
6762     default:
6763       fatal("Unsupported type %s", type2name(elem_bt));
6764       break;
6765   }
6766 }
6767 
6768 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6769                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6770   // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6771   // overflow_mask = Inp1 <u Inp2
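       // For example, 5 - 9 underflows in the unsigned domain (9 >u 5), so that lane
       // saturates to zero, whereas 9 - 5 simply produces 4.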
6772   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6773   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6774   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6775 }
6776 
6777 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6778                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6779   // Emulate unsigned comparison using signed comparison
6780   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6781   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6782   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6783   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6784 
6785   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6786 
6787   // Res = INP1 - INP2 (non-commutative and non-associative)
6788   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6789   // Res = Mask ? Zero : Res
6790   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6791   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6792 }
6793 
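     // Unsigned saturating addition for int/long lanes on EVEX targets, using the reduced overflow
     // check (a + b) <u (a | b) described in the comment block below; overflowing lanes are blended
     // with all-ones, the unsigned maximum.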
6794 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6795                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6796   // Unsigned value ranges comprise only non-negative numbers, so saturation can only occur at the upper bound.
6797   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6798   // Res = Signed Add INP1, INP2
6799   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6800   // T1 = SRC1 | SRC2
6801   vpor(xtmp1, src1, src2, vlen_enc);
6802   // Max_Unsigned = -1
6803   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6804   // Unsigned compare:  Mask = Res <u T1
6805   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6806   // res  = Mask ? Max_Unsigned : Res
6807   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6808 }
6809 
6810 //
6811 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6812 // unsigned addition operation:
6813 //    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6814 //
6815 // We empirically determined its semantic equivalence to the following reduced expression
6816 //    overflow_mask = (a + b) <u (a | b)
6817 //
6818 // and also verified it through the Alive2 solver.
6819 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6820 //
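     // Illustrative 32-bit example: a = 0xFFFFFFFF, b = 1 gives a + b = 0 (wrapped) while a | b =
     // 0xFFFFFFFF, so (a + b) <u (a | b) holds and the lane saturates; a = 2, b = 3 gives
     // a + b = 5 >= (a | b) = 3, so no saturation.
     //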
6821 
6822 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6823                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6824   // Res = Signed Add INP1, INP2
6825   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6826   // Compute T1 = INP1 | INP2
6827   vpor(xtmp3, src1, src2, vlen_enc);
6828   // xtmp2 = MIN_VALUE (minimum signed value); as a side effect xtmp1 now holds all-ones (the unsigned maximum).
6829   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6830   // Bias T1 into the signed domain: T1 = T1 + MIN_VALUE
6831   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6832   // Bias Res into the signed domain: Res<s> = Res + MIN_VALUE
6833   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6834   // Compute overflow detection mask = Res<s> <s T1
6835   if (elem_bt == T_INT) {
6836     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6837   } else {
6838     assert(elem_bt == T_LONG, "");
6839     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6840   }
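       // Res = overflow_mask ? Max_Unsigned : Res, where Max_Unsigned (all-ones) is still held in
       // xtmp1 from the vpgenmin_value call above.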
6841   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6842 }
6843 
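     // Emulate vpmovq2m (copy each 64-bit lane's sign bit into a mask register) on EVEX targets
     // without AVX512DQ: arithmetic-shift every lane right by 63 (yielding 0 or -1) and compare it
     // for equality against an all-ones vector.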
6844 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6845                                       int vlen_enc, bool xtmp2_hold_M1) {
6846   if (VM_Version::supports_avx512dq()) {
6847     evpmovq2m(ktmp, src, vlen_enc);
6848   } else {
6849     assert(VM_Version::supports_evex(), "");
6850     if (!xtmp2_hold_M1) {
6851       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6852     }
6853     evpsraq(xtmp1, src, 63, vlen_enc);
6854     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6855   }
6856 }
6857 
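     // 32-bit counterpart of evpmovq2m_emu above, emulating vpmovd2m when AVX512DQ is unavailable.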
6858 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6859                                       int vlen_enc, bool xtmp2_hold_M1) {
6860   if (VM_Version::supports_avx512dq()) {
6861     evpmovd2m(ktmp, src, vlen_enc);
6862   } else {
6863     assert(VM_Version::supports_evex(), "");
6864     if (!xtmp2_hold_M1) {
6865       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6866     }
6867     vpsrad(xtmp1, src, 31, vlen_enc);
6868     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6869   }
6870 }
6871 
6872 
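     // Broadcast each lane's sign bit across the whole lane, producing 0 or -1 per element. Without
     // evpsraq the 64-bit case shifts each dword right arithmetically by 31 and then duplicates the
     // upper (sign-carrying) dword of every qword via vpshufd 0xF5.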
6873 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6874   if (elem_bt == T_LONG) {
6875     if (VM_Version::supports_evex()) {
6876       evpsraq(dst, src, 63, vlen_enc);
6877     } else {
6878       vpsrad(dst, src, 31, vlen_enc);
6879       vpshufd(dst, dst, 0xF5, vlen_enc);
6880     }
6881   } else {
6882     assert(elem_bt == T_INT, "");
6883     vpsrad(dst, src, 31, vlen_enc);
6884   }
6885 }
6886 
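     // Materialize MAX_VALUE (0x7FFF...) in every lane: build an all-ones vector and shift it right
     // logically by one bit.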
6887 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6888   if (compute_allones) {
6889     if (vlen_enc == Assembler::AVX_512bit) {
6890       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6891     } else {
6892       vpcmpeqq(allones, allones, allones, vlen_enc);
6893     }
6894   }
6895   if (elem_bt == T_LONG) {
6896     vpsrlq(dst, allones, 1, vlen_enc);
6897   } else {
6898     assert(elem_bt == T_INT, "");
6899     vpsrld(dst, allones, 1, vlen_enc);
6900   }
6901 }
6902 
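     // Materialize MIN_VALUE (0x8000...) in every lane: build an all-ones vector and shift it left
     // so that only the sign bit survives.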
6903 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6904   if (compute_allones) {
6905     if (vlen_enc == Assembler::AVX_512bit) {
6906       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6907     } else {
6908       vpcmpeqq(allones, allones, allones, vlen_enc);
6909     }
6910   }
6911   if (elem_bt == T_LONG) {
6912     vpsllq(dst, allones, 63, vlen_enc);
6913   } else {
6914     assert(elem_bt == T_INT, "");
6915     vpslld(dst, allones, 31, vlen_enc);
6916   }
6917 }
6918 
6919 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6920                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6921   switch(elem_bt) {
6922     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6923     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6924     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6925     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6926     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6927   }
6928 }
6929 
6930 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6931   switch(elem_bt) {
6932     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6933     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6934     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6935     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6936     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6937   }
6938 }
6939 
6940 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6941                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6942   if (elem_bt == T_LONG) {
6943     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6944   } else {
6945     assert(elem_bt == T_INT, "");
6946     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6947   }
6948 }
6949 
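     // Signed saturating add/sub for int/long lanes on EVEX targets. Overflowing lanes are replaced
     // with MAX_VALUE or MIN_VALUE, selected by the sign of the first input: when an overflow occurs
     // its direction always matches src1's sign (for addition both inputs then share that sign; for
     // subtraction the inputs have opposite signs), so a negative src1 saturates to MIN and a
     // non-negative src1 to MAX. Illustrative int example: 0x7FFFFFFF + 1 wraps to 0x80000000, the
     // overflow mask fires, and the lane is replaced with 0x7FFFFFFF.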
6950 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6951                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6952                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6953   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6954   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6955   // Overflow detection is based on Hacker's Delight, section 2-13.
6956   if (ideal_opc == Op_SaturatingAddV) {
6957     // res = src1 + src2
6958     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6959     // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
6960     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6961     vpxor(xtmp1, dst, src1, vlen_enc);
6962     vpxor(xtmp2, dst, src2, vlen_enc);
6963     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6964   } else {
6965     assert(ideal_opc == Op_SaturatingSubV, "");
6966     // res = src1 - src2
6967     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6968     // Overflow occurs when the inputs have opposite polarities and the
6969     // result's polarity differs from the first input's polarity.
6970     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6971     vpxor(xtmp1, src1, src2, vlen_enc);
6972     vpxor(xtmp2, dst, src1, vlen_enc);
6973     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6974   }
6975 
6976   // Compute overflow detection mask.
6977   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6978   // Note: xtmp1 holds -1 in all its lanes after the above call.
6979 
6980   // Compute mask based on first input polarity.
6981   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6982 
6983   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6984   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6985 
6986   // Compose a vector of saturating (MAX/MIN) values: lanes corresponding to set bits in the
6987   // first-input polarity mask hold the MIN value, the remaining lanes hold MAX.
6988   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6989   // Blend destination lanes with saturated values using overflow detection mask.
6990   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6991 }
6992 
6993 
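     // AVX counterpart of vector_addsub_dq_saturating_evex above for targets without mask registers:
     // the overflow and first-input sign masks are materialized as vectors via sign extension and the
     // final selections are performed with vpblendvb.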
6994 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6995                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6996                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6997   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6998   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6999   // Overflow detection is based on Hacker's Delight, section 2-13.
7000   if (ideal_opc == Op_SaturatingAddV) {
7001     // res = src1 + src2
7002     vpadd(elem_bt, dst, src1, src2, vlen_enc);
7003     // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
7004     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
7005     vpxor(xtmp1, dst, src1, vlen_enc);
7006     vpxor(xtmp2, dst, src2, vlen_enc);
7007     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7008   } else {
7009     assert(ideal_opc == Op_SaturatingSubV, "");
7010     // res = src1 - src2
7011     vpsub(elem_bt, dst, src1, src2, vlen_enc);
7012     // Overflow occurs when the inputs have opposite polarities and the
7013     // result's polarity differs from the first input's polarity.
7014     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
7015     vpxor(xtmp1, src1, src2, vlen_enc);
7016     vpxor(xtmp2, dst, src1, vlen_enc);
7017     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7018   }
7019 
7020   // Sign-extend to compute overflow detection mask.
7021   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
7022 
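       // xtmp1 = all-ones; derive MAX_VALUE into xtmp2 and MIN_VALUE into xtmp1 from it.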
7023   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
7024   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
7025   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7026 
7027   // Compose saturating min/max vector using first input polarity mask.
7028   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7029   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7030 
7031   // Blend result with saturating vector using overflow detection mask.
7032   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7033 }
7034 
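     // Memory-operand overloads of the byte/short saturating helpers above (src2 is loaded from memory).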
7035 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7036   switch(elem_bt) {
7037     case T_BYTE:
7038       if (ideal_opc == Op_SaturatingAddV) {
7039         vpaddsb(dst, src1, src2, vlen_enc);
7040       } else {
7041         assert(ideal_opc == Op_SaturatingSubV, "");
7042         vpsubsb(dst, src1, src2, vlen_enc);
7043       }
7044       break;
7045     case T_SHORT:
7046       if (ideal_opc == Op_SaturatingAddV) {
7047         vpaddsw(dst, src1, src2, vlen_enc);
7048       } else {
7049         assert(ideal_opc == Op_SaturatingSubV, "");
7050         vpsubsw(dst, src1, src2, vlen_enc);
7051       }
7052       break;
7053     default:
7054       fatal("Unsupported type %s", type2name(elem_bt));
7055       break;
7056   }
7057 }
7058 
7059 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7060   switch(elem_bt) {
7061     case T_BYTE:
7062       if (ideal_opc == Op_SaturatingAddV) {
7063         vpaddusb(dst, src1, src2, vlen_enc);
7064       } else {
7065         assert(ideal_opc == Op_SaturatingSubV, "");
7066         vpsubusb(dst, src1, src2, vlen_enc);
7067       }
7068       break;
7069     case T_SHORT:
7070       if (ideal_opc == Op_SaturatingAddV) {
7071         vpaddusw(dst, src1, src2, vlen_enc);
7072       } else {
7073         assert(ideal_opc == Op_SaturatingSubV, "");
7074         vpsubusw(dst, src1, src2, vlen_enc);
7075       }
7076       break;
7077     default:
7078       fatal("Unsupported type %s", type2name(elem_bt));
7079       break;
7080   }
7081 }
7082 
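     // Two-table permute via the VPERMI2 family: dst initially holds the index vector and each index
     // selects a lane from the concatenation of src1 and src2 (the extra high index bit picks which
     // source supplies the lane).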
7083 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7084                                                      XMMRegister src2, int vlen_enc) {
7085   switch(elem_bt) {
7086     case T_BYTE:
7087       evpermi2b(dst, src1, src2, vlen_enc);
7088       break;
7089     case T_SHORT:
7090       evpermi2w(dst, src1, src2, vlen_enc);
7091       break;
7092     case T_INT:
7093       evpermi2d(dst, src1, src2, vlen_enc);
7094       break;
7095     case T_LONG:
7096       evpermi2q(dst, src1, src2, vlen_enc);
7097       break;
7098     case T_FLOAT:
7099       evpermi2ps(dst, src1, src2, vlen_enc);
7100       break;
7101     case T_DOUBLE:
7102       evpermi2pd(dst, src1, src2, vlen_enc);
7103       break;
7104     default:
7105       fatal("Unsupported type %s", type2name(elem_bt));
7106       break;
7107   }
7108 }
7109 
7110 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7111   if (is_unsigned) {
7112     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7113   } else {
7114     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7115   }
7116 }
7117 
7118 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7119   if (is_unsigned) {
7120     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7121   } else {
7122     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7123   }
7124 }