1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  54   if (C->clinit_barrier_on_entry()) {
  55     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  56     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  57 
  58     Label L_skip_barrier;
  59     Register klass = rscratch1;
  60 
  61     mov_metadata(klass, C->method()->holder()->constant_encoding());
  62     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  63 
  64     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  65 
  66     bind(L_skip_barrier);
  67   }
  68 
  69   int framesize = C->output()->frame_size_in_bytes();
  70   int bangsize = C->output()->bang_size_in_bytes();
  71   bool fp_mode_24b = false;
  72   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  73 
  74   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  75   // NativeJump::patch_verified_entry will be able to patch out the entry
  76   // code safely. The push to verify stack depth is ok at 5 bytes,
  77   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  78   // stack bang then we must use the 6 byte frame allocation even if
  79   // we have no frame. :-(
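       // (For reference: "sub esp, imm8" encodes in 3 bytes and "sub esp, imm32"
       // in 6; with a REX.W prefix on x86_64 these become 4 and 7 bytes.  That is
       // why the no-stack-bang path below uses subptr_imm32 to force the longer,
       // patchable form.)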
  80   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  81 
  82   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  83   // Remove word for return addr
  84   framesize -= wordSize;
  85   stack_bang_size -= wordSize;
  86 
  87   // Calls to C2R adapters often do not accept exceptional returns.
  88   // We require that their callers bang for them.  But be careful, because
  89   // some VM calls (such as call site linkage) can use several kilobytes of
  90   // stack.  But the stack safety zone should account for that.
  91   // See bugs 4446381, 4468289, 4497237.
  92   if (stack_bang_size > 0) {
  93     generate_stack_overflow_check(stack_bang_size);
  94 
  95     // We always push rbp so that on return to the interpreter rbp will be
  96     // restored correctly and we can correct the stack.
  97     push(rbp);
  98     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  99     if (PreserveFramePointer) {
 100       mov(rbp, rsp);
 101     }
 102     // Remove word for ebp
 103     framesize -= wordSize;
 104 
 105     // Create frame
 106     if (framesize) {
 107       subptr(rsp, framesize);
 108     }
 109   } else {
 110     // Create frame (force generation of a 4 byte immediate value)
 111     subptr_imm32(rsp, framesize);
 112 
 113     // Save RBP register now.
 114     framesize -= wordSize;
 115     movptr(Address(rsp, framesize), rbp);
 116     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 117     if (PreserveFramePointer) {
 118       movptr(rbp, rsp);
 119       if (framesize > 0) {
 120         addptr(rbp, framesize);
 121       }
 122     }
 123   }
 124 
 125   if (C->needs_stack_repair()) {
 126     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 127     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 128     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 129   }
 130 
 131   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 132     framesize -= wordSize;
 133     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 134   }
 135 
 136 #ifndef _LP64
 137   // If method sets FPU control word do it now
 138   if (fp_mode_24b) {
 139     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 140   }
 141   if (UseSSE >= 2 && VerifyFPU) {
 142     verify_FPU(0, "FPU stack must be clean on entry");
 143   }
 144 #endif
 145 
 146 #ifdef ASSERT
 147   if (VerifyStackAtCalls) {
 148     Label L;
 149     push(rax);
 150     mov(rax, rsp);
 151     andptr(rax, StackAlignmentInBytes-1);
 152     cmpptr(rax, StackAlignmentInBytes-wordSize);
 153     pop(rax);
 154     jcc(Assembler::equal, L);
 155     STOP("Stack is not properly aligned!");
 156     bind(L);
 157   }
 158 #endif
 159 }
 160 
 161 void C2_MacroAssembler::entry_barrier() {
 162   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 163 #ifdef _LP64
 164   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 165     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
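         // Conceptually the inline barrier compares a per-nmethod guard value
         // against the currently expected value; if it is stale (armed), the
         // slow path calls into the runtime to fix up the nmethod (e.g. heal
         // embedded oops after a concurrent GC cycle) and disarm it.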
 166     Label dummy_slow_path;
 167     Label dummy_continuation;
 168     Label* slow_path = &dummy_slow_path;
 169     Label* continuation = &dummy_continuation;
 170     if (!Compile::current()->output()->in_scratch_emit_size()) {
 171       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 172       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 173       Compile::current()->output()->add_stub(stub);
 174       slow_path = &stub->entry();
 175       continuation = &stub->continuation();
 176     }
 177     bs->nmethod_entry_barrier(this, slow_path, continuation);
 178   }
 179 #else
 180   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 181   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 182 #endif
 183 }
 184 
 185 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 186   switch (vlen_in_bytes) {
 187     case  4: // fall-through
 188     case  8: // fall-through
 189     case 16: return Assembler::AVX_128bit;
 190     case 32: return Assembler::AVX_256bit;
 191     case 64: return Assembler::AVX_512bit;
 192 
 193     default: {
 194       ShouldNotReachHere();
 195       return Assembler::AVX_NoVec;
 196     }
 197   }
 198 }
 199 
 200 // fast_lock and fast_unlock used by C2
 201 
 202 // Because the transitions from emitted code to the runtime
 203 // monitorenter/exit helper stubs are so slow it's critical that
 204 // we inline both the stack-locking fast path and the inflated fast path.
 205 //
 206 // See also: cmpFastLock and cmpFastUnlock.
 207 //
 208 // What follows is a specialized inline transliteration of the code
 209 // in enter() and exit(). If we're concerned about I$ bloat another
 210 // option would be to emit TrySlowEnter and TrySlowExit methods
 211 // at startup-time.  These methods would accept arguments as
 212 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 213 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 214 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 215 // In practice, however, the # of lock sites is bounded and is usually small.
 216 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 217 // if the processor uses simple bimodal branch predictors keyed by EIP,
 218 // since the helper routines would be called from multiple synchronization
 219 // sites.
 220 //
 221 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 222 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 223 // to those specialized methods.  That'd give us a mostly platform-independent
 224 // implementation that the JITs could optimize and inline at their pleasure.
 225 // Done correctly, the only time we'd need to cross to native code would be
 226 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 227 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 228 // (b) provide explicit barriers or fence operations.
 229 //
 230 // TODO:
 231 //
 232 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 233 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 234 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 235 //    the lock operators would typically be faster than reifying Self.
 236 //
 237 // *  Ideally I'd define the primitives as:
 238 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 239 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 240 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 241 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 242 //    Furthermore the register assignments are overconstrained, possibly resulting in
 243 //    sub-optimal code near the synchronization site.
 244 //
 245 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 246 //    Alternately, use a better sp-proximity test.
 247 //
 248 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 249 //    Either one is sufficient to uniquely identify a thread.
 250 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 251 //
 252 // *  Intrinsify notify() and notifyAll() for the common cases where the
 253 //    object is locked by the calling thread but the waitlist is empty,
 254 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 255 //
 256 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 257 //    But beware of excessive branch density on AMD Opterons.
 258 //
 259 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 260 //    or failure of the fast path.  If the fast path fails then we pass
 261 //    control to the slow path, typically in C.  In fast_lock and
 262 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 263 //    will emit a conditional branch immediately after the node.
 264 //    So we have branches to branches and lots of ICC.ZF games.
 265 //    Instead, it might be better to have C2 pass a "FailureLabel"
 266 //    into fast_lock and fast_unlock.  In the case of success, control
 267 //    will drop through the node.  ICC.ZF is undefined at exit.
 268 //    In the case of failure, the node will branch directly to the
 269 //    FailureLabel
 270 
 271 
 272 // obj: object to lock
 273 // box: on-stack box address (displaced header location) - KILLED
 274 // rax: tmp -- KILLED
 275 // scr: tmp -- KILLED
 276 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 277                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 278                                  Metadata* method_data) {
 279   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 280   // Ensure the register assignments are disjoint
 281   assert(tmpReg == rax, "");
 282   assert(cx1Reg == noreg, "");
 283   assert(cx2Reg == noreg, "");
 284   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 285 
 286   // Possible cases that we'll encounter in fast_lock
 287   // ------------------------------------------------
 288   // * Inflated
 289   //    -- unlocked
 290   //    -- Locked
 291   //       = by self
 292   //       = by other
 293   // * neutral
 294   // * stack-locked
 295   //    -- by self
 296   //       = sp-proximity test hits
 297   //       = sp-proximity test generates false-negative
 298   //    -- by other
 299   //
 300 
 301   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 302 
 303   if (DiagnoseSyncOnValueBasedClasses != 0) {
 304     load_klass(tmpReg, objReg, scrReg);
 305     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 306     jcc(Assembler::notZero, DONE_LABEL);
 307   }
 308 
 309   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 310   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 311   jcc(Assembler::notZero, IsInflated);
 312 
 313   if (LockingMode == LM_MONITOR) {
 314     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 315     testptr(objReg, objReg);
 316   } else {
 317     assert(LockingMode == LM_LEGACY, "must be");
 318     // Attempt stack-locking ...
 319     orptr (tmpReg, markWord::unlocked_value);
 320     if (EnableValhalla) {
 321       // Mask inline_type bit such that we go to the slow path if object is an inline type
 322       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 323     }
 324     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 325     lock();
 326     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 327     jcc(Assembler::equal, COUNT);           // Success
 328 
 329     // Recursive locking.
 330     // The object is stack-locked: markword contains stack pointer to BasicLock.
 331     // Locked by current thread if difference with current SP is less than one page.
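         // After the failed CAS, tmpReg (rax) holds the current mark word; for a
         // stack-locked object that is the address of the owning BasicLock.
         // E.g. with a 4K page the LP64 immediate below is 7 - 4096 = 0xFFFFF007,
         // so the masked result is zero only if (mark - rsp) is non-negative,
         // below one page, and 8-byte aligned.  That zero is then stored as the
         // displaced header, which is how fast_unlock recognizes a recursive
         // stack-lock.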
 332     subptr(tmpReg, rsp);
 333     // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 334     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 335     movptr(Address(boxReg, 0), tmpReg);
 336   }
 337   jmp(DONE_LABEL);
 338 
 339   bind(IsInflated);
 340   // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 341 
 342 #ifndef _LP64
 343   // The object is inflated.
 344 
 345   // boxReg refers to the on-stack BasicLock in the current frame.
 346   // We'd like to write:
 347   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 348   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 349   // additional latency as we have another ST in the store buffer that must drain.
 350 
 351   // avoid ST-before-CAS
 352   // register juggle because we need tmpReg for cmpxchgptr below
 353   movptr(scrReg, boxReg);
 354   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 355 
 356   // Optimistic form: consider XORL tmpReg,tmpReg
 357   movptr(tmpReg, NULL_WORD);
 358 
 359   // Appears unlocked - try to swing _owner from null to non-null.
 360   // Ideally, I'd manifest "Self" with get_thread and then attempt
 361   // to CAS the register containing Self into m->Owner.
 362   // But we don't have enough registers, so instead we can either try to CAS
 363   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 364   // we later store "Self" into m->Owner.  Transiently storing a stack address
 365   // (rsp or the address of the box) into  m->owner is harmless.
 366   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 367   lock();
 368   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 369   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 370   // If we weren't able to swing _owner from null to the BasicLock
 371   // then take the slow path.
 372   jccb  (Assembler::notZero, NO_COUNT);
 373   // update _owner from BasicLock to thread
 374   get_thread (scrReg);                    // beware: clobbers ICCs
 375   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 376   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 377 
 378   // If the CAS fails we can either retry or pass control to the slow path.
 379   // We use the latter tactic.
 380   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 381   // If the CAS was successful ...
 382   //   Self has acquired the lock
 383   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 384   // Intentional fall-through into DONE_LABEL ...
 385 #else // _LP64
 386   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 387   movq(scrReg, tmpReg);
 388   xorq(tmpReg, tmpReg);
 389   lock();
 390   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 391   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 392   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 393   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 394   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 395   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 396 
 397   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 398   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 399   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 400   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 401 #endif // _LP64
 402   bind(DONE_LABEL);
 403 
 404   // ZFlag == 1 count in fast path
 405   // ZFlag == 0 count in slow path
 406   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 407 
 408   bind(COUNT);
 409   // Count monitors in fast path
 410   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 411 
 412   xorl(tmpReg, tmpReg); // Set ZF == 1
 413 
 414   bind(NO_COUNT);
 415 
 416   // At NO_COUNT the icc ZFlag is set as follows ...
 417   // fast_unlock uses the same protocol.
 418   // ZFlag == 1 -> Success
 419   // ZFlag == 0 -> Failure - force control through the slow path
 420 }
 421 
 422 // obj: object to unlock
 423 // box: box address (displaced header location), killed.  Must be EAX.
 424 // tmp: killed, cannot be obj nor box.
 425 //
 426 // Some commentary on balanced locking:
 427 //
 428 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 429 // Methods that don't have provably balanced locking are forced to run in the
 430 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 431 // The interpreter provides two properties:
 432 // I1:  At return-time the interpreter automatically and quietly unlocks any
 433 //      objects acquired by the current activation (frame).  Recall that the
 434 //      interpreter maintains an on-stack list of locks currently held by
 435 //      a frame.
 436 // I2:  If a method attempts to unlock an object that is not held by the
 437 //      frame, the interpreter throws IMSX (IllegalMonitorStateException).
 438 //
 439 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 440 // B() doesn't have provably balanced locking so it runs in the interpreter.
 441 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 442 // is still locked by A().
 443 //
 444 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 445 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 446 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 447 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 448 // Arguably given that the spec legislates the JNI case as undefined our implementation
 449 // could reasonably *avoid* checking owner in fast_unlock().
 450 // In the interest of performance we elide the m->_owner == Self check in unlock.
 451 // A perfectly viable alternative is to elide the owner check except when
 452 // Xcheck:jni is enabled.
 453 
 454 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 455   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 456   assert(boxReg == rax, "");
 457   assert_different_registers(objReg, boxReg, tmpReg);
 458 
 459   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 460 
 461   if (LockingMode == LM_LEGACY) {
 462     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 463     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 464   }
 465   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 466   if (LockingMode != LM_MONITOR) {
 467     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 468     jcc(Assembler::zero, Stacked);
 469   }
 470 
 471   // It's inflated.
 472 
 473   // Despite our balanced locking property we still check that m->_owner == Self
 474   // as java routines or native JNI code called by this thread might
 475   // have released the lock.
 476   // Refer to the comments in synchronizer.cpp for how we might encode extra
 477   // state in _succ so we can avoid fetching EntryList|cxq.
 478   //
 479   // If there's no contention try a 1-0 exit.  That is, exit without
 480   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 481   // we detect and recover from the race that the 1-0 exit admits.
 482   //
 483   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 484   // before it STs null into _owner, releasing the lock.  Updates
 485   // to data protected by the critical section must be visible before
 486   // we drop the lock (and thus before any other thread could acquire
 487   // the lock and observe the fields protected by the lock).
 488   // IA32's memory model is TSO, so STs are ordered with respect to
 489   // each other and there's no need for an explicit barrier (fence).
 490   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 491   Label LSuccess, LNotRecursive;
 492 
 493   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 494   jccb(Assembler::equal, LNotRecursive);
 495 
 496   // Recursive inflated unlock
 497   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 498   jmpb(LSuccess);
 499 
 500   bind(LNotRecursive);
 501 
 502   // Set owner to null.
 503   // Release to satisfy the JMM
 504   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 505   // We need a full fence after clearing owner to avoid stranding.
 506   // StoreLoad achieves this.
 507   membar(StoreLoad);
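       // Without it, our store of null to _owner could still be in the store
       // buffer when we load cxq/EntryList below: a contending thread could
       // enqueue itself and still observe a non-null _owner, so it parks while
       // we see empty lists and wake nobody -- the "stranding" race.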
 508 
 509   // Check if the entry lists are empty.
 510   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 511   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 512   jccb(Assembler::zero, LSuccess);    // If so we are done.
 513 
 514   // Check if there is a successor.
 515   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 516   jccb(Assembler::notZero, LSuccess); // If so we are done.
 517 
 518   // Save the monitor pointer in the current thread, so we can try to
 519   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 520   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 521 #ifndef _LP64
 522   get_thread(boxReg);
 523   movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 524 #else // _LP64
 525   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 526 #endif
 527 
 528   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 529   jmpb  (DONE_LABEL);
 530 
 531   bind  (LSuccess);
 532   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 533   jmpb  (DONE_LABEL);
 534 
 535   if (LockingMode == LM_LEGACY) {
 536     bind  (Stacked);
 537     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 538     lock();
 539     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 540     // Intentional fall-thru into DONE_LABEL
 541   }
 542 
 543   bind(DONE_LABEL);
 544 
 545   // ZFlag == 1 count in fast path
 546   // ZFlag == 0 count in slow path
 547   jccb(Assembler::notZero, NO_COUNT);
 548 
 549   bind(COUNT);
 550   // Count monitors in fast path
 551 #ifndef _LP64
 552   get_thread(tmpReg);
 553   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 554 #else // _LP64
 555   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 556 #endif
 557 
 558   xorl(tmpReg, tmpReg); // Set ZF == 1
 559 
 560   bind(NO_COUNT);
 561 }
 562 
 563 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 564                                               Register t, Register thread) {
 565   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 566   assert(rax_reg == rax, "Used for CAS");
 567   assert_different_registers(obj, box, rax_reg, t, thread);
 568 
 569   // Handle inflated monitor.
 570   Label inflated;
 571   // Finish fast lock successfully. ZF value is irrelevant.
 572   Label locked;
 573   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 574   Label slow_path;
 575 
 576   if (UseObjectMonitorTable) {
 577     // Clear cache in case fast locking succeeds.
 578     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 579   }
 580 
 581   if (DiagnoseSyncOnValueBasedClasses != 0) {
 582     load_klass(rax_reg, obj, t);
 583     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 584     jcc(Assembler::notZero, slow_path);
 585   }
 586 
 587   const Register mark = t;
 588 
 589   { // Lightweight Lock
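         // Mark word lock bits used below: 0b01 = unlocked, 0b00 = fast-locked
         // (the owner is recorded on the thread's lock-stack), 0b10 = inflated
         // monitor.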
 590 
 591     Label push;
 592 
 593     const Register top = UseObjectMonitorTable ? rax_reg : box;
 594 
 595     // Load the mark.
 596     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 597 
 598     // Prefetch top.
 599     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 600 
 601     // Check for monitor (0b10).
 602     testptr(mark, markWord::monitor_value);
 603     jcc(Assembler::notZero, inflated);
 604 
 605     // Check if lock-stack is full.
 606     cmpl(top, LockStack::end_offset() - 1);
 607     jcc(Assembler::greater, slow_path);
 608 
 609     // Check if recursive.
 610     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 611     jccb(Assembler::equal, push);
 612 
 613     // Try to lock. Transition lock bits 0b01 => 0b00
 614     movptr(rax_reg, mark);
 615     orptr(rax_reg, markWord::unlocked_value);
 616     andptr(mark, ~(int32_t)markWord::unlocked_value);
 617     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 618     jcc(Assembler::notEqual, slow_path);
 619 
 620     if (UseObjectMonitorTable) {
 621       // Need to reload top, clobbered by CAS.
 622       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 623     }
 624     bind(push);
 625     // After successful lock, push object on lock-stack.
 626     movptr(Address(thread, top), obj);
 627     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 628     jmpb(locked);
 629   }
 630 
 631   { // Handle inflated monitor.
 632     bind(inflated);
 633 
 634     const Register monitor = t;
 635 
 636     if (!UseObjectMonitorTable) {
 637       assert(mark == monitor, "should be the same here");
 638     } else {
 639       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 640       // Fetch ObjectMonitor* from the cache or take the slow-path.
 641       Label monitor_found;
 642 
 643       // Load cache address
 644       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 645 
 646       const int num_unrolled = 2;
 647       for (int i = 0; i < num_unrolled; i++) {
 648         cmpptr(obj, Address(t));
 649         jccb(Assembler::equal, monitor_found);
 650         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 651       }
 652 
 653       Label loop;
 654 
 655       // Search for obj in cache.
 656       bind(loop);
 657 
 658       // Check for match.
 659       cmpptr(obj, Address(t));
 660       jccb(Assembler::equal, monitor_found);
 661 
 662       // Search until null encountered, guaranteed _null_sentinel at end.
 663       cmpptr(Address(t), 1);
 664       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 665       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 666       jmpb(loop);
 667 
 668       // Cache hit.
 669       bind(monitor_found);
 670       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 671     }
 672     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 673     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 674     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 675 
 676     Label monitor_locked;
 677     // Lock the monitor.
 678 
 679     // CAS owner (null => current thread).
 680     xorptr(rax_reg, rax_reg);
 681     lock(); cmpxchgptr(thread, owner_address);
 682     jccb(Assembler::equal, monitor_locked);
 683 
 684     // Check if recursive.
 685     cmpptr(thread, rax_reg);
 686     jccb(Assembler::notEqual, slow_path);
 687 
 688     // Recursive.
 689     increment(recursions_address);
 690 
 691     bind(monitor_locked);
 692     if (UseObjectMonitorTable) {
 693       // Cache the monitor for unlock
 694       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 695     }
 696   }
 697 
 698   bind(locked);
 699   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 700   // Set ZF = 1
 701   xorl(rax_reg, rax_reg);
 702 
 703 #ifdef ASSERT
 704   // Check that locked label is reached with ZF set.
 705   Label zf_correct;
 706   Label zf_bad_zero;
 707   jcc(Assembler::zero, zf_correct);
 708   jmp(zf_bad_zero);
 709 #endif
 710 
 711   bind(slow_path);
 712 #ifdef ASSERT
 713   // Check that slow_path label is reached with ZF not set.
 714   jcc(Assembler::notZero, zf_correct);
 715   stop("Fast Lock ZF != 0");
 716   bind(zf_bad_zero);
 717   stop("Fast Lock ZF != 1");
 718   bind(zf_correct);
 719 #endif
 720   // C2 uses the value of ZF to determine the continuation.
 721 }
 722 
 723 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 724   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 725   assert(reg_rax == rax, "Used for CAS");
 726   assert_different_registers(obj, reg_rax, t);
 727 
 728   // Handle inflated monitor.
 729   Label inflated, inflated_check_lock_stack;
 730   // Finish fast unlock successfully.  MUST jump with ZF == 1
 731   Label unlocked, slow_path;
 732 
 733   const Register mark = t;
 734   const Register monitor = t;
 735   const Register top = UseObjectMonitorTable ? t : reg_rax;
 736   const Register box = reg_rax;
 737 
 738   Label dummy;
 739   C2FastUnlockLightweightStub* stub = nullptr;
 740 
 741   if (!Compile::current()->output()->in_scratch_emit_size()) {
 742     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 743     Compile::current()->output()->add_stub(stub);
 744   }
 745 
 746   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 747 
 748   { // Lightweight Unlock
 749 
 750     // Load top.
 751     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 752 
 753     if (!UseObjectMonitorTable) {
 754       // Prefetch mark.
 755       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 756     }
 757 
 758     // Check if obj is top of lock-stack.
 759     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 760     // Top of lock stack was not obj. Must be monitor.
 761     jcc(Assembler::notEqual, inflated_check_lock_stack);
 762 
 763     // Pop lock-stack.
 764     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 765     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 766 
 767     // Check if recursive.
 768     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 769     jcc(Assembler::equal, unlocked);
 770 
 771     // We elide the monitor check, let the CAS fail instead.
 772 
 773     if (UseObjectMonitorTable) {
 774       // Load mark.
 775       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 776     }
 777 
 778     // Try to unlock. Transition lock bits 0b00 => 0b01
 779     movptr(reg_rax, mark);
 780     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 781     orptr(mark, markWord::unlocked_value);
 782     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 783     jcc(Assembler::notEqual, push_and_slow_path);
 784     jmp(unlocked);
 785   }
 786 
 787 
 788   { // Handle inflated monitor.
 789     bind(inflated_check_lock_stack);
 790 #ifdef ASSERT
 791     Label check_done;
 792     subl(top, oopSize);
 793     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 794     jcc(Assembler::below, check_done);
 795     cmpptr(obj, Address(thread, top));
 796     jccb(Assembler::notEqual, inflated_check_lock_stack);
 797     stop("Fast Unlock lock on stack");
 798     bind(check_done);
 799     if (UseObjectMonitorTable) {
 800       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 801     }
 802     testptr(mark, markWord::monitor_value);
 803     jccb(Assembler::notZero, inflated);
 804     stop("Fast Unlock not monitor");
 805 #endif
 806 
 807     bind(inflated);
 808 
 809     if (!UseObjectMonitorTable) {
 810       assert(mark == monitor, "should be the same here");
 811     } else {
 812       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 813       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 814       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 815       cmpptr(monitor, alignof(ObjectMonitor*));
 816       jcc(Assembler::below, slow_path);
 817     }
 818     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 819     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 820     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 821     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 822     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 823     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 824 
 825     Label recursive;
 826 
 827     // Check if recursive.
 828     cmpptr(recursions_address, 0);
 829     jccb(Assembler::notZero, recursive);
 830 
 831     // Set owner to null.
 832     // Release to satisfy the JMM
 833     movptr(owner_address, NULL_WORD);
 834     // We need a full fence after clearing owner to avoid stranding.
 835     // StoreLoad achieves this.
 836     membar(StoreLoad);
 837 
 838     // Check if the entry lists are empty.
 839     movptr(reg_rax, cxq_address);
 840     orptr(reg_rax, EntryList_address);
 841     jccb(Assembler::zero, unlocked);    // If so we are done.
 842 
 843     // Check if there is a successor.
 844     cmpptr(succ_address, NULL_WORD);
 845     jccb(Assembler::notZero, unlocked); // If so we are done.
 846 
 847     // Save the monitor pointer in the current thread, so we can try to
 848     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 849     if (!UseObjectMonitorTable) {
 850       andptr(monitor, ~(int32_t)markWord::monitor_value);
 851     }
 852     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 853 
 854     orl(t, 1); // Fast Unlock ZF = 0
 855     jmpb(slow_path);
 856 
 857     // Recursive unlock.
 858     bind(recursive);
 859     decrement(recursions_address);
 860   }
 861 
 862   bind(unlocked);
 863   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
 864   xorl(t, t); // Fast Unlock ZF = 1
 865 
 866 #ifdef ASSERT
 867   // Check that unlocked label is reached with ZF set.
 868   Label zf_correct;
 869   jcc(Assembler::zero, zf_correct);
 870   stop("Fast Unlock ZF != 1");
 871 #endif
 872 
 873   bind(slow_path);
 874   if (stub != nullptr) {
 875     bind(stub->slow_path_continuation());
 876   }
 877 #ifdef ASSERT
 878   // Check that stub->continuation() label is reached with ZF not set.
 879   jccb(Assembler::notZero, zf_correct);
 880   stop("Fast Unlock ZF != 0");
 881   bind(zf_correct);
 882 #endif
 883   // C2 uses the value of ZF to determine the continuation.
 884 }
 885 
 886 //-------------------------------------------------------------------------------------------
 887 // Generic instructions support for use in .ad files C2 code generation
 888 
 889 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 890   if (dst != src) {
 891     movdqu(dst, src);
 892   }
 893   if (opcode == Op_AbsVD) {
 894     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 895   } else {
 896     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 897     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 898   }
 899 }
 900 
 901 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 902   if (opcode == Op_AbsVD) {
 903     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 904   } else {
 905     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 906     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 907   }
 908 }
 909 
 910 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 911   if (dst != src) {
 912     movdqu(dst, src);
 913   }
 914   if (opcode == Op_AbsVF) {
 915     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 916   } else {
 917     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 918     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 919   }
 920 }
 921 
 922 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 923   if (opcode == Op_AbsVF) {
 924     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 925   } else {
 926     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 927     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 928   }
 929 }
 930 
 931 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 932   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 933   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
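       // SSE4.1 blendvpd takes xmm0 as an implicit mask operand, which is why
       // the T_LONG paths below require tmp == xmm0 to hold the compare result.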
 934 
 935   if (opcode == Op_MinV) {
 936     if (elem_bt == T_BYTE) {
 937       pminsb(dst, src);
 938     } else if (elem_bt == T_SHORT) {
 939       pminsw(dst, src);
 940     } else if (elem_bt == T_INT) {
 941       pminsd(dst, src);
 942     } else {
 943       assert(elem_bt == T_LONG, "required");
 944       assert(tmp == xmm0, "required");
 945       assert_different_registers(dst, src, tmp);
 946       movdqu(xmm0, dst);
 947       pcmpgtq(xmm0, src);
 948       blendvpd(dst, src);  // xmm0 as mask
 949     }
 950   } else { // opcode == Op_MaxV
 951     if (elem_bt == T_BYTE) {
 952       pmaxsb(dst, src);
 953     } else if (elem_bt == T_SHORT) {
 954       pmaxsw(dst, src);
 955     } else if (elem_bt == T_INT) {
 956       pmaxsd(dst, src);
 957     } else {
 958       assert(elem_bt == T_LONG, "required");
 959       assert(tmp == xmm0, "required");
 960       assert_different_registers(dst, src, tmp);
 961       movdqu(xmm0, src);
 962       pcmpgtq(xmm0, dst);
 963       blendvpd(dst, src);  // xmm0 as mask
 964     }
 965   }
 966 }
 967 
 968 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 969                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 970                                  int vlen_enc) {
 971   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 972 
 973   if (opcode == Op_MinV) {
 974     if (elem_bt == T_BYTE) {
 975       vpminsb(dst, src1, src2, vlen_enc);
 976     } else if (elem_bt == T_SHORT) {
 977       vpminsw(dst, src1, src2, vlen_enc);
 978     } else if (elem_bt == T_INT) {
 979       vpminsd(dst, src1, src2, vlen_enc);
 980     } else {
 981       assert(elem_bt == T_LONG, "required");
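           // Without AVX-512 VL there is no packed 64-bit min/max instruction,
           // so emulate it with a signed compare and a blend (the max branch
           // below mirrors this with the blend operands swapped).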
 982       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 983         vpminsq(dst, src1, src2, vlen_enc);
 984       } else {
 985         assert_different_registers(dst, src1, src2);
 986         vpcmpgtq(dst, src1, src2, vlen_enc);
 987         vblendvpd(dst, src1, src2, dst, vlen_enc);
 988       }
 989     }
 990   } else { // opcode == Op_MaxV
 991     if (elem_bt == T_BYTE) {
 992       vpmaxsb(dst, src1, src2, vlen_enc);
 993     } else if (elem_bt == T_SHORT) {
 994       vpmaxsw(dst, src1, src2, vlen_enc);
 995     } else if (elem_bt == T_INT) {
 996       vpmaxsd(dst, src1, src2, vlen_enc);
 997     } else {
 998       assert(elem_bt == T_LONG, "required");
 999       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1000         vpmaxsq(dst, src1, src2, vlen_enc);
1001       } else {
1002         assert_different_registers(dst, src1, src2);
1003         vpcmpgtq(dst, src1, src2, vlen_enc);
1004         vblendvpd(dst, src2, src1, dst, vlen_enc);
1005       }
1006     }
1007   }
1008 }
1009 
1010 // Float/Double min max
1011 
1012 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1013                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1014                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1015                                    int vlen_enc) {
1016   assert(UseAVX > 0, "required");
1017   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1018          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1019   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1020   assert_different_registers(a, tmp, atmp, btmp);
1021   assert_different_registers(b, tmp, atmp, btmp);
1022 
1023   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1024   bool is_double_word = is_double_word_type(elem_bt);
1025 
1026   /* Note on 'non-obvious' assembly sequence:
1027    *
1028    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1029    * and Java on how they handle floats:
1030    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1031    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1032    *
1033    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1034    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1035    *                (only useful when signs differ, noop otherwise)
1036    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1037    *
1038    *  The following pseudocode describes the algorithm for max[FD] (the min algorithm is along similar lines):
1039    *   btmp = (b < +0.0) ? a : b
1040    *   atmp = (b < +0.0) ? b : a
1041    *   Tmp  = Max_Float(atmp , btmp)
1042    *   Res  = (atmp == NaN) ? atmp : Tmp
1043    */
1044 
1045   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1046   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1047   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1048   XMMRegister mask;
1049 
1050   if (!is_double_word && is_min) {
1051     mask = a;
1052     vblend = &MacroAssembler::vblendvps;
1053     vmaxmin = &MacroAssembler::vminps;
1054     vcmp = &MacroAssembler::vcmpps;
1055   } else if (!is_double_word && !is_min) {
1056     mask = b;
1057     vblend = &MacroAssembler::vblendvps;
1058     vmaxmin = &MacroAssembler::vmaxps;
1059     vcmp = &MacroAssembler::vcmpps;
1060   } else if (is_double_word && is_min) {
1061     mask = a;
1062     vblend = &MacroAssembler::vblendvpd;
1063     vmaxmin = &MacroAssembler::vminpd;
1064     vcmp = &MacroAssembler::vcmppd;
1065   } else {
1066     assert(is_double_word && !is_min, "sanity");
1067     mask = b;
1068     vblend = &MacroAssembler::vblendvpd;
1069     vmaxmin = &MacroAssembler::vmaxpd;
1070     vcmp = &MacroAssembler::vcmppd;
1071   }
1072 
1073   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1074   XMMRegister maxmin, scratch;
1075   if (dst == btmp) {
1076     maxmin = btmp;
1077     scratch = tmp;
1078   } else {
1079     maxmin = tmp;
1080     scratch = btmp;
1081   }
1082 
1083   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
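       // When pre-computing, materialize a full-lane sign mask up front: an
       // arithmetic right shift by 32 (floats) or a signed compare against zero
       // (doubles) produces all-ones lanes exactly for the negative inputs.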
1084   if (precompute_mask && !is_double_word) {
1085     vpsrad(tmp, mask, 32, vlen_enc);
1086     mask = tmp;
1087   } else if (precompute_mask && is_double_word) {
1088     vpxor(tmp, tmp, tmp, vlen_enc);
1089     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1090     mask = tmp;
1091   }
1092 
1093   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1094   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1095   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1096   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1097   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1098 }
1099 
1100 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1101                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1102                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1103                                     int vlen_enc) {
1104   assert(UseAVX > 2, "required");
1105   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1106          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1107   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1108   assert_different_registers(dst, a, atmp, btmp);
1109   assert_different_registers(dst, b, atmp, btmp);
1110 
1111   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1112   bool is_double_word = is_double_word_type(elem_bt);
1113   bool merge = true;
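       // evpmovd2m/evpmovq2m copy each lane's sign bit into ktmp, implementing
       // the same "bias negative inputs toward the second operand" trick that
       // vminmax_fp above performs with blends.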
1114 
1115   if (!is_double_word && is_min) {
1116     evpmovd2m(ktmp, a, vlen_enc);
1117     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1118     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1119     vminps(dst, atmp, btmp, vlen_enc);
1120     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1121     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1122   } else if (!is_double_word && !is_min) {
1123     evpmovd2m(ktmp, b, vlen_enc);
1124     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1125     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1126     vmaxps(dst, atmp, btmp, vlen_enc);
1127     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1128     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1129   } else if (is_double_word && is_min) {
1130     evpmovq2m(ktmp, a, vlen_enc);
1131     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1132     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1133     vminpd(dst, atmp, btmp, vlen_enc);
1134     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1135     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1136   } else {
1137     assert(is_double_word && !is_min, "sanity");
1138     evpmovq2m(ktmp, b, vlen_enc);
1139     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1140     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1141     vmaxpd(dst, atmp, btmp, vlen_enc);
1142     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1143     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1144   }
1145 }
1146 
1147 // Float/Double signum
1148 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1149   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1150 
1151   Label DONE_LABEL;
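       // ucomiss/ucomisd set ZF=PF=CF=1 for an unordered (NaN) compare, ZF=1 for
       // equal and CF=1 for below.  The 'equal' and 'parity' branches therefore
       // return the argument unchanged for +/-0.0 and NaN; 'above' keeps the
       // loaded 1.0 and the fall-through flips its sign to produce -1.0
       // (movflt/movdbl do not modify the flags).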
1152 
1153   if (opcode == Op_SignumF) {
1154     assert(UseSSE > 0, "required");
1155     ucomiss(dst, zero);
1156     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1157     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1158     movflt(dst, one);
1159     jcc(Assembler::above, DONE_LABEL);
1160     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1161   } else if (opcode == Op_SignumD) {
1162     assert(UseSSE > 1, "required");
1163     ucomisd(dst, zero);
1164     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1165     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1166     movdbl(dst, one);
1167     jcc(Assembler::above, DONE_LABEL);
1168     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1169   }
1170 
1171   bind(DONE_LABEL);
1172 }
1173 
1174 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1175   if (sign) {
1176     pmovsxbw(dst, src);
1177   } else {
1178     pmovzxbw(dst, src);
1179   }
1180 }
1181 
1182 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1183   if (sign) {
1184     vpmovsxbw(dst, src, vector_len);
1185   } else {
1186     vpmovzxbw(dst, src, vector_len);
1187   }
1188 }
1189 
1190 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1191   if (sign) {
1192     vpmovsxbd(dst, src, vector_len);
1193   } else {
1194     vpmovzxbd(dst, src, vector_len);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1199   if (sign) {
1200     vpmovsxwd(dst, src, vector_len);
1201   } else {
1202     vpmovzxwd(dst, src, vector_len);
1203   }
1204 }
1205 
1206 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1207                                      int shift, int vector_len) {
1208   if (opcode == Op_RotateLeftV) {
1209     if (etype == T_INT) {
1210       evprold(dst, src, shift, vector_len);
1211     } else {
1212       assert(etype == T_LONG, "expected type T_LONG");
1213       evprolq(dst, src, shift, vector_len);
1214     }
1215   } else {
1216     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1217     if (etype == T_INT) {
1218       evprord(dst, src, shift, vector_len);
1219     } else {
1220       assert(etype == T_LONG, "expected type T_LONG");
1221       evprorq(dst, src, shift, vector_len);
1222     }
1223   }
1224 }
1225 
1226 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1227                                      XMMRegister shift, int vector_len) {
1228   if (opcode == Op_RotateLeftV) {
1229     if (etype == T_INT) {
1230       evprolvd(dst, src, shift, vector_len);
1231     } else {
1232       assert(etype == T_LONG, "expected type T_LONG");
1233       evprolvq(dst, src, shift, vector_len);
1234     }
1235   } else {
1236     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1237     if (etype == T_INT) {
1238       evprorvd(dst, src, shift, vector_len);
1239     } else {
1240       assert(etype == T_LONG, "expected type T_LONG");
1241       evprorvq(dst, src, shift, vector_len);
1242     }
1243   }
1244 }
1245 
1246 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1247   if (opcode == Op_RShiftVI) {
1248     psrad(dst, shift);
1249   } else if (opcode == Op_LShiftVI) {
1250     pslld(dst, shift);
1251   } else {
1252     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1253     psrld(dst, shift);
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1258   switch (opcode) {
1259     case Op_RShiftVI:  psrad(dst, shift); break;
1260     case Op_LShiftVI:  pslld(dst, shift); break;
1261     case Op_URShiftVI: psrld(dst, shift); break;
1262 
1263     default: assert(false, "%s", NodeClassNames[opcode]);
1264   }
1265 }
1266 
1267 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1268   if (opcode == Op_RShiftVI) {
1269     vpsrad(dst, nds, shift, vector_len);
1270   } else if (opcode == Op_LShiftVI) {
1271     vpslld(dst, nds, shift, vector_len);
1272   } else {
1273     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1274     vpsrld(dst, nds, shift, vector_len);
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1279   switch (opcode) {
1280     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1281     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1282     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1283 
1284     default: assert(false, "%s", NodeClassNames[opcode]);
1285   }
1286 }
1287 
1288 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1289   switch (opcode) {
1290     case Op_RShiftVB:  // fall-through
1291     case Op_RShiftVS:  psraw(dst, shift); break;
1292 
1293     case Op_LShiftVB:  // fall-through
1294     case Op_LShiftVS:  psllw(dst, shift);   break;
1295 
1296     case Op_URShiftVS: // fall-through
1297     case Op_URShiftVB: psrlw(dst, shift);  break;
1298 
1299     default: assert(false, "%s", NodeClassNames[opcode]);
1300   }
1301 }
1302 
1303 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1304   switch (opcode) {
1305     case Op_RShiftVB:  // fall-through
1306     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1307 
1308     case Op_LShiftVB:  // fall-through
1309     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1310 
1311     case Op_URShiftVS: // fall-through
1312     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1313 
1314     default: assert(false, "%s", NodeClassNames[opcode]);
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1319   switch (opcode) {
1320     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1321     case Op_LShiftVL:  psllq(dst, shift); break;
1322     case Op_URShiftVL: psrlq(dst, shift); break;
1323 
1324     default: assert(false, "%s", NodeClassNames[opcode]);
1325   }
1326 }
1327 
1328 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1329   if (opcode == Op_RShiftVL) {
1330     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1331   } else if (opcode == Op_LShiftVL) {
1332     psllq(dst, shift);
1333   } else {
1334     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1335     psrlq(dst, shift);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1340   switch (opcode) {
1341     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1342     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1343     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1344 
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
1349 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1350   if (opcode == Op_RShiftVL) {
1351     evpsraq(dst, nds, shift, vector_len);
1352   } else if (opcode == Op_LShiftVL) {
1353     vpsllq(dst, nds, shift, vector_len);
1354   } else {
1355     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1356     vpsrlq(dst, nds, shift, vector_len);
1357   }
1358 }
1359 
1360 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1361   switch (opcode) {
1362     case Op_RShiftVB:  // fall-through
1363     case Op_RShiftVS:  // fall-through
1364     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1365 
1366     case Op_LShiftVB:  // fall-through
1367     case Op_LShiftVS:  // fall-through
1368     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1369 
1370     case Op_URShiftVB: // fall-through
1371     case Op_URShiftVS: // fall-through
1372     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1373 
1374     default: assert(false, "%s", NodeClassNames[opcode]);
1375   }
1376 }
1377 
1378 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1379   switch (opcode) {
1380     case Op_RShiftVB:  // fall-through
1381     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1382 
1383     case Op_LShiftVB:  // fall-through
1384     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1385 
1386     case Op_URShiftVB: // fall-through
1387     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1388 
1389     default: assert(false, "%s", NodeClassNames[opcode]);
1390   }
1391 }
1392 
1393 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1394   assert(UseAVX >= 2, "required");
1395   switch (opcode) {
1396     case Op_RShiftVL: {
1397       if (UseAVX > 2) {
1398         assert(tmp == xnoreg, "not used");
1399         if (!VM_Version::supports_avx512vl()) {
1400           vlen_enc = Assembler::AVX_512bit;
1401         }
1402         evpsravq(dst, src, shift, vlen_enc);
1403       } else {
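             // AVX2 has no variable arithmetic right shift for 64-bit lanes
             // (vpsravq is AVX-512 only), so it is emulated with logical shifts:
             // sra(x, s) == (srl(x, s) ^ srl(m, s)) - srl(m, s), where m holds
             // only the sign bit of each lane (vector_long_sign_mask).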
1404         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1405         vpsrlvq(dst, src, shift, vlen_enc);
1406         vpsrlvq(tmp, tmp, shift, vlen_enc);
1407         vpxor(dst, dst, tmp, vlen_enc);
1408         vpsubq(dst, dst, tmp, vlen_enc);
1409       }
1410       break;
1411     }
1412     case Op_LShiftVL: {
1413       assert(tmp == xnoreg, "not used");
1414       vpsllvq(dst, src, shift, vlen_enc);
1415       break;
1416     }
1417     case Op_URShiftVL: {
1418       assert(tmp == xnoreg, "not used");
1419       vpsrlvq(dst, src, shift, vlen_enc);
1420       break;
1421     }
1422     default: assert(false, "%s", NodeClassNames[opcode]);
1423   }
1424 }
1425 
1426 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1427 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1428   assert(opcode == Op_LShiftVB ||
1429          opcode == Op_RShiftVB ||
1430          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1431   bool sign = (opcode != Op_URShiftVB);
1432   assert(vector_len == 0, "required");
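       // Widen the byte elements to dwords, apply the variable dword shift,
       // mask the results back into byte range, and pack down to word lanes.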
1433   vextendbd(sign, dst, src, 1);
1434   vpmovzxbd(vtmp, shift, 1);
1435   varshiftd(opcode, dst, dst, vtmp, 1);
1436   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1437   vextracti128_high(vtmp, dst);
1438   vpackusdw(dst, dst, vtmp, 0);
1439 }
1440 
1441 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1442 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1443   assert(opcode == Op_LShiftVB ||
1444          opcode == Op_RShiftVB ||
1445          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1446   bool sign = (opcode != Op_URShiftVB);
1447   int ext_vector_len = vector_len + 1;
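       // Widen the byte elements to words in the next larger vector size, apply
       // the variable word shift, mask back into byte range, and pack the words
       // down to bytes, with a cross-lane fixup (vpermq) when the result is wider
       // than 128 bits.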
1448   vextendbw(sign, dst, src, ext_vector_len);
1449   vpmovzxbw(vtmp, shift, ext_vector_len);
1450   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1451   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1452   if (vector_len == 0) {
1453     vextracti128_high(vtmp, dst);
1454     vpackuswb(dst, dst, vtmp, vector_len);
1455   } else {
1456     vextracti64x4_high(vtmp, dst);
1457     vpackuswb(dst, dst, vtmp, vector_len);
1458     vpermq(dst, dst, 0xD8, vector_len);
1459   }
1460 }
1461 
1462 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1463   switch(typ) {
1464     case T_BYTE:
1465       pinsrb(dst, val, idx);
1466       break;
1467     case T_SHORT:
1468       pinsrw(dst, val, idx);
1469       break;
1470     case T_INT:
1471       pinsrd(dst, val, idx);
1472       break;
1473     case T_LONG:
1474       pinsrq(dst, val, idx);
1475       break;
1476     default:
1477       assert(false,"Should not reach here.");
1478       break;
1479   }
1480 }
1481 
1482 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1483   switch(typ) {
1484     case T_BYTE:
1485       vpinsrb(dst, src, val, idx);
1486       break;
1487     case T_SHORT:
1488       vpinsrw(dst, src, val, idx);
1489       break;
1490     case T_INT:
1491       vpinsrd(dst, src, val, idx);
1492       break;
1493     case T_LONG:
1494       vpinsrq(dst, src, val, idx);
1495       break;
1496     default:
1497       assert(false,"Should not reach here.");
1498       break;
1499   }
1500 }
1501 
1502 #ifdef _LP64
1503 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1504                                                 XMMRegister dst, Register base,
1505                                                 Register idx_base,
1506                                                 Register offset, Register mask,
1507                                                 Register mask_idx, Register rtmp,
1508                                                 int vlen_enc) {
1509   vpxor(dst, dst, dst, vlen_enc);
1510   if (elem_bt == T_SHORT) {
1511     for (int i = 0; i < 4; i++) {
1512       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1513       Label skip_load;
1514       btq(mask, mask_idx);
1515       jccb(Assembler::carryClear, skip_load);
1516       movl(rtmp, Address(idx_base, i * 4));
1517       if (offset != noreg) {
1518         addl(rtmp, offset);
1519       }
1520       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1521       bind(skip_load);
1522       incq(mask_idx);
1523     }
1524   } else {
1525     assert(elem_bt == T_BYTE, "");
1526     for (int i = 0; i < 8; i++) {
1527       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1528       Label skip_load;
1529       btq(mask, mask_idx);
1530       jccb(Assembler::carryClear, skip_load);
1531       movl(rtmp, Address(idx_base, i * 4));
1532       if (offset != noreg) {
1533         addl(rtmp, offset);
1534       }
1535       pinsrb(dst, Address(base, rtmp), i);
1536       bind(skip_load);
1537       incq(mask_idx);
1538     }
1539   }
1540 }
1541 #endif // _LP64
1542 
1543 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1544                                          Register base, Register idx_base,
1545                                          Register offset, Register rtmp,
1546                                          int vlen_enc) {
1547   vpxor(dst, dst, dst, vlen_enc);
1548   if (elem_bt == T_SHORT) {
1549     for (int i = 0; i < 4; i++) {
1550       // dst[i] = src[offset + idx_base[i]]
1551       movl(rtmp, Address(idx_base, i * 4));
1552       if (offset != noreg) {
1553         addl(rtmp, offset);
1554       }
1555       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1556     }
1557   } else {
1558     assert(elem_bt == T_BYTE, "");
1559     for (int i = 0; i < 8; i++) {
1560       // dst[i] = src[offset + idx_base[i]]
1561       movl(rtmp, Address(idx_base, i * 4));
1562       if (offset != noreg) {
1563         addl(rtmp, offset);
1564       }
1565       pinsrb(dst, Address(base, rtmp), i);
1566     }
1567   }
1568 }
1569 
1570 /*
1571  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1572  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1573  * A slice holds 8 byte values or 4 short values. This is followed by a vector
1574  * permutation that places the slice into the appropriate vector lane
1575  * locations of the destination vector. The following pseudo code describes the
1576  * algorithm in detail:
1577  *
1578  * DST_VEC = ZERO_VEC
1579  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1580  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1581  * FOREACH_ITER:
1582  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1583  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1584  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1585  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1586  *
1587  * With each iteration, the doubleword permute indices (0, 1) corresponding
1588  * to the gathered quadword are shifted right by two lane positions.
1589  *
1590  */
1591 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1592                                         Register base, Register idx_base,
1593                                         Register offset, Register mask,
1594                                         XMMRegister xtmp1, XMMRegister xtmp2,
1595                                         XMMRegister temp_dst, Register rtmp,
1596                                         Register mask_idx, Register length,
1597                                         int vector_len, int vlen_enc) {
1598   Label GATHER8_LOOP;
1599   assert(is_subword_type(elem_ty), "");
1600   movl(length, vector_len);
1601   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1602   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1603   vallones(xtmp2, vlen_enc);
1604   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1605   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1606   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1607 
1608   bind(GATHER8_LOOP);
1609     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1610     if (mask == noreg) {
1611       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1612     } else {
1613       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1614     }
1615     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1616     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1617     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1618     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1619     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1620     vpor(dst, dst, temp_dst, vlen_enc);
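         // Advance to the next 64-bit slice: each iteration consumes 8 int indices
         // (32 bytes) for byte gathers or 4 int indices (16 bytes) for short gathers.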
1621     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1622     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1623     jcc(Assembler::notEqual, GATHER8_LOOP);
1624 }
1625 
1626 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1627   switch(typ) {
1628     case T_INT:
1629       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1630       break;
1631     case T_FLOAT:
1632       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1633       break;
1634     case T_LONG:
1635       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1636       break;
1637     case T_DOUBLE:
1638       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1639       break;
1640     default:
1641       assert(false,"Should not reach here.");
1642       break;
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1647   switch(typ) {
1648     case T_INT:
1649       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1650       break;
1651     case T_FLOAT:
1652       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1653       break;
1654     case T_LONG:
1655       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1656       break;
1657     case T_DOUBLE:
1658       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1659       break;
1660     default:
1661       assert(false,"Should not reach here.");
1662       break;
1663   }
1664 }
1665 
1666 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1667   switch(typ) {
1668     case T_INT:
1669       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1670       break;
1671     case T_FLOAT:
1672       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1673       break;
1674     case T_LONG:
1675       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1676       break;
1677     case T_DOUBLE:
1678       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1679       break;
1680     default:
1681       assert(false,"Should not reach here.");
1682       break;
1683   }
1684 }
1685 
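     // Expand a vector of booleans (one 0/1 byte per element) into a vector mask
     // (all bits of an element set when the boolean is true): negate the bytes to
     // get 0 / -1, then sign-extend each byte to the element width.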
1686 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1687   if (vlen_in_bytes <= 16) {
1688     pxor (dst, dst);
1689     psubb(dst, src);
1690     switch (elem_bt) {
1691       case T_BYTE:   /* nothing to do */ break;
1692       case T_SHORT:  pmovsxbw(dst, dst); break;
1693       case T_INT:    pmovsxbd(dst, dst); break;
1694       case T_FLOAT:  pmovsxbd(dst, dst); break;
1695       case T_LONG:   pmovsxbq(dst, dst); break;
1696       case T_DOUBLE: pmovsxbq(dst, dst); break;
1697 
1698       default: assert(false, "%s", type2name(elem_bt));
1699     }
1700   } else {
1701     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1702     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1703 
1704     vpxor (dst, dst, dst, vlen_enc);
1705     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1706 
1707     switch (elem_bt) {
1708       case T_BYTE:   /* nothing to do */            break;
1709       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1710       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1711       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1712       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1713       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1714 
1715       default: assert(false, "%s", type2name(elem_bt));
1716     }
1717   }
1718 }
1719 
1720 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1721   if (novlbwdq) {
1722     vpmovsxbd(xtmp, src, vlen_enc);
1723     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1724             Assembler::eq, true, vlen_enc, noreg);
1725   } else {
1726     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1727     vpsubb(xtmp, xtmp, src, vlen_enc);
1728     evpmovb2m(dst, xtmp, vlen_enc);
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1733   switch (vlen_in_bytes) {
1734     case 4:  movdl(dst, src);   break;
1735     case 8:  movq(dst, src);    break;
1736     case 16: movdqu(dst, src);  break;
1737     case 32: vmovdqu(dst, src); break;
1738     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1739     default: ShouldNotReachHere();
1740   }
1741 }
1742 
1743 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1744   assert(rscratch != noreg || always_reachable(src), "missing");
1745 
1746   if (reachable(src)) {
1747     load_vector(dst, as_Address(src), vlen_in_bytes);
1748   } else {
1749     lea(rscratch, src);
1750     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1751   }
1752 }
1753 
1754 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1755   int vlen_enc = vector_length_encoding(vlen);
1756   if (VM_Version::supports_avx()) {
1757     if (bt == T_LONG) {
1758       if (VM_Version::supports_avx2()) {
1759         vpbroadcastq(dst, src, vlen_enc);
1760       } else {
1761         vmovddup(dst, src, vlen_enc);
1762       }
1763     } else if (bt == T_DOUBLE) {
1764       if (vlen_enc != Assembler::AVX_128bit) {
1765         vbroadcastsd(dst, src, vlen_enc, noreg);
1766       } else {
1767         vmovddup(dst, src, vlen_enc);
1768       }
1769     } else {
1770       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1771         vpbroadcastd(dst, src, vlen_enc);
1772       } else {
1773         vbroadcastss(dst, src, vlen_enc);
1774       }
1775     }
1776   } else if (VM_Version::supports_sse3()) {
1777     movddup(dst, src);
1778   } else {
1779     movq(dst, src);
1780     if (vlen == 16) {
1781       punpcklqdq(dst, dst);
1782     }
1783   }
1784 }
1785 
1786 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1787   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two adjacent types is 64 bytes.
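       // For example: T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
       // T_FLOAT -> 256, T_DOUBLE -> 320.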
1788   int offset = exact_log2(type2aelembytes(bt)) << 6;
1789   if (is_floating_point_type(bt)) {
1790     offset += 128;
1791   }
1792   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1793   load_vector(dst, addr, vlen_in_bytes);
1794 }
1795 
1796 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1797 
1798 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1799   int vector_len = Assembler::AVX_128bit;
1800 
1801   switch (opcode) {
1802     case Op_AndReductionV:  pand(dst, src); break;
1803     case Op_OrReductionV:   por (dst, src); break;
1804     case Op_XorReductionV:  pxor(dst, src); break;
1805     case Op_MinReductionV:
1806       switch (typ) {
1807         case T_BYTE:        pminsb(dst, src); break;
1808         case T_SHORT:       pminsw(dst, src); break;
1809         case T_INT:         pminsd(dst, src); break;
1810         case T_LONG:        assert(UseAVX > 2, "required");
1811                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1812         default:            assert(false, "wrong type");
1813       }
1814       break;
1815     case Op_MaxReductionV:
1816       switch (typ) {
1817         case T_BYTE:        pmaxsb(dst, src); break;
1818         case T_SHORT:       pmaxsw(dst, src); break;
1819         case T_INT:         pmaxsd(dst, src); break;
1820         case T_LONG:        assert(UseAVX > 2, "required");
1821                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1822         default:            assert(false, "wrong type");
1823       }
1824       break;
1825     case Op_AddReductionVF: addss(dst, src); break;
1826     case Op_AddReductionVD: addsd(dst, src); break;
1827     case Op_AddReductionVI:
1828       switch (typ) {
1829         case T_BYTE:        paddb(dst, src); break;
1830         case T_SHORT:       paddw(dst, src); break;
1831         case T_INT:         paddd(dst, src); break;
1832         default:            assert(false, "wrong type");
1833       }
1834       break;
1835     case Op_AddReductionVL: paddq(dst, src); break;
1836     case Op_MulReductionVF: mulss(dst, src); break;
1837     case Op_MulReductionVD: mulsd(dst, src); break;
1838     case Op_MulReductionVI:
1839       switch (typ) {
1840         case T_SHORT:       pmullw(dst, src); break;
1841         case T_INT:         pmulld(dst, src); break;
1842         default:            assert(false, "wrong type");
1843       }
1844       break;
1845     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1846                             evpmullq(dst, dst, src, vector_len); break;
1847     default:                assert(false, "wrong opcode");
1848   }
1849 }
1850 
1851 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1852   switch (opcode) {
1853     case Op_AddReductionVF: addps(dst, src); break;
1854     case Op_AddReductionVD: addpd(dst, src); break;
1855     case Op_MulReductionVF: mulps(dst, src); break;
1856     case Op_MulReductionVD: mulpd(dst, src); break;
1857     default:                assert(false, "%s", NodeClassNames[opcode]);
1858   }
1859 }
1860 
1861 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1862   int vector_len = Assembler::AVX_256bit;
1863 
1864   switch (opcode) {
1865     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1866     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1867     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1868     case Op_MinReductionV:
1869       switch (typ) {
1870         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1871         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1872         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1873         case T_LONG:        assert(UseAVX > 2, "required");
1874                             vpminsq(dst, src1, src2, vector_len); break;
1875         default:            assert(false, "wrong type");
1876       }
1877       break;
1878     case Op_MaxReductionV:
1879       switch (typ) {
1880         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1881         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1882         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1883         case T_LONG:        assert(UseAVX > 2, "required");
1884                             vpmaxsq(dst, src1, src2, vector_len); break;
1885         default:            assert(false, "wrong type");
1886       }
1887       break;
1888     case Op_AddReductionVI:
1889       switch (typ) {
1890         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1891         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1892         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1893         default:            assert(false, "wrong type");
1894       }
1895       break;
1896     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1897     case Op_MulReductionVI:
1898       switch (typ) {
1899         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1900         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1901         default:            assert(false, "wrong type");
1902       }
1903       break;
1904     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1905     default:                assert(false, "wrong opcode");
1906   }
1907 }
1908 
1909 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1910   int vector_len = Assembler::AVX_256bit;
1911 
1912   switch (opcode) {
1913     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1914     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1915     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1916     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1917     default:                assert(false, "%s", NodeClassNames[opcode]);
1918   }
1919 }
1920 
1921 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1922                                   XMMRegister dst, XMMRegister src,
1923                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1924   switch (opcode) {
1925     case Op_AddReductionVF:
1926     case Op_MulReductionVF:
1927       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1928       break;
1929 
1930     case Op_AddReductionVD:
1931     case Op_MulReductionVD:
1932       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1933       break;
1934 
1935     default: assert(false, "wrong opcode");
1936   }
1937 }
1938 
1939 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1940                                             XMMRegister dst, XMMRegister src,
1941                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1942   switch (opcode) {
1943     case Op_AddReductionVF:
1944     case Op_MulReductionVF:
1945       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1946       break;
1947 
1948     case Op_AddReductionVD:
1949     case Op_MulReductionVD:
1950       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1951       break;
1952 
1953     default: assert(false, "%s", NodeClassNames[opcode]);
1954   }
1955 }
1956 
1957 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1958                              Register dst, Register src1, XMMRegister src2,
1959                              XMMRegister vtmp1, XMMRegister vtmp2) {
1960   switch (vlen) {
1961     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965 
1966     default: assert(false, "wrong vector length");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1971                              Register dst, Register src1, XMMRegister src2,
1972                              XMMRegister vtmp1, XMMRegister vtmp2) {
1973   switch (vlen) {
1974     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1978 
1979     default: assert(false, "wrong vector length");
1980   }
1981 }
1982 
1983 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1984                              Register dst, Register src1, XMMRegister src2,
1985                              XMMRegister vtmp1, XMMRegister vtmp2) {
1986   switch (vlen) {
1987     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1990     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1991 
1992     default: assert(false, "wrong vector length");
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1997                              Register dst, Register src1, XMMRegister src2,
1998                              XMMRegister vtmp1, XMMRegister vtmp2) {
1999   switch (vlen) {
2000     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2004 
2005     default: assert(false, "wrong vector length");
2006   }
2007 }
2008 
2009 #ifdef _LP64
2010 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2011                              Register dst, Register src1, XMMRegister src2,
2012                              XMMRegister vtmp1, XMMRegister vtmp2) {
2013   switch (vlen) {
2014     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2016     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2017 
2018     default: assert(false, "wrong vector length");
2019   }
2020 }
2021 #endif // _LP64
2022 
2023 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2024   switch (vlen) {
2025     case 2:
2026       assert(vtmp2 == xnoreg, "");
2027       reduce2F(opcode, dst, src, vtmp1);
2028       break;
2029     case 4:
2030       assert(vtmp2 == xnoreg, "");
2031       reduce4F(opcode, dst, src, vtmp1);
2032       break;
2033     case 8:
2034       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2035       break;
2036     case 16:
2037       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2038       break;
2039     default: assert(false, "wrong vector length");
2040   }
2041 }
2042 
2043 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2044   switch (vlen) {
2045     case 2:
2046       assert(vtmp2 == xnoreg, "");
2047       reduce2D(opcode, dst, src, vtmp1);
2048       break;
2049     case 4:
2050       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2051       break;
2052     case 8:
2053       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2054       break;
2055     default: assert(false, "wrong vector length");
2056   }
2057 }
2058 
2059 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2060   switch (vlen) {
2061     case 2:
2062       assert(vtmp1 == xnoreg, "");
2063       assert(vtmp2 == xnoreg, "");
2064       unorderedReduce2F(opcode, dst, src);
2065       break;
2066     case 4:
2067       assert(vtmp2 == xnoreg, "");
2068       unorderedReduce4F(opcode, dst, src, vtmp1);
2069       break;
2070     case 8:
2071       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2072       break;
2073     case 16:
2074       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2075       break;
2076     default: assert(false, "wrong vector length");
2077   }
2078 }
2079 
2080 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2081   switch (vlen) {
2082     case 2:
2083       assert(vtmp1 == xnoreg, "");
2084       assert(vtmp2 == xnoreg, "");
2085       unorderedReduce2D(opcode, dst, src);
2086       break;
2087     case 4:
2088       assert(vtmp2 == xnoreg, "");
2089       unorderedReduce4D(opcode, dst, src, vtmp1);
2090       break;
2091     case 8:
2092       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2093       break;
2094     default: assert(false, "wrong vector length");
2095   }
2096 }
2097 
2098 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   if (opcode == Op_AddReductionVI) {
2100     if (vtmp1 != src2) {
2101       movdqu(vtmp1, src2);
2102     }
2103     phaddd(vtmp1, vtmp1);
2104   } else {
2105     pshufd(vtmp1, src2, 0x1);
2106     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2107   }
2108   movdl(vtmp2, src1);
2109   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2110   movdl(dst, vtmp1);
2111 }
2112 
2113 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2114   if (opcode == Op_AddReductionVI) {
2115     if (vtmp1 != src2) {
2116       movdqu(vtmp1, src2);
2117     }
2118     phaddd(vtmp1, src2);
2119     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2120   } else {
2121     pshufd(vtmp2, src2, 0xE);
2122     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2123     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2124   }
2125 }
2126 
2127 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2128   if (opcode == Op_AddReductionVI) {
2129     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2130     vextracti128_high(vtmp2, vtmp1);
2131     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2132     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2133   } else {
2134     vextracti128_high(vtmp1, src2);
2135     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2136     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2137   }
2138 }
2139 
2140 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2141   vextracti64x4_high(vtmp2, src2);
2142   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2143   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2144 }
2145 
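     // Reduce eight bytes: fold 8 -> 4 -> 2 -> 1 byte lanes, widen the surviving
     // byte to an int, combine it with the scalar accumulator src1, and
     // sign-extend the final byte result.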
2146 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2147   pshufd(vtmp2, src2, 0x1);
2148   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2149   movdqu(vtmp1, vtmp2);
2150   psrldq(vtmp1, 2);
2151   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2152   movdqu(vtmp2, vtmp1);
2153   psrldq(vtmp2, 1);
2154   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2155   movdl(vtmp2, src1);
2156   pmovsxbd(vtmp1, vtmp1);
2157   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2158   pextrb(dst, vtmp1, 0x0);
2159   movsbl(dst, dst);
2160 }
2161 
2162 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2163   pshufd(vtmp1, src2, 0xE);
2164   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2165   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2166 }
2167 
2168 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2169   vextracti128_high(vtmp2, src2);
2170   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2171   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2172 }
2173 
2174 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2175   vextracti64x4_high(vtmp1, src2);
2176   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2177   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2178 }
2179 
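     // x86 has no byte-wise vector multiply, so byte multiply reductions are
     // widened to shorts and reduced in 16-bit lanes.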
2180 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2181   pmovsxbw(vtmp2, src2);
2182   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2183 }
2184 
2185 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2186   if (UseAVX > 1) {
2187     int vector_len = Assembler::AVX_256bit;
2188     vpmovsxbw(vtmp1, src2, vector_len);
2189     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2190   } else {
2191     pmovsxbw(vtmp2, src2);
2192     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2193     pshufd(vtmp2, src2, 0xE);  // bring the upper 8 bytes of src2 into the low quadword
2194     pmovsxbw(vtmp2, vtmp2);    // and sign-extend them to shorts
2195     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2196   }
2197 }
2198 
2199 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2200   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2201     int vector_len = Assembler::AVX_512bit;
2202     vpmovsxbw(vtmp1, src2, vector_len);
2203     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2204   } else {
2205     assert(UseAVX >= 2,"Should not reach here.");
2206     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2207     vextracti128_high(vtmp2, src2);
2208     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2209   }
2210 }
2211 
2212 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2213   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2214   vextracti64x4_high(vtmp2, src2);
2215   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2216 }
2217 
2218 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2219   if (opcode == Op_AddReductionVI) {
2220     if (vtmp1 != src2) {
2221       movdqu(vtmp1, src2);
2222     }
2223     phaddw(vtmp1, vtmp1);
2224     phaddw(vtmp1, vtmp1);
2225   } else {
2226     pshufd(vtmp2, src2, 0x1);
2227     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2228     movdqu(vtmp1, vtmp2);
2229     psrldq(vtmp1, 2);
2230     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2231   }
2232   movdl(vtmp2, src1);
2233   pmovsxwd(vtmp1, vtmp1);
2234   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2235   pextrw(dst, vtmp1, 0x0);
2236   movswl(dst, dst);
2237 }
2238 
2239 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2240   if (opcode == Op_AddReductionVI) {
2241     if (vtmp1 != src2) {
2242       movdqu(vtmp1, src2);
2243     }
2244     phaddw(vtmp1, src2);
2245   } else {
2246     pshufd(vtmp1, src2, 0xE);
2247     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2248   }
2249   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2250 }
2251 
2252 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2253   if (opcode == Op_AddReductionVI) {
2254     int vector_len = Assembler::AVX_256bit;
2255     vphaddw(vtmp2, src2, src2, vector_len);
2256     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2257   } else {
2258     vextracti128_high(vtmp2, src2);
2259     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2260   }
2261   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2262 }
2263 
2264 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2265   int vector_len = Assembler::AVX_256bit;
2266   vextracti64x4_high(vtmp1, src2);
2267   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2268   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2269 }
2270 
2271 #ifdef _LP64
2272 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2273   pshufd(vtmp2, src2, 0xE);
2274   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2275   movdq(vtmp1, src1);
2276   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2277   movdq(dst, vtmp1);
2278 }
2279 
2280 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2281   vextracti128_high(vtmp1, src2);
2282   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2283   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2284 }
2285 
2286 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2287   vextracti64x4_high(vtmp2, src2);
2288   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2289   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2290 }
2291 
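     // Produce a K-register mask with the low 'len' bits set: start from all ones,
     // clear the bits at and above 'len' with BZHI, and move the result into dst.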
2292 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2293   mov64(temp, -1L);
2294   bzhiq(temp, temp, len);
2295   kmovql(dst, temp);
2296 }
2297 #endif // _LP64
2298 
2299 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2300   reduce_operation_128(T_FLOAT, opcode, dst, src);
2301   pshufd(vtmp, src, 0x1);
2302   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2303 }
2304 
2305 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2306   reduce2F(opcode, dst, src, vtmp);
2307   pshufd(vtmp, src, 0x2);
2308   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2309   pshufd(vtmp, src, 0x3);
2310   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2311 }
2312 
2313 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2314   reduce4F(opcode, dst, src, vtmp2);
2315   vextractf128_high(vtmp2, src);
2316   reduce4F(opcode, dst, vtmp2, vtmp1);
2317 }
2318 
2319 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2321   vextracti64x4_high(vtmp1, src);
2322   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2323 }
2324 
2325 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2326   pshufd(dst, src, 0x1);
2327   reduce_operation_128(T_FLOAT, opcode, dst, src);
2328 }
2329 
2330 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2331   pshufd(vtmp, src, 0xE);
2332   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2333   unorderedReduce2F(opcode, dst, vtmp);
2334 }
2335 
2336 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2337   vextractf128_high(vtmp1, src);
2338   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2339   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2340 }
2341 
2342 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2343   vextractf64x4_high(vtmp2, src);
2344   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2345   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2346 }
2347 
2348 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2349   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2350   pshufd(vtmp, src, 0xE);
2351   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2352 }
2353 
2354 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2355   reduce2D(opcode, dst, src, vtmp2);
2356   vextractf128_high(vtmp2, src);
2357   reduce2D(opcode, dst, vtmp2, vtmp1);
2358 }
2359 
2360 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2361   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2362   vextracti64x4_high(vtmp1, src);
2363   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2364 }
2365 
2366 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2367   pshufd(dst, src, 0xE);
2368   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2369 }
2370 
2371 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2372   vextractf128_high(vtmp, src);
2373   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2374   unorderedReduce2D(opcode, dst, vtmp);
2375 }
2376 
2377 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2378   vextractf64x4_high(vtmp2, src);
2379   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2380   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2381 }
2382 
2383 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2384   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2385 }
2386 
2387 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2388   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2389 }
2390 
2391 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2392                                  int vec_enc) {
2393   switch(elem_bt) {
2394     case T_INT:
2395     case T_FLOAT:
2396       vmaskmovps(dst, src, mask, vec_enc);
2397       break;
2398     case T_LONG:
2399     case T_DOUBLE:
2400       vmaskmovpd(dst, src, mask, vec_enc);
2401       break;
2402     default:
2403       fatal("Unsupported type %s", type2name(elem_bt));
2404       break;
2405   }
2406 }
2407 
2408 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2409                                  int vec_enc) {
2410   switch(elem_bt) {
2411     case T_INT:
2412     case T_FLOAT:
2413       vmaskmovps(dst, src, mask, vec_enc);
2414       break;
2415     case T_LONG:
2416     case T_DOUBLE:
2417       vmaskmovpd(dst, src, mask, vec_enc);
2418       break;
2419     default:
2420       fatal("Unsupported type %s", type2name(elem_bt));
2421       break;
2422   }
2423 }
2424 
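     // Reduce a float vector to a single lane by repeatedly folding the upper half
     // onto the lower half (256/128-bit extracts for the wide halves, in-lane
     // permutes for the last steps) and combining the halves with vminmax_fp.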
2425 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2426                                           XMMRegister dst, XMMRegister src,
2427                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2428                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2429   const int permconst[] = {1, 14};
2430   XMMRegister wsrc = src;
2431   XMMRegister wdst = xmm_0;
2432   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2433 
2434   int vlen_enc = Assembler::AVX_128bit;
2435   if (vlen == 16) {
2436     vlen_enc = Assembler::AVX_256bit;
2437   }
2438 
2439   for (int i = log2(vlen) - 1; i >=0; i--) {
2440     if (i == 0 && !is_dst_valid) {
2441       wdst = dst;
2442     }
2443     if (i == 3) {
2444       vextracti64x4_high(wtmp, wsrc);
2445     } else if (i == 2) {
2446       vextracti128_high(wtmp, wsrc);
2447     } else { // i = [0,1]
2448       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2449     }
2450     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2451     wsrc = wdst;
2452     vlen_enc = Assembler::AVX_128bit;
2453   }
2454   if (is_dst_valid) {
2455     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2456   }
2457 }
2458 
2459 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2460                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2461                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2462   XMMRegister wsrc = src;
2463   XMMRegister wdst = xmm_0;
2464   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2465   int vlen_enc = Assembler::AVX_128bit;
2466   if (vlen == 8) {
2467     vlen_enc = Assembler::AVX_256bit;
2468   }
2469   for (int i = log2(vlen) - 1; i >=0; i--) {
2470     if (i == 0 && !is_dst_valid) {
2471       wdst = dst;
2472     }
2473     if (i == 1) {
2474       vextracti128_high(wtmp, wsrc);
2475     } else if (i == 2) {
2476       vextracti64x4_high(wtmp, wsrc);
2477     } else {
2478       assert(i == 0, "%d", i);
2479       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2480     }
2481     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2482     wsrc = wdst;
2483     vlen_enc = Assembler::AVX_128bit;
2484   }
2485   if (is_dst_valid) {
2486     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2487   }
2488 }
2489 
2490 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2491   switch (bt) {
2492     case T_BYTE:  pextrb(dst, src, idx); break;
2493     case T_SHORT: pextrw(dst, src, idx); break;
2494     case T_INT:   pextrd(dst, src, idx); break;
2495     case T_LONG:  pextrq(dst, src, idx); break;
2496 
2497     default:
2498       assert(false,"Should not reach here.");
2499       break;
2500   }
2501 }
2502 
2503 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2504   int esize =  type2aelembytes(typ);
2505   int elem_per_lane = 16/esize;
2506   int lane = elemindex / elem_per_lane;
2507   int eindex = elemindex % elem_per_lane;
2508 
2509   if (lane >= 2) {
2510     assert(UseAVX > 2, "required");
2511     vextractf32x4(dst, src, lane & 3);
2512     return dst;
2513   } else if (lane > 0) {
2514     assert(UseAVX > 0, "required");
2515     vextractf128(dst, src, lane);
2516     return dst;
2517   } else {
2518     return src;
2519   }
2520 }
2521 
2522 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2523   if (typ == T_BYTE) {
2524     movsbl(dst, dst);
2525   } else if (typ == T_SHORT) {
2526     movswl(dst, dst);
2527   }
2528 }
2529 
2530 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2531   int esize =  type2aelembytes(typ);
2532   int elem_per_lane = 16/esize;
2533   int eindex = elemindex % elem_per_lane;
2534   assert(is_integral_type(typ),"required");
2535 
2536   if (eindex == 0) {
2537     if (typ == T_LONG) {
2538       movq(dst, src);
2539     } else {
2540       movdl(dst, src);
2541       movsxl(typ, dst);
2542     }
2543   } else {
2544     extract(typ, dst, src, eindex);
2545     movsxl(typ, dst);
2546   }
2547 }
2548 
2549 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2550   int esize =  type2aelembytes(typ);
2551   int elem_per_lane = 16/esize;
2552   int eindex = elemindex % elem_per_lane;
2553   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2554 
2555   if (eindex == 0) {
2556     movq(dst, src);
2557   } else {
2558     if (typ == T_FLOAT) {
2559       if (UseAVX == 0) {
2560         movdqu(dst, src);
2561         shufps(dst, dst, eindex);
2562       } else {
2563         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2564       }
2565     } else {
2566       if (UseAVX == 0) {
2567         movdqu(dst, src);
2568         psrldq(dst, eindex*esize);
2569       } else {
2570         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2571       }
2572       movq(dst, dst);
2573     }
2574   }
2575   // Zero upper bits
2576   if (typ == T_FLOAT) {
2577     if (UseAVX == 0) {
2578       assert(vtmp != xnoreg, "required.");
2579       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2580       pand(dst, vtmp);
2581     } else {
2582       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2583     }
2584   }
2585 }
2586 
2587 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2588   switch(typ) {
2589     case T_BYTE:
2590     case T_BOOLEAN:
2591       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2592       break;
2593     case T_SHORT:
2594     case T_CHAR:
2595       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2596       break;
2597     case T_INT:
2598     case T_FLOAT:
2599       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2600       break;
2601     case T_LONG:
2602     case T_DOUBLE:
2603       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2604       break;
2605     default:
2606       assert(false,"Should not reach here.");
2607       break;
2608   }
2609 }
2610 
2611 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2612   assert(rscratch != noreg || always_reachable(src2), "missing");
2613 
2614   switch(typ) {
2615     case T_BOOLEAN:
2616     case T_BYTE:
2617       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2618       break;
2619     case T_CHAR:
2620     case T_SHORT:
2621       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2622       break;
2623     case T_INT:
2624     case T_FLOAT:
2625       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2626       break;
2627     case T_LONG:
2628     case T_DOUBLE:
2629       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2630       break;
2631     default:
2632       assert(false,"Should not reach here.");
2633       break;
2634   }
2635 }
2636 
2637 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2638   switch(typ) {
2639     case T_BYTE:
2640       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2641       break;
2642     case T_SHORT:
2643       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2644       break;
2645     case T_INT:
2646     case T_FLOAT:
2647       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2648       break;
2649     case T_LONG:
2650     case T_DOUBLE:
2651       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2652       break;
2653     default:
2654       assert(false,"Should not reach here.");
2655       break;
2656   }
2657 }
2658 
2659 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2660   assert(vlen_in_bytes <= 32, "");
2661   int esize = type2aelembytes(bt);
2662   if (vlen_in_bytes == 32) {
2663     assert(vtmp == xnoreg, "required.");
2664     if (esize >= 4) {
2665       vtestps(src1, src2, AVX_256bit);
2666     } else {
2667       vptest(src1, src2, AVX_256bit);
2668     }
2669     return;
2670   }
2671   if (vlen_in_bytes < 16) {
2672     // Duplicate the lower part of src1 to fill the whole register;
2673     // there is no need to do so for src2.
2674     assert(vtmp != xnoreg, "required");
2675     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2676     pshufd(vtmp, src1, shuffle_imm);
2677   } else {
2678     assert(vtmp == xnoreg, "required");
2679     vtmp = src1;
2680   }
2681   if (esize >= 4 && VM_Version::supports_avx()) {
2682     vtestps(vtmp, src2, AVX_128bit);
2683   } else {
2684     ptest(vtmp, src2);
2685   }
2686 }
2687 
2688 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2689   assert(UseAVX >= 2, "required");
2690 #ifdef ASSERT
2691   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2692   bool is_bw_supported = VM_Version::supports_avx512bw();
2693   if (is_bw && !is_bw_supported) {
2694     assert(vlen_enc != Assembler::AVX_512bit, "required");
2695     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2696            "XMM register should be 0-15");
2697   }
2698 #endif // ASSERT
2699   switch (elem_bt) {
2700     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2701     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2702     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2703     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2704     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2705     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2706     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2707   }
2708 }
2709 
2710 #ifdef _LP64
2711 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2712   assert(UseAVX >= 2, "required");
2713   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2714   bool is_vl = vlen_enc != Assembler::AVX_512bit;
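       // AVX-512 (with BW for byte/short elements and VL for sub-512-bit vectors)
       // can broadcast directly from a GPR; otherwise move the GPR into an XMM
       // register first and use the AVX2 register-to-register broadcast forms.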
2715   if ((UseAVX > 2) &&
2716       (!is_bw || VM_Version::supports_avx512bw()) &&
2717       (!is_vl || VM_Version::supports_avx512vl())) {
2718     switch (elem_bt) {
2719       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2720       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2721       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2722       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2723       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2724     }
2725   } else {
2726     assert(vlen_enc != Assembler::AVX_512bit, "required");
2727     assert((dst->encoding() < 16),"XMM register should be 0-15");
2728     switch (elem_bt) {
2729       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2730       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2731       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2732       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2733       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2734       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2735       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2736     }
2737   }
2738 }
2739 #endif
2740 
2741 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2742   switch (to_elem_bt) {
2743     case T_SHORT:
2744       vpmovsxbw(dst, src, vlen_enc);
2745       break;
2746     case T_INT:
2747       vpmovsxbd(dst, src, vlen_enc);
2748       break;
2749     case T_FLOAT:
2750       vpmovsxbd(dst, src, vlen_enc);
2751       vcvtdq2ps(dst, dst, vlen_enc);
2752       break;
2753     case T_LONG:
2754       vpmovsxbq(dst, src, vlen_enc);
2755       break;
2756     case T_DOUBLE: {
2757       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2758       vpmovsxbd(dst, src, mid_vlen_enc);
2759       vcvtdq2pd(dst, dst, vlen_enc);
2760       break;
2761     }
2762     default:
2763       fatal("Unsupported type %s", type2name(to_elem_bt));
2764       break;
2765   }
2766 }
2767 
2768 //-------------------------------------------------------------------------------------------
2769 
2770 // IndexOf for constant substrings with size >= 8 chars
2771 // which don't need to be loaded through the stack.
2772 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2773                                          Register cnt1, Register cnt2,
2774                                          int int_cnt2,  Register result,
2775                                          XMMRegister vec, Register tmp,
2776                                          int ae) {
2777   ShortBranchVerifier sbv(this);
2778   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2779   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2780 
2781   // This method uses the pcmpestri instruction with bound registers
2782   //   inputs:
2783   //     xmm - substring
2784   //     rax - substring length (elements count)
2785   //     mem - scanned string
2786   //     rdx - string length (elements count)
2787   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2788   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2789   //   outputs:
2790   //     rcx - matched index in string
2791   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2792   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2793   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2794   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2795   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
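       // A sketch of the pcmpestri imm8 encoding and flag outputs this code relies on
       // (see the SSE4.2 documentation; not an exhaustive summary):
       //   bits [1:0] element format : 00 = unsigned bytes, 01 = unsigned words
       //   bits [3:2] aggregation    : 11 = "equal ordered", i.e. substring search
       //   bits [6:4] polarity/index : 0  = positive polarity, least significant index
       // Afterwards CF is set when some match candidate was found, OF is set when a
       // candidate starts at element 0 (the whole needle matched there), and rcx holds
       // the index of the first candidate.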
2796 
2797   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2798         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2799         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2800 
2801   // Note, inline_string_indexOf() generates checks:
2802   // if (substr.count > string.count) return -1;
2803   // if (substr.count == 0) return 0;
2804   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2805 
2806   // Load substring.
2807   if (ae == StrIntrinsicNode::UL) {
2808     pmovzxbw(vec, Address(str2, 0));
2809   } else {
2810     movdqu(vec, Address(str2, 0));
2811   }
2812   movl(cnt2, int_cnt2);
2813   movptr(result, str1); // string addr
2814 
2815   if (int_cnt2 > stride) {
2816     jmpb(SCAN_TO_SUBSTR);
2817 
2818     // Reload substr for rescan, this code
2819     // is executed only for large substrings (> 8 chars)
2820     bind(RELOAD_SUBSTR);
2821     if (ae == StrIntrinsicNode::UL) {
2822       pmovzxbw(vec, Address(str2, 0));
2823     } else {
2824       movdqu(vec, Address(str2, 0));
2825     }
2826     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2827 
2828     bind(RELOAD_STR);
2829     // We came here after the beginning of the substring was
2830     // matched but the rest of it was not so we need to search
2831     // again. Start from the next element after the previous match.
2832 
2833     // cnt2 is the number of remaining substring elements and
2834     // cnt1 is the number of remaining string elements when the compare failed.
2835     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2836     subl(cnt1, cnt2);
2837     addl(cnt1, int_cnt2);
2838     movl(cnt2, int_cnt2); // Now restore cnt2
2839 
2840     decrementl(cnt1);     // Shift to next element
2841     cmpl(cnt1, cnt2);
2842     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2843 
2844     addptr(result, (1<<scale1));
2845 
2846   } // (int_cnt2 > 8)
2847 
2848   // Scan string for start of substr in 16-byte vectors
2849   bind(SCAN_TO_SUBSTR);
2850   pcmpestri(vec, Address(result, 0), mode);
2851   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2852   subl(cnt1, stride);
2853   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2854   cmpl(cnt1, cnt2);
2855   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2856   addptr(result, 16);
2857   jmpb(SCAN_TO_SUBSTR);
2858 
2859   // Found a potential substr
2860   bind(FOUND_CANDIDATE);
2861   // Matched whole vector if first element matched (tmp(rcx) == 0).
2862   if (int_cnt2 == stride) {
2863     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2864   } else { // int_cnt2 > 8
2865     jccb(Assembler::overflow, FOUND_SUBSTR);
2866   }
2867   // After pcmpestri tmp(rcx) contains matched element index
2868   // Compute start addr of substr
2869   lea(result, Address(result, tmp, scale1));
2870 
2871   // Make sure string is still long enough
2872   subl(cnt1, tmp);
2873   cmpl(cnt1, cnt2);
2874   if (int_cnt2 == stride) {
2875     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2876   } else { // int_cnt2 > 8
2877     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2878   }
2879   // Left less than substring.
2880 
2881   bind(RET_NOT_FOUND);
2882   movl(result, -1);
2883   jmp(EXIT);
2884 
2885   if (int_cnt2 > stride) {
2886     // This code is optimized for the case when the whole substring
2887     // is matched once its head is matched.
2888     bind(MATCH_SUBSTR_HEAD);
2889     pcmpestri(vec, Address(result, 0), mode);
2890     // Reload only the string if it does not match
2891     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2892 
2893     Label CONT_SCAN_SUBSTR;
2894     // Compare the rest of substring (> 8 chars).
2895     bind(FOUND_SUBSTR);
2896     // First 8 chars are already matched.
2897     negptr(cnt2);
2898     addptr(cnt2, stride);
2899 
2900     bind(SCAN_SUBSTR);
2901     subl(cnt1, stride);
2902     cmpl(cnt2, -stride); // Do not read beyond substring
2903     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2904     // Back-up strings to avoid reading beyond substring:
2905     // cnt1 = cnt1 - cnt2 + 8
2906     addl(cnt1, cnt2); // cnt2 is negative
2907     addl(cnt1, stride);
2908     movl(cnt2, stride); negptr(cnt2);
2909     bind(CONT_SCAN_SUBSTR);
2910     if (int_cnt2 < (int)G) {
2911       int tail_off1 = int_cnt2<<scale1;
2912       int tail_off2 = int_cnt2<<scale2;
2913       if (ae == StrIntrinsicNode::UL) {
2914         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2915       } else {
2916         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2917       }
2918       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2919     } else {
2920       // calculate index in register to avoid integer overflow (int_cnt2*2)
2921       movl(tmp, int_cnt2);
2922       addptr(tmp, cnt2);
2923       if (ae == StrIntrinsicNode::UL) {
2924         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2925       } else {
2926         movdqu(vec, Address(str2, tmp, scale2, 0));
2927       }
2928       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2929     }
2930     // Need to reload string pointers if we did not match the whole vector
2931     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2932     addptr(cnt2, stride);
2933     jcc(Assembler::negative, SCAN_SUBSTR);
2934     // Fall through if found full substring
2935 
2936   } // (int_cnt2 > 8)
2937 
2938   bind(RET_FOUND);
2939   // Found result if we matched full small substring.
2940   // Compute substr offset
2941   subptr(result, str1);
2942   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2943     shrl(result, 1); // index
2944   }
2945   bind(EXIT);
2946 
2947 } // string_indexofC8
2948 
2949 // Small strings are loaded through the stack if they cross a page boundary.
2950 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2951                                        Register cnt1, Register cnt2,
2952                                        int int_cnt2,  Register result,
2953                                        XMMRegister vec, Register tmp,
2954                                        int ae) {
2955   ShortBranchVerifier sbv(this);
2956   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2957   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2958 
2959   //
2960   // int_cnt2 is the length of a small (< 8 chars) constant substring
2961   // or -1 for a non-constant substring, in which case its length
2962   // is in the cnt2 register.
2963   //
2964   // Note, inline_string_indexOf() generates checks:
2965   // if (substr.count > string.count) return -1;
2966   // if (substr.count == 0) return 0;
2967   //
2968   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2969   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2970   // This method uses the pcmpestri instruction with bound registers
2971   //   inputs:
2972   //     xmm - substring
2973   //     rax - substring length (elements count)
2974   //     mem - scanned string
2975   //     rdx - string length (elements count)
2976   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2977   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2978   //   outputs:
2979   //     rcx - matched index in string
2980   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2981   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2982   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2983   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2984 
2985   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2986         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2987         FOUND_CANDIDATE;
2988 
2989   { //========================================================
2990     // We don't know where these strings are located
2991     // and we can't read beyond them. Load them through stack.
2992     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2993 
2994     movptr(tmp, rsp); // save old SP
2995 
2996     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2997       if (int_cnt2 == (1>>scale2)) { // One byte
2998         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2999         load_unsigned_byte(result, Address(str2, 0));
3000         movdl(vec, result); // move 32 bits
3001       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3002         // Not enough header space in 32-bit VM: 12+3 = 15.
3003         movl(result, Address(str2, -1));
3004         shrl(result, 8);
3005         movdl(vec, result); // move 32 bits
3006       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3007         load_unsigned_short(result, Address(str2, 0));
3008         movdl(vec, result); // move 32 bits
3009       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3010         movdl(vec, Address(str2, 0)); // move 32 bits
3011       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3012         movq(vec, Address(str2, 0));  // move 64 bits
3013       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3014         // Array header size is 12 bytes in 32-bit VM
3015         // + 6 bytes for 3 chars == 18 bytes,
3016         // enough space to load vec and shift.
3017         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3018         if (ae == StrIntrinsicNode::UL) {
3019           int tail_off = int_cnt2-8;
3020           pmovzxbw(vec, Address(str2, tail_off));
3021           psrldq(vec, -2*tail_off);
3022         }
3023         else {
3024           int tail_off = int_cnt2*(1<<scale2);
3025           movdqu(vec, Address(str2, tail_off-16));
3026           psrldq(vec, 16-tail_off);
3027         }
3028       }
3029     } else { // not constant substring
3030       cmpl(cnt2, stride);
3031       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3032 
3033       // We can read beyond the string if str+16 does not cross a page boundary
3034       // since heaps are aligned and mapped by pages.
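           // Concretely the check below is (str2 & (page_size - 1)) <= page_size - 16,
           // i.e. a 16-byte load starting at str2 stays inside its page. For example,
           // with 4K pages an address whose low 12 bits are 0xff0 still passes, while
           // 0xff1 fails and the substring is copied to the stack instead.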
3035       assert(os::vm_page_size() < (int)G, "default page should be small");
3036       movl(result, str2); // We need only low 32 bits
3037       andl(result, ((int)os::vm_page_size()-1));
3038       cmpl(result, ((int)os::vm_page_size()-16));
3039       jccb(Assembler::belowEqual, CHECK_STR);
3040 
3041       // Move small strings to the stack to allow loading 16 bytes into vec.
3042       subptr(rsp, 16);
3043       int stk_offset = wordSize-(1<<scale2);
3044       push(cnt2);
3045 
3046       bind(COPY_SUBSTR);
3047       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3048         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3049         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3050       } else if (ae == StrIntrinsicNode::UU) {
3051         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3052         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3053       }
3054       decrement(cnt2);
3055       jccb(Assembler::notZero, COPY_SUBSTR);
3056 
3057       pop(cnt2);
3058       movptr(str2, rsp);  // New substring address
3059     } // non constant
3060 
3061     bind(CHECK_STR);
3062     cmpl(cnt1, stride);
3063     jccb(Assembler::aboveEqual, BIG_STRINGS);
3064 
3065     // Check cross page boundary.
3066     movl(result, str1); // We need only low 32 bits
3067     andl(result, ((int)os::vm_page_size()-1));
3068     cmpl(result, ((int)os::vm_page_size()-16));
3069     jccb(Assembler::belowEqual, BIG_STRINGS);
3070 
3071     subptr(rsp, 16);
3072     int stk_offset = -(1<<scale1);
3073     if (int_cnt2 < 0) { // not constant
3074       push(cnt2);
3075       stk_offset += wordSize;
3076     }
3077     movl(cnt2, cnt1);
3078 
3079     bind(COPY_STR);
3080     if (ae == StrIntrinsicNode::LL) {
3081       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3082       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3083     } else {
3084       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3085       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3086     }
3087     decrement(cnt2);
3088     jccb(Assembler::notZero, COPY_STR);
3089 
3090     if (int_cnt2 < 0) { // not constant
3091       pop(cnt2);
3092     }
3093     movptr(str1, rsp);  // New string address
3094 
3095     bind(BIG_STRINGS);
3096     // Load substring.
3097     if (int_cnt2 < 0) { // -1
3098       if (ae == StrIntrinsicNode::UL) {
3099         pmovzxbw(vec, Address(str2, 0));
3100       } else {
3101         movdqu(vec, Address(str2, 0));
3102       }
3103       push(cnt2);       // substr count
3104       push(str2);       // substr addr
3105       push(str1);       // string addr
3106     } else {
3107       // Small (< 8 chars) constant substrings are loaded already.
3108       movl(cnt2, int_cnt2);
3109     }
3110     push(tmp);  // original SP
3111 
3112   } // Finished loading
3113 
3114   //========================================================
3115   // Start search
3116   //
3117 
3118   movptr(result, str1); // string addr
3119 
3120   if (int_cnt2  < 0) {  // Only for non constant substring
3121     jmpb(SCAN_TO_SUBSTR);
3122 
3123     // SP saved at sp+0
3124     // String saved at sp+1*wordSize
3125     // Substr saved at sp+2*wordSize
3126     // Substr count saved at sp+3*wordSize
3127 
3128     // Reload substr for rescan, this code
3129     // is executed only for large substrings (> 8 chars)
3130     bind(RELOAD_SUBSTR);
3131     movptr(str2, Address(rsp, 2*wordSize));
3132     movl(cnt2, Address(rsp, 3*wordSize));
3133     if (ae == StrIntrinsicNode::UL) {
3134       pmovzxbw(vec, Address(str2, 0));
3135     } else {
3136       movdqu(vec, Address(str2, 0));
3137     }
3138     // We came here after the beginning of the substring was
3139     // matched but the rest of it was not so we need to search
3140     // again. Start from the next element after the previous match.
3141     subptr(str1, result); // Restore counter
3142     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3143       shrl(str1, 1);
3144     }
3145     addl(cnt1, str1);
3146     decrementl(cnt1);   // Shift to next element
3147     cmpl(cnt1, cnt2);
3148     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3149 
3150     addptr(result, (1<<scale1));
3151   } // non constant
3152 
3153   // Scan string for start of substr in 16-byte vectors
3154   bind(SCAN_TO_SUBSTR);
3155   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3156   pcmpestri(vec, Address(result, 0), mode);
3157   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3158   subl(cnt1, stride);
3159   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3160   cmpl(cnt1, cnt2);
3161   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3162   addptr(result, 16);
3163 
3164   bind(ADJUST_STR);
3165   cmpl(cnt1, stride); // Do not read beyond string
3166   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3167   // Back-up string to avoid reading beyond string.
3168   lea(result, Address(result, cnt1, scale1, -16));
3169   movl(cnt1, stride);
3170   jmpb(SCAN_TO_SUBSTR);
3171 
3172   // Found a potential substr
3173   bind(FOUND_CANDIDATE);
3174   // After pcmpestri tmp(rcx) contains matched element index
3175 
3176   // Make sure string is still long enough
3177   subl(cnt1, tmp);
3178   cmpl(cnt1, cnt2);
3179   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3180   // Left less than substring.
3181 
3182   bind(RET_NOT_FOUND);
3183   movl(result, -1);
3184   jmp(CLEANUP);
3185 
3186   bind(FOUND_SUBSTR);
3187   // Compute start addr of substr
3188   lea(result, Address(result, tmp, scale1));
3189   if (int_cnt2 > 0) { // Constant substring
3190     // Repeat search for small substring (< 8 chars)
3191     // from new point without reloading substring.
3192     // Have to check that we don't read beyond string.
3193     cmpl(tmp, stride-int_cnt2);
3194     jccb(Assembler::greater, ADJUST_STR);
3195     // Fall through if matched whole substring.
3196   } else { // non constant
3197     assert(int_cnt2 == -1, "should be != 0");
3198 
3199     addl(tmp, cnt2);
3200     // Found result if we matched whole substring.
3201     cmpl(tmp, stride);
3202     jcc(Assembler::lessEqual, RET_FOUND);
3203 
3204     // Repeat search for small substring (<= 8 chars)
3205     // from new point 'str1' without reloading substring.
3206     cmpl(cnt2, stride);
3207     // Have to check that we don't read beyond string.
3208     jccb(Assembler::lessEqual, ADJUST_STR);
3209 
3210     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3211     // Compare the rest of substring (> 8 chars).
3212     movptr(str1, result);
3213 
3214     cmpl(tmp, cnt2);
3215     // First 8 chars are already matched.
3216     jccb(Assembler::equal, CHECK_NEXT);
3217 
3218     bind(SCAN_SUBSTR);
3219     pcmpestri(vec, Address(str1, 0), mode);
3220     // Need to reload string pointers if we did not match the whole vector
3221     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3222 
3223     bind(CHECK_NEXT);
3224     subl(cnt2, stride);
3225     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3226     addptr(str1, 16);
3227     if (ae == StrIntrinsicNode::UL) {
3228       addptr(str2, 8);
3229     } else {
3230       addptr(str2, 16);
3231     }
3232     subl(cnt1, stride);
3233     cmpl(cnt2, stride); // Do not read beyond substring
3234     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3235     // Back-up strings to avoid reading beyond substring.
3236 
3237     if (ae == StrIntrinsicNode::UL) {
3238       lea(str2, Address(str2, cnt2, scale2, -8));
3239       lea(str1, Address(str1, cnt2, scale1, -16));
3240     } else {
3241       lea(str2, Address(str2, cnt2, scale2, -16));
3242       lea(str1, Address(str1, cnt2, scale1, -16));
3243     }
3244     subl(cnt1, cnt2);
3245     movl(cnt2, stride);
3246     addl(cnt1, stride);
3247     bind(CONT_SCAN_SUBSTR);
3248     if (ae == StrIntrinsicNode::UL) {
3249       pmovzxbw(vec, Address(str2, 0));
3250     } else {
3251       movdqu(vec, Address(str2, 0));
3252     }
3253     jmp(SCAN_SUBSTR);
3254 
3255     bind(RET_FOUND_LONG);
3256     movptr(str1, Address(rsp, wordSize));
3257   } // non constant
3258 
3259   bind(RET_FOUND);
3260   // Compute substr offset
3261   subptr(result, str1);
3262   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3263     shrl(result, 1); // index
3264   }
3265   bind(CLEANUP);
3266   pop(rsp); // restore SP
3267 
3268 } // string_indexof
3269 
3270 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3271                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3272   ShortBranchVerifier sbv(this);
3273   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3274 
3275   int stride = 8;
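       // A rough scalar sketch of what this intrinsic computes for a UTF-16 string
       // (result is the char index of the first occurrence, or -1):
       //
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;
       //
       // The code below scans 16 chars per AVX2 iteration, then 8 chars per SSE
       // iteration, and finishes the tail with a scalar loop.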
3276 
3277   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3278         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3279         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3280         FOUND_SEQ_CHAR, DONE_LABEL;
3281 
3282   movptr(result, str1);
3283   if (UseAVX >= 2) {
3284     cmpl(cnt1, stride);
3285     jcc(Assembler::less, SCAN_TO_CHAR);
3286     cmpl(cnt1, 2*stride);
3287     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3288     movdl(vec1, ch);
3289     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3290     vpxor(vec2, vec2);
3291     movl(tmp, cnt1);
3292     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3293     andl(cnt1,0x0000000F);  //tail count (in chars)
3294 
3295     bind(SCAN_TO_16_CHAR_LOOP);
3296     vmovdqu(vec3, Address(result, 0));
3297     vpcmpeqw(vec3, vec3, vec1, 1);
3298     vptest(vec2, vec3);
3299     jcc(Assembler::carryClear, FOUND_CHAR);
3300     addptr(result, 32);
3301     subl(tmp, 2*stride);
3302     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3303     jmp(SCAN_TO_8_CHAR);
3304     bind(SCAN_TO_8_CHAR_INIT);
3305     movdl(vec1, ch);
3306     pshuflw(vec1, vec1, 0x00);
3307     pshufd(vec1, vec1, 0);
3308     pxor(vec2, vec2);
3309   }
3310   bind(SCAN_TO_8_CHAR);
3311   cmpl(cnt1, stride);
3312   jcc(Assembler::less, SCAN_TO_CHAR);
3313   if (UseAVX < 2) {
3314     movdl(vec1, ch);
3315     pshuflw(vec1, vec1, 0x00);
3316     pshufd(vec1, vec1, 0);
3317     pxor(vec2, vec2);
3318   }
3319   movl(tmp, cnt1);
3320   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3321   andl(cnt1,0x00000007);  //tail count (in chars)
3322 
3323   bind(SCAN_TO_8_CHAR_LOOP);
3324   movdqu(vec3, Address(result, 0));
3325   pcmpeqw(vec3, vec1);
3326   ptest(vec2, vec3);
3327   jcc(Assembler::carryClear, FOUND_CHAR);
3328   addptr(result, 16);
3329   subl(tmp, stride);
3330   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3331   bind(SCAN_TO_CHAR);
3332   testl(cnt1, cnt1);
3333   jcc(Assembler::zero, RET_NOT_FOUND);
3334   bind(SCAN_TO_CHAR_LOOP);
3335   load_unsigned_short(tmp, Address(result, 0));
3336   cmpl(ch, tmp);
3337   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3338   addptr(result, 2);
3339   subl(cnt1, 1);
3340   jccb(Assembler::zero, RET_NOT_FOUND);
3341   jmp(SCAN_TO_CHAR_LOOP);
3342 
3343   bind(RET_NOT_FOUND);
3344   movl(result, -1);
3345   jmpb(DONE_LABEL);
3346 
3347   bind(FOUND_CHAR);
3348   if (UseAVX >= 2) {
3349     vpmovmskb(tmp, vec3);
3350   } else {
3351     pmovmskb(tmp, vec3);
3352   }
3353   bsfl(ch, tmp);
3354   addptr(result, ch);
3355 
3356   bind(FOUND_SEQ_CHAR);
3357   subptr(result, str1);
3358   shrl(result, 1);
3359 
3360   bind(DONE_LABEL);
3361 } // string_indexof_char
3362 
3363 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3364                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3365   ShortBranchVerifier sbv(this);
3366   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3367 
3368   int stride = 16;
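       // Latin-1 (byte array) variant of the char search above; a rough scalar sketch:
       //
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;
       //
       // Here 32 bytes are scanned per AVX2 iteration and 16 bytes per SSE iteration.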
3369 
3370   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3371         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3372         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3373         FOUND_SEQ_CHAR, DONE_LABEL;
3374 
3375   movptr(result, str1);
3376   if (UseAVX >= 2) {
3377     cmpl(cnt1, stride);
3378     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3379     cmpl(cnt1, stride*2);
3380     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3381     movdl(vec1, ch);
3382     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3383     vpxor(vec2, vec2);
3384     movl(tmp, cnt1);
3385     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3386     andl(cnt1,0x0000001F);  //tail count (in chars)
3387 
3388     bind(SCAN_TO_32_CHAR_LOOP);
3389     vmovdqu(vec3, Address(result, 0));
3390     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3391     vptest(vec2, vec3);
3392     jcc(Assembler::carryClear, FOUND_CHAR);
3393     addptr(result, 32);
3394     subl(tmp, stride*2);
3395     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3396     jmp(SCAN_TO_16_CHAR);
3397 
3398     bind(SCAN_TO_16_CHAR_INIT);
3399     movdl(vec1, ch);
3400     pxor(vec2, vec2);
3401     pshufb(vec1, vec2);
3402   }
3403 
3404   bind(SCAN_TO_16_CHAR);
3405   cmpl(cnt1, stride);
3406   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3407   if (UseAVX < 2) {
3408     movdl(vec1, ch);
3409     pxor(vec2, vec2);
3410     pshufb(vec1, vec2);
3411   }
3412   movl(tmp, cnt1);
3413   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3414   andl(cnt1,0x0000000F);  //tail count (in bytes)
3415 
3416   bind(SCAN_TO_16_CHAR_LOOP);
3417   movdqu(vec3, Address(result, 0));
3418   pcmpeqb(vec3, vec1);
3419   ptest(vec2, vec3);
3420   jcc(Assembler::carryClear, FOUND_CHAR);
3421   addptr(result, 16);
3422   subl(tmp, stride);
3423   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3424 
3425   bind(SCAN_TO_CHAR_INIT);
3426   testl(cnt1, cnt1);
3427   jcc(Assembler::zero, RET_NOT_FOUND);
3428   bind(SCAN_TO_CHAR_LOOP);
3429   load_unsigned_byte(tmp, Address(result, 0));
3430   cmpl(ch, tmp);
3431   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3432   addptr(result, 1);
3433   subl(cnt1, 1);
3434   jccb(Assembler::zero, RET_NOT_FOUND);
3435   jmp(SCAN_TO_CHAR_LOOP);
3436 
3437   bind(RET_NOT_FOUND);
3438   movl(result, -1);
3439   jmpb(DONE_LABEL);
3440 
3441   bind(FOUND_CHAR);
3442   if (UseAVX >= 2) {
3443     vpmovmskb(tmp, vec3);
3444   } else {
3445     pmovmskb(tmp, vec3);
3446   }
3447   bsfl(ch, tmp);
3448   addptr(result, ch);
3449 
3450   bind(FOUND_SEQ_CHAR);
3451   subptr(result, str1);
3452 
3453   bind(DONE_LABEL);
3454 } // stringL_indexof_char
3455 
3456 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3457   switch (eltype) {
3458   case T_BOOLEAN: return sizeof(jboolean);
3459   case T_BYTE:  return sizeof(jbyte);
3460   case T_SHORT: return sizeof(jshort);
3461   case T_CHAR:  return sizeof(jchar);
3462   case T_INT:   return sizeof(jint);
3463   default:
3464     ShouldNotReachHere();
3465     return -1;
3466   }
3467 }
3468 
3469 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3470   switch (eltype) {
3471   // T_BOOLEAN used as surrogate for unsigned byte
3472   case T_BOOLEAN: movzbl(dst, src);   break;
3473   case T_BYTE:    movsbl(dst, src);   break;
3474   case T_SHORT:   movswl(dst, src);   break;
3475   case T_CHAR:    movzwl(dst, src);   break;
3476   case T_INT:     movl(dst, src);     break;
3477   default:
3478     ShouldNotReachHere();
3479   }
3480 }
3481 
3482 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3483   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3484 }
3485 
3486 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3487   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3488 }
3489 
3490 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3491   const int vlen = Assembler::AVX_256bit;
3492   switch (eltype) {
3493   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3494   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3495   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3496   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3497   case T_INT:
3498     // do nothing
3499     break;
3500   default:
3501     ShouldNotReachHere();
3502   }
3503 }
3504 
3505 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3506                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3507                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3508                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3509                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3510                                         BasicType eltype) {
3511   ShortBranchVerifier sbv(this);
3512   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3513   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3514   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3515 
3516   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3517         SHORT_UNROLLED_LOOP_EXIT,
3518         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3519         UNROLLED_VECTOR_LOOP_BEGIN,
3520         END;
3521   switch (eltype) {
3522   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3523   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3524   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3525   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3526   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3527   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3528   }
3529 
3530   // "Renaming" for readability of the code
3531   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3532                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3533                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3534 
3535   const int elsize = arrays_hashcode_elsize(eltype);
3536 
3537   /*
3538     if (cnt1 >= 2) {
3539       if (cnt1 >= 32) {
3540         UNROLLED VECTOR LOOP
3541       }
3542       UNROLLED SCALAR LOOP
3543     }
3544     SINGLE SCALAR
3545    */
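       // For reference, a scalar Java-style sketch of the hash being computed
       // (with result carrying the incoming initial value):
       //
       //   for (int i = 0; i < cnt1; i++) {
       //     result = 31 * result + ary1[i];
       //   }
       //
       // The vector loop below consumes 32 elements per iteration by keeping four
       // 8-lane partial results and folding in the precomputed powers of 31 at the end.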
3546 
3547   cmpl(cnt1, 32);
3548   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3549 
3550   // cnt1 >= 32 && generate_vectorized_loop
3551   xorl(index, index);
3552 
3553   // vresult = IntVector.zero(I256);
3554   for (int idx = 0; idx < 4; idx++) {
3555     vpxor(vresult[idx], vresult[idx]);
3556   }
3557   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3558   Register bound = tmp2;
3559   Register next = tmp3;
3560   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3561   movl(next, Address(tmp2, 0));
3562   movdl(vnext, next);
3563   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3564 
3565   // index = 0;
3566   // bound = cnt1 & ~(32 - 1);
3567   movl(bound, cnt1);
3568   andl(bound, ~(32 - 1));
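       // e.g. cnt1 == 70 gives bound == 64: the vector loop consumes 64 elements and
       // the remaining 6 are handled by the scalar code after SHORT_UNROLLED_BEGIN.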
3569   // for (; index < bound; index += 32) {
3570   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3571   // result *= next;
3572   imull(result, next);
3573   // Loop fission to front-load the cost of fetching from memory; out-of-order
3574   // execution can then hopefully do a better job of prefetching.
3575   for (int idx = 0; idx < 4; idx++) {
3576     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3577   }
3578   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3579   for (int idx = 0; idx < 4; idx++) {
3580     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3581     arrays_hashcode_elvcast(vtmp[idx], eltype);
3582     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3583   }
3584   // index += 32;
3585   addl(index, 32);
3586   // index < bound;
3587   cmpl(index, bound);
3588   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3589   // }
3590 
3591   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3592   subl(cnt1, bound);
3593   // release bound
3594 
3595   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3596   for (int idx = 0; idx < 4; idx++) {
3597     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3598     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3599     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3600   }
3601   // result += vresult.reduceLanes(ADD);
3602   for (int idx = 0; idx < 4; idx++) {
3603     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3604   }
3605 
3606   // } else if (cnt1 < 32) {
3607 
3608   bind(SHORT_UNROLLED_BEGIN);
3609   // int i = 1;
3610   movl(index, 1);
3611   cmpl(index, cnt1);
3612   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3613 
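       // The unrolled loop below folds two elements per iteration:
       //   result = 31*31*result + 31*ary1[i-1] + ary1[i]
       // using 961 == 31*31 and (x << 5) - x == 31*x.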
3614   // for (; i < cnt1 ; i += 2) {
3615   bind(SHORT_UNROLLED_LOOP_BEGIN);
3616   movl(tmp3, 961);
3617   imull(result, tmp3);
3618   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3619   movl(tmp3, tmp2);
3620   shll(tmp3, 5);
3621   subl(tmp3, tmp2);
3622   addl(result, tmp3);
3623   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3624   addl(result, tmp3);
3625   addl(index, 2);
3626   cmpl(index, cnt1);
3627   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3628 
3629   // }
3630   // if (i >= cnt1) {
3631   bind(SHORT_UNROLLED_LOOP_EXIT);
3632   jccb(Assembler::greater, END);
3633   movl(tmp2, result);
3634   shll(result, 5);
3635   subl(result, tmp2);
3636   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3637   addl(result, tmp3);
3638   // }
3639   bind(END);
3640 
3641   BLOCK_COMMENT("} // arrays_hashcode");
3642 
3643 } // arrays_hashcode
3644 
3645 // helper function for string_compare
3646 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3647                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3648                                            Address::ScaleFactor scale2, Register index, int ae) {
3649   if (ae == StrIntrinsicNode::LL) {
3650     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3651     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3652   } else if (ae == StrIntrinsicNode::UU) {
3653     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3654     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3655   } else {
3656     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3657     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3658   }
3659 }
3660 
3661 // Compare strings, used for char[] and byte[].
3662 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3663                                        Register cnt1, Register cnt2, Register result,
3664                                        XMMRegister vec1, int ae, KRegister mask) {
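       // A rough Java-style sketch of the contract (same sign convention as
       // String.compareTo); encoding-specific scaling and the UL negation at the end
       // keep the sign consistent for mixed encodings:
       //
       //   int min = Math.min(cnt1, cnt2);
       //   for (int i = 0; i < min; i++) {
       //     if (str1[i] != str2[i]) return str1[i] - str2[i];
       //   }
       //   return cnt1 - cnt2;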
3665   ShortBranchVerifier sbv(this);
3666   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3667   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3668   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3669   int stride2x2 = 0x40;
3670   Address::ScaleFactor scale = Address::no_scale;
3671   Address::ScaleFactor scale1 = Address::no_scale;
3672   Address::ScaleFactor scale2 = Address::no_scale;
3673 
3674   if (ae != StrIntrinsicNode::LL) {
3675     stride2x2 = 0x20;
3676   }
3677 
3678   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3679     shrl(cnt2, 1);
3680   }
3681   // Compute the minimum of the string lengths and the
3682   // difference of the string lengths (stack).
3683   // Do the conditional move stuff
3684   movl(result, cnt1);
3685   subl(cnt1, cnt2);
3686   push(cnt1);
3687   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3688 
3689   // Is the minimum length zero?
3690   testl(cnt2, cnt2);
3691   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3692   if (ae == StrIntrinsicNode::LL) {
3693     // Load first bytes
3694     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3695     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3696   } else if (ae == StrIntrinsicNode::UU) {
3697     // Load first characters
3698     load_unsigned_short(result, Address(str1, 0));
3699     load_unsigned_short(cnt1, Address(str2, 0));
3700   } else {
3701     load_unsigned_byte(result, Address(str1, 0));
3702     load_unsigned_short(cnt1, Address(str2, 0));
3703   }
3704   subl(result, cnt1);
3705   jcc(Assembler::notZero,  POP_LABEL);
3706 
3707   if (ae == StrIntrinsicNode::UU) {
3708     // Divide length by 2 to get number of chars
3709     shrl(cnt2, 1);
3710   }
3711   cmpl(cnt2, 1);
3712   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3713 
3714   // Check if the strings start at the same location and setup scale and stride
3715   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716     cmpptr(str1, str2);
3717     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3718     if (ae == StrIntrinsicNode::LL) {
3719       scale = Address::times_1;
3720       stride = 16;
3721     } else {
3722       scale = Address::times_2;
3723       stride = 8;
3724     }
3725   } else {
3726     scale1 = Address::times_1;
3727     scale2 = Address::times_2;
3728     // scale not used
3729     stride = 8;
3730   }
3731 
3732   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3733     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3734     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3735     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3736     Label COMPARE_TAIL_LONG;
3737     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3738 
3739     int pcmpmask = 0x19;
3740     if (ae == StrIntrinsicNode::LL) {
3741       pcmpmask &= ~0x01;
3742     }
3743 
3744     // Set up to compare 16-char (32-byte) vectors,
3745     // starting from the first character again because it has an aligned address.
3746     if (ae == StrIntrinsicNode::LL) {
3747       stride2 = 32;
3748     } else {
3749       stride2 = 16;
3750     }
3751     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3752       adr_stride = stride << scale;
3753     } else {
3754       adr_stride1 = 8;  //stride << scale1;
3755       adr_stride2 = 16; //stride << scale2;
3756     }
3757 
3758     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3759     // rax and rdx are used by pcmpestri as element counters
3760     movl(result, cnt2);
3761     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3762     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3763 
3764     // fast path : compare first 2 8-char vectors.
3765     bind(COMPARE_16_CHARS);
3766     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3767       movdqu(vec1, Address(str1, 0));
3768     } else {
3769       pmovzxbw(vec1, Address(str1, 0));
3770     }
3771     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3772     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3773 
3774     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3775       movdqu(vec1, Address(str1, adr_stride));
3776       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3777     } else {
3778       pmovzxbw(vec1, Address(str1, adr_stride1));
3779       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3780     }
3781     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3782     addl(cnt1, stride);
3783 
3784     // Compare the characters at index in cnt1
3785     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3786     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3787     subl(result, cnt2);
3788     jmp(POP_LABEL);
3789 
3790     // Setup the registers to start vector comparison loop
3791     bind(COMPARE_WIDE_VECTORS);
3792     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3793       lea(str1, Address(str1, result, scale));
3794       lea(str2, Address(str2, result, scale));
3795     } else {
3796       lea(str1, Address(str1, result, scale1));
3797       lea(str2, Address(str2, result, scale2));
3798     }
3799     subl(result, stride2);
3800     subl(cnt2, stride2);
3801     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3802     negptr(result);
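         // From here on result is a negative element offset from the pointers advanced
         // above, and the loops below count it up toward zero. For example, with 64
         // chars to compare and stride2 == 16 the AVX2 loop visits offsets -48, -32
         // and -16 (chars 16..63); the first 16 chars were handled by COMPARE_16_CHARS.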
3803 
3804     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3805     bind(COMPARE_WIDE_VECTORS_LOOP);
3806 
3807 #ifdef _LP64
3808     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3809       cmpl(cnt2, stride2x2);
3810       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3811       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3812       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3813 
3814       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3815       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3817         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3818       } else {
3819         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3820         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3821       }
3822       kortestql(mask, mask);
3823       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3824       addptr(result, stride2x2);  // update since we already compared at this addr
3825       subl(cnt2, stride2x2);      // and sub the size too
3826       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3827 
3828       vpxor(vec1, vec1);
3829       jmpb(COMPARE_WIDE_TAIL);
3830     }//if (VM_Version::supports_avx512vlbw())
3831 #endif // _LP64
3832 
3833 
3834     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3835     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3836       vmovdqu(vec1, Address(str1, result, scale));
3837       vpxor(vec1, Address(str2, result, scale));
3838     } else {
3839       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3840       vpxor(vec1, Address(str2, result, scale2));
3841     }
3842     vptest(vec1, vec1);
3843     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3844     addptr(result, stride2);
3845     subl(cnt2, stride2);
3846     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3847     // clean upper bits of YMM registers
3848     vpxor(vec1, vec1);
3849 
3850     // compare wide vectors tail
3851     bind(COMPARE_WIDE_TAIL);
3852     testptr(result, result);
3853     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3854 
3855     movl(result, stride2);
3856     movl(cnt2, result);
3857     negptr(result);
3858     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3859 
3860     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3861     bind(VECTOR_NOT_EQUAL);
3862     // clean upper bits of YMM registers
3863     vpxor(vec1, vec1);
3864     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3865       lea(str1, Address(str1, result, scale));
3866       lea(str2, Address(str2, result, scale));
3867     } else {
3868       lea(str1, Address(str1, result, scale1));
3869       lea(str2, Address(str2, result, scale2));
3870     }
3871     jmp(COMPARE_16_CHARS);
3872 
3873     // Compare tail chars, length between 1 and 15 chars
3874     bind(COMPARE_TAIL_LONG);
3875     movl(cnt2, result);
3876     cmpl(cnt2, stride);
3877     jcc(Assembler::less, COMPARE_SMALL_STR);
3878 
3879     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3880       movdqu(vec1, Address(str1, 0));
3881     } else {
3882       pmovzxbw(vec1, Address(str1, 0));
3883     }
3884     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3885     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3886     subptr(cnt2, stride);
3887     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3888     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3889       lea(str1, Address(str1, result, scale));
3890       lea(str2, Address(str2, result, scale));
3891     } else {
3892       lea(str1, Address(str1, result, scale1));
3893       lea(str2, Address(str2, result, scale2));
3894     }
3895     negptr(cnt2);
3896     jmpb(WHILE_HEAD_LABEL);
3897 
3898     bind(COMPARE_SMALL_STR);
3899   } else if (UseSSE42Intrinsics) {
3900     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3901     int pcmpmask = 0x19;
3902     // Set up to compare 8-char (16-byte) vectors,
3903     // starting from the first character again because it has an aligned address.
3904     movl(result, cnt2);
3905     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3906     if (ae == StrIntrinsicNode::LL) {
3907       pcmpmask &= ~0x01;
3908     }
3909     jcc(Assembler::zero, COMPARE_TAIL);
3910     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3911       lea(str1, Address(str1, result, scale));
3912       lea(str2, Address(str2, result, scale));
3913     } else {
3914       lea(str1, Address(str1, result, scale1));
3915       lea(str2, Address(str2, result, scale2));
3916     }
3917     negptr(result);
3918 
3919     // pcmpestri
3920     //   inputs:
3921     //     vec1- substring
3922     //     rax - negative string length (elements count)
3923     //     mem - scanned string
3924     //     rdx - string length (elements count)
3925     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3926     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3927     //   outputs:
3928     //     rcx - first mismatched element index
3929     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3930 
3931     bind(COMPARE_WIDE_VECTORS);
3932     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3933       movdqu(vec1, Address(str1, result, scale));
3934       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3935     } else {
3936       pmovzxbw(vec1, Address(str1, result, scale1));
3937       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3938     }
3939     // After pcmpestri cnt1(rcx) contains mismatched element index
3940 
3941     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3942     addptr(result, stride);
3943     subptr(cnt2, stride);
3944     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3945 
3946     // compare wide vectors tail
3947     testptr(result, result);
3948     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3949 
3950     movl(cnt2, stride);
3951     movl(result, stride);
3952     negptr(result);
3953     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3954       movdqu(vec1, Address(str1, result, scale));
3955       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3956     } else {
3957       pmovzxbw(vec1, Address(str1, result, scale1));
3958       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3959     }
3960     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3961 
3962     // Mismatched characters in the vectors
3963     bind(VECTOR_NOT_EQUAL);
3964     addptr(cnt1, result);
3965     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3966     subl(result, cnt2);
3967     jmpb(POP_LABEL);
3968 
3969     bind(COMPARE_TAIL); // limit is zero
3970     movl(cnt2, result);
3971     // Fallthru to tail compare
3972   }
3973   // Shift str2 and str1 to the end of the arrays, negate min
3974   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3975     lea(str1, Address(str1, cnt2, scale));
3976     lea(str2, Address(str2, cnt2, scale));
3977   } else {
3978     lea(str1, Address(str1, cnt2, scale1));
3979     lea(str2, Address(str2, cnt2, scale2));
3980   }
3981   decrementl(cnt2);  // first character was compared already
3982   negptr(cnt2);
3983 
3984   // Compare the rest of the elements
3985   bind(WHILE_HEAD_LABEL);
3986   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3987   subl(result, cnt1);
3988   jccb(Assembler::notZero, POP_LABEL);
3989   increment(cnt2);
3990   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3991 
3992   // Strings are equal up to min length.  Return the length difference.
3993   bind(LENGTH_DIFF_LABEL);
3994   pop(result);
3995   if (ae == StrIntrinsicNode::UU) {
3996     // Divide diff by 2 to get number of chars
3997     sarl(result, 1);
3998   }
3999   jmpb(DONE_LABEL);
4000 
4001 #ifdef _LP64
4002   if (VM_Version::supports_avx512vlbw()) {
4003 
4004     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4005 
4006     kmovql(cnt1, mask);
4007     notq(cnt1);
4008     bsfq(cnt2, cnt1);
4009     if (ae != StrIntrinsicNode::LL) {
4010       // Divide diff by 2 to get number of chars
4011       sarl(cnt2, 1);
4012     }
4013     addq(result, cnt2);
4014     if (ae == StrIntrinsicNode::LL) {
4015       load_unsigned_byte(cnt1, Address(str2, result));
4016       load_unsigned_byte(result, Address(str1, result));
4017     } else if (ae == StrIntrinsicNode::UU) {
4018       load_unsigned_short(cnt1, Address(str2, result, scale));
4019       load_unsigned_short(result, Address(str1, result, scale));
4020     } else {
4021       load_unsigned_short(cnt1, Address(str2, result, scale2));
4022       load_unsigned_byte(result, Address(str1, result, scale1));
4023     }
4024     subl(result, cnt1);
4025     jmpb(POP_LABEL);
4026   }//if (VM_Version::supports_avx512vlbw())
4027 #endif // _LP64
4028 
4029   // Discard the stored length difference
4030   bind(POP_LABEL);
4031   pop(cnt1);
4032 
4033   // That's it
4034   bind(DONE_LABEL);
4035   if(ae == StrIntrinsicNode::UL) {
4036     negl(result);
4037   }
4038 
4039 }
4040 
4041 // Search for a non-ASCII character (negative byte value) in a byte array;
4042 // return the index of the first such character, otherwise the length
4043 // of the array segment searched.
4044 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4045 //   @IntrinsicCandidate
4046 //   public static int countPositives(byte[] ba, int off, int len) {
4047 //     for (int i = off; i < off + len; i++) {
4048 //       if (ba[i] < 0) {
4049 //         return i - off;
4050 //       }
4051 //     }
4052 //     return len;
4053 //   }
4054 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4055   Register result, Register tmp1,
4056   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4057   // rsi: byte array
4058   // rcx: len
4059   // rax: result
4060   ShortBranchVerifier sbv(this);
4061   assert_different_registers(ary1, len, result, tmp1);
4062   assert_different_registers(vec1, vec2);
4063   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4064 
4065   movl(result, len); // copy
4066   // len == 0
4067   testl(len, len);
4068   jcc(Assembler::zero, DONE);
4069 
4070   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4071     VM_Version::supports_avx512vlbw() &&
4072     VM_Version::supports_bmi2()) {
4073 
4074     Label test_64_loop, test_tail, BREAK_LOOP;
4075     movl(tmp1, len);
4076     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4077 
4078     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4079     andl(len,  0xffffffc0); // vector count (in chars)
4080     jccb(Assembler::zero, test_tail);
4081 
4082     lea(ary1, Address(ary1, len, Address::times_1));
4083     negptr(len);
4084 
4085     bind(test_64_loop);
4086     // Check whether our 64 elements of size byte contain negatives
4087     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4088     kortestql(mask1, mask1);
4089     jcc(Assembler::notZero, BREAK_LOOP);
4090 
4091     addptr(len, 64);
4092     jccb(Assembler::notZero, test_64_loop);
4093 
4094     bind(test_tail);
4095     // bail out when there is nothing to be done
4096     testl(tmp1, -1);
4097     jcc(Assembler::zero, DONE);
4098 
4099 
4100     // check the tail for absence of negatives
4101     // ~(~0 << len) applied up to two times (for 32-bit scenario)
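         // For example, tmp1 == 5 yields ~(~0 << 5) == 0x1f, a k-mask that limits the
         // masked tail compare below to just the 5 tail bytes.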
4102 #ifdef _LP64
4103     {
4104       Register tmp3_aliased = len;
4105       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4106       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4107       notq(tmp3_aliased);
4108       kmovql(mask2, tmp3_aliased);
4109     }
4110 #else
4111     Label k_init;
4112     jmp(k_init);
4113 
4114     // We cannot read 64 bits from a general purpose register, so the data required
4115     // to compose 64 1's is placed in the instruction stream instead.
4116     // We emit a 64-byte series of the values 0..63 which is later used as the
4117     // compare target against the tail count contained in the tmp1 register.
4118     // The result is a k register holding tmp1 consecutive 1 bits,
4119     // counting from the least significant bit.
4120     address tmp = pc();
4121     emit_int64(0x0706050403020100);
4122     emit_int64(0x0F0E0D0C0B0A0908);
4123     emit_int64(0x1716151413121110);
4124     emit_int64(0x1F1E1D1C1B1A1918);
4125     emit_int64(0x2726252423222120);
4126     emit_int64(0x2F2E2D2C2B2A2928);
4127     emit_int64(0x3736353433323130);
4128     emit_int64(0x3F3E3D3C3B3A3938);
4129 
4130     bind(k_init);
4131     lea(len, InternalAddress(tmp));
4132     // create mask to test for negative byte inside a vector
4133     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4134     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4135 
4136 #endif
4137     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4138     ktestq(mask1, mask2);
4139     jcc(Assembler::zero, DONE);
4140 
4141     // do a full check for negative bytes in the tail
4142     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4143                      // ary1 already pointing to the right place
4144     jmpb(TAIL_START);
4145 
4146     bind(BREAK_LOOP);
4147     // At least one byte in the last 64 byte block was negative.
4148     // Set up to look at the last 64 bytes as if they were a tail
4149     lea(ary1, Address(ary1, len, Address::times_1));
4150     addptr(result, len);
4151     // Ignore the very last byte: if all others are positive,
4152     // it must be negative, so we can skip right to the 2+1 byte
4153     // end comparison at this point
4154     orl(result, 63);
4155     movl(len, 63);
4156     // Fallthru to tail compare
4157   } else {
4158 
4159     if (UseAVX >= 2 && UseSSE >= 2) {
4160       // With AVX2, use 32-byte vector compare
4161       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4162 
4163       // Compare 32-byte vectors
4164       testl(len, 0xffffffe0);   // vector count (in bytes)
4165       jccb(Assembler::zero, TAIL_START);
4166 
4167       andl(len, 0xffffffe0);
4168       lea(ary1, Address(ary1, len, Address::times_1));
4169       negptr(len);
4170 
4171       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4172       movdl(vec2, tmp1);
4173       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4174 
4175       bind(COMPARE_WIDE_VECTORS);
4176       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4177       vptest(vec1, vec2);
4178       jccb(Assembler::notZero, BREAK_LOOP);
4179       addptr(len, 32);
4180       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4181 
4182       testl(result, 0x0000001f);   // any bytes remaining?
4183       jcc(Assembler::zero, DONE);
4184 
4185       // Quick test using the already prepared vector mask
4186       movl(len, result);
4187       andl(len, 0x0000001f);
4188       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4189       vptest(vec1, vec2);
4190       jcc(Assembler::zero, DONE);
4191       // There are zeros, jump to the tail to determine exactly where
4192       jmpb(TAIL_START);
4193 
4194       bind(BREAK_LOOP);
4195       // At least one byte in the last 32-byte vector is negative.
4196       // Set up to look at the last 32 bytes as if they were a tail
4197       lea(ary1, Address(ary1, len, Address::times_1));
4198       addptr(result, len);
4199       // Ignore the very last byte: if all others are positive,
4200       // it must be negative, so we can skip right to the 2+1 byte
4201       // end comparison at this point
4202       orl(result, 31);
4203       movl(len, 31);
4204       // Fallthru to tail compare
4205     } else if (UseSSE42Intrinsics) {
4206       // With SSE4.2, use double quad vector compare
4207       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4208 
4209       // Compare 16-byte vectors
4210       testl(len, 0xfffffff0);   // vector count (in bytes)
4211       jcc(Assembler::zero, TAIL_START);
4212 
4213       andl(len, 0xfffffff0);
4214       lea(ary1, Address(ary1, len, Address::times_1));
4215       negptr(len);
4216 
4217       movl(tmp1, 0x80808080);
4218       movdl(vec2, tmp1);
4219       pshufd(vec2, vec2, 0);
4220 
4221       bind(COMPARE_WIDE_VECTORS);
4222       movdqu(vec1, Address(ary1, len, Address::times_1));
4223       ptest(vec1, vec2);
4224       jccb(Assembler::notZero, BREAK_LOOP);
4225       addptr(len, 16);
4226       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4227 
4228       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4229       jcc(Assembler::zero, DONE);
4230 
4231       // Quick test using the already prepared vector mask
4232       movl(len, result);
4233       andl(len, 0x0000000f);   // tail count (in bytes)
4234       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4235       ptest(vec1, vec2);
4236       jcc(Assembler::zero, DONE);
4237       jmpb(TAIL_START);
4238 
4239       bind(BREAK_LOOP);
4240       // At least one byte in the last 16-byte vector is negative.
4241       // Set up and look at the last 16 bytes as if they were a tail
4242       lea(ary1, Address(ary1, len, Address::times_1));
4243       addptr(result, len);
4244       // Ignore the very last byte: if all others are positive,
4245       // it must be negative, so we can skip right to the 2+1 byte
4246       // end comparison at this point
4247       orl(result, 15);
4248       movl(len, 15);
4249       // Fallthru to tail compare
4250     }
4251   }
4252 
4253   bind(TAIL_START);
4254   // Compare 4-byte vectors
4255   andl(len, 0xfffffffc); // vector count (in bytes)
4256   jccb(Assembler::zero, COMPARE_CHAR);
4257 
4258   lea(ary1, Address(ary1, len, Address::times_1));
4259   negptr(len);
4260 
4261   bind(COMPARE_VECTORS);
4262   movl(tmp1, Address(ary1, len, Address::times_1));
4263   andl(tmp1, 0x80808080);
4264   jccb(Assembler::notZero, TAIL_ADJUST);
4265   addptr(len, 4);
4266   jccb(Assembler::notZero, COMPARE_VECTORS);
4267 
4268   // Compare trailing char (final 2-3 bytes), if any
4269   bind(COMPARE_CHAR);
4270 
4271   testl(result, 0x2);   // tail  char
4272   jccb(Assembler::zero, COMPARE_BYTE);
4273   load_unsigned_short(tmp1, Address(ary1, 0));
4274   andl(tmp1, 0x00008080);
4275   jccb(Assembler::notZero, CHAR_ADJUST);
4276   lea(ary1, Address(ary1, 2));
4277 
4278   bind(COMPARE_BYTE);
4279   testl(result, 0x1);   // tail  byte
4280   jccb(Assembler::zero, DONE);
4281   load_unsigned_byte(tmp1, Address(ary1, 0));
4282   testl(tmp1, 0x00000080);
4283   jccb(Assembler::zero, DONE);
4284   subptr(result, 1);
4285   jmpb(DONE);
4286 
4287   bind(TAIL_ADJUST);
4288   // there are negative bits in the last 4 byte block.
4289   // Adjust result and check the next three bytes
4290   addptr(result, len);
4291   orl(result, 3);
4292   lea(ary1, Address(ary1, len, Address::times_1));
4293   jmpb(COMPARE_CHAR);
4294 
4295   bind(CHAR_ADJUST);
4296   // We are looking at a char + optional byte tail, and found that one
4297   // of the bytes in the char is negative. Adjust the result, check the
4298   // first byte and readjust if needed.
4299   andl(result, 0xfffffffc);
4300   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4301   jccb(Assembler::notZero, DONE);
4302   addptr(result, 1);
4303 
4304   // That's it
4305   bind(DONE);
4306   if (UseAVX >= 2 && UseSSE >= 2) {
4307     // clean upper bits of YMM registers
4308     vpxor(vec1, vec1);
4309     vpxor(vec2, vec2);
4310   }
4311 }
4312 
4313 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4314 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4315                                       Register limit, Register result, Register chr,
4316                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4317                                       KRegister mask, bool expand_ary2) {
4318   // for expand_ary2, limit is the (smaller) size of the second array.
4319   ShortBranchVerifier sbv(this);
4320   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4321 
4322   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4323          "Expansion only implemented for AVX2");
4324 
4325   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4326   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4327 
4328   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4329   int scaleIncr = expand_ary2 ? 8 : 16;
4330 
4331   if (is_array_equ) {
4332     // Check the input args
4333     cmpoop(ary1, ary2);
4334     jcc(Assembler::equal, TRUE_LABEL);
4335 
4336     // Need additional checks for arrays_equals.
4337     testptr(ary1, ary1);
4338     jcc(Assembler::zero, FALSE_LABEL);
4339     testptr(ary2, ary2);
4340     jcc(Assembler::zero, FALSE_LABEL);
4341 
4342     // Check the lengths
4343     movl(limit, Address(ary1, length_offset));
4344     cmpl(limit, Address(ary2, length_offset));
4345     jcc(Assembler::notEqual, FALSE_LABEL);
4346   }
4347 
4348   // count == 0
4349   testl(limit, limit);
4350   jcc(Assembler::zero, TRUE_LABEL);
4351 
4352   if (is_array_equ) {
4353     // Load array address
4354     lea(ary1, Address(ary1, base_offset));
4355     lea(ary2, Address(ary2, base_offset));
4356   }
4357 
4358   if (is_array_equ && is_char) {
4359     // arrays_equals when used for char[].
4360     shll(limit, 1);      // byte count != 0
4361   }
4362   movl(result, limit); // copy
4363 
4364   if (UseAVX >= 2) {
4365     // With AVX2, use 32-byte vector compare
4366     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4367 
4368     // Compare 32-byte vectors
4369     if (expand_ary2) {
4370       andl(result, 0x0000000f);  //   tail count (in bytes)
4371       andl(limit, 0xfffffff0);   // vector count (in bytes)
4372       jcc(Assembler::zero, COMPARE_TAIL);
4373     } else {
4374       andl(result, 0x0000001f);  //   tail count (in bytes)
4375       andl(limit, 0xffffffe0);   // vector count (in bytes)
4376       jcc(Assembler::zero, COMPARE_TAIL_16);
4377     }
4378 
4379     lea(ary1, Address(ary1, limit, scaleFactor));
4380     lea(ary2, Address(ary2, limit, Address::times_1));
4381     negptr(limit);
4382 
4383 #ifdef _LP64
4384     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4385       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4386 
4387       cmpl(limit, -64);
4388       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4389 
4390       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4391 
4392       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4393       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4394       kortestql(mask, mask);
4395       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4396       addptr(limit, 64);  // update since we already compared at this addr
4397       cmpl(limit, -64);
4398       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4399 
4400       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4402       //  cmpl(limit, 0);
4403       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4404       // But since we stopped at the points ary{1,2}+limit which are
4405       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4406       // (|limit| <= 32 and result < 32),
4407       // we may just compare the last 64 bytes.
4408       //
      addptr(result, -64);   // this is safe because we just came from this area
4410       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4411       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4412       kortestql(mask, mask);
4413       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4414 
4415       jmp(TRUE_LABEL);
4416 
4417       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4418 
4419     }//if (VM_Version::supports_avx512vlbw())
4420 #endif //_LP64
4421     bind(COMPARE_WIDE_VECTORS);
4422     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4423     if (expand_ary2) {
4424       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4425     } else {
4426       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4427     }
4428     vpxor(vec1, vec2);
4429 
4430     vptest(vec1, vec1);
4431     jcc(Assembler::notZero, FALSE_LABEL);
4432     addptr(limit, scaleIncr * 2);
4433     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4434 
4435     testl(result, result);
4436     jcc(Assembler::zero, TRUE_LABEL);
4437 
4438     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4439     if (expand_ary2) {
4440       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4441     } else {
4442       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4443     }
4444     vpxor(vec1, vec2);
4445 
4446     vptest(vec1, vec1);
4447     jcc(Assembler::notZero, FALSE_LABEL);
4448     jmp(TRUE_LABEL);
4449 
4450     bind(COMPARE_TAIL_16); // limit is zero
4451     movl(limit, result);
4452 
4453     // Compare 16-byte chunks
4454     andl(result, 0x0000000f);  //   tail count (in bytes)
4455     andl(limit, 0xfffffff0);   // vector count (in bytes)
4456     jcc(Assembler::zero, COMPARE_TAIL);
4457 
4458     lea(ary1, Address(ary1, limit, scaleFactor));
4459     lea(ary2, Address(ary2, limit, Address::times_1));
4460     negptr(limit);
4461 
4462     bind(COMPARE_WIDE_VECTORS_16);
4463     movdqu(vec1, Address(ary1, limit, scaleFactor));
4464     if (expand_ary2) {
4465       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4466     } else {
4467       movdqu(vec2, Address(ary2, limit, Address::times_1));
4468     }
4469     pxor(vec1, vec2);
4470 
4471     ptest(vec1, vec1);
4472     jcc(Assembler::notZero, FALSE_LABEL);
4473     addptr(limit, scaleIncr);
4474     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4475 
4476     bind(COMPARE_TAIL); // limit is zero
4477     movl(limit, result);
4478     // Fallthru to tail compare
4479   } else if (UseSSE42Intrinsics) {
4480     // With SSE4.2, use double quad vector compare
4481     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4482 
4483     // Compare 16-byte vectors
4484     andl(result, 0x0000000f);  //   tail count (in bytes)
4485     andl(limit, 0xfffffff0);   // vector count (in bytes)
4486     jcc(Assembler::zero, COMPARE_TAIL);
4487 
4488     lea(ary1, Address(ary1, limit, Address::times_1));
4489     lea(ary2, Address(ary2, limit, Address::times_1));
4490     negptr(limit);
4491 
4492     bind(COMPARE_WIDE_VECTORS);
4493     movdqu(vec1, Address(ary1, limit, Address::times_1));
4494     movdqu(vec2, Address(ary2, limit, Address::times_1));
4495     pxor(vec1, vec2);
4496 
4497     ptest(vec1, vec1);
4498     jcc(Assembler::notZero, FALSE_LABEL);
4499     addptr(limit, 16);
4500     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4501 
4502     testl(result, result);
4503     jcc(Assembler::zero, TRUE_LABEL);
4504 
4505     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4506     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4507     pxor(vec1, vec2);
4508 
4509     ptest(vec1, vec1);
4510     jccb(Assembler::notZero, FALSE_LABEL);
4511     jmpb(TRUE_LABEL);
4512 
4513     bind(COMPARE_TAIL); // limit is zero
4514     movl(limit, result);
4515     // Fallthru to tail compare
4516   }
4517 
4518   // Compare 4-byte vectors
4519   if (expand_ary2) {
4520     testl(result, result);
4521     jccb(Assembler::zero, TRUE_LABEL);
4522   } else {
4523     andl(limit, 0xfffffffc); // vector count (in bytes)
4524     jccb(Assembler::zero, COMPARE_CHAR);
4525   }
4526 
4527   lea(ary1, Address(ary1, limit, scaleFactor));
4528   lea(ary2, Address(ary2, limit, Address::times_1));
4529   negptr(limit);
4530 
4531   bind(COMPARE_VECTORS);
4532   if (expand_ary2) {
4533     // There are no "vector" operations for bytes to shorts
4534     movzbl(chr, Address(ary2, limit, Address::times_1));
4535     cmpw(Address(ary1, limit, Address::times_2), chr);
4536     jccb(Assembler::notEqual, FALSE_LABEL);
4537     addptr(limit, 1);
4538     jcc(Assembler::notZero, COMPARE_VECTORS);
4539     jmp(TRUE_LABEL);
4540   } else {
4541     movl(chr, Address(ary1, limit, Address::times_1));
4542     cmpl(chr, Address(ary2, limit, Address::times_1));
4543     jccb(Assembler::notEqual, FALSE_LABEL);
4544     addptr(limit, 4);
4545     jcc(Assembler::notZero, COMPARE_VECTORS);
4546   }
4547 
4548   // Compare trailing char (final 2 bytes), if any
4549   bind(COMPARE_CHAR);
4550   testl(result, 0x2);   // tail  char
4551   jccb(Assembler::zero, COMPARE_BYTE);
4552   load_unsigned_short(chr, Address(ary1, 0));
4553   load_unsigned_short(limit, Address(ary2, 0));
4554   cmpl(chr, limit);
4555   jccb(Assembler::notEqual, FALSE_LABEL);
4556 
4557   if (is_array_equ && is_char) {
4558     bind(COMPARE_BYTE);
4559   } else {
4560     lea(ary1, Address(ary1, 2));
4561     lea(ary2, Address(ary2, 2));
4562 
4563     bind(COMPARE_BYTE);
4564     testl(result, 0x1);   // tail  byte
4565     jccb(Assembler::zero, TRUE_LABEL);
4566     load_unsigned_byte(chr, Address(ary1, 0));
4567     load_unsigned_byte(limit, Address(ary2, 0));
4568     cmpl(chr, limit);
4569     jccb(Assembler::notEqual, FALSE_LABEL);
4570   }
4571   bind(TRUE_LABEL);
4572   movl(result, 1);   // return true
4573   jmpb(DONE);
4574 
4575   bind(FALSE_LABEL);
4576   xorl(result, result); // return false
4577 
4578   // That's it
4579   bind(DONE);
4580   if (UseAVX >= 2) {
4581     // clean upper bits of YMM registers
4582     vpxor(vec1, vec1);
4583     vpxor(vec2, vec2);
4584   }
4585 }
4586 
4587 #ifdef _LP64
4588 
4589 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4590 #define __ masm.
4591   Register dst = stub.data<0>();
4592   XMMRegister src = stub.data<1>();
4593   address target = stub.data<2>();
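  // Out-of-line fix-up for the rare case where the truncating convert produced the
  // 0x80000000 sentinel: the original value is passed on the stack to the fix-up stub
  // and the corrected integer result is popped back into dst.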
4594   __ bind(stub.entry());
4595   __ subptr(rsp, 8);
4596   __ movdbl(Address(rsp), src);
4597   __ call(RuntimeAddress(target));
4598   __ pop(dst);
4599   __ jmp(stub.continuation());
4600 #undef __
4601 }
4602 
4603 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4604   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4605   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4606 
4607   address slowpath_target;
4608   if (dst_bt == T_INT) {
4609     if (src_bt == T_FLOAT) {
4610       cvttss2sil(dst, src);
4611       cmpl(dst, 0x80000000);
4612       slowpath_target = StubRoutines::x86::f2i_fixup();
4613     } else {
4614       cvttsd2sil(dst, src);
4615       cmpl(dst, 0x80000000);
4616       slowpath_target = StubRoutines::x86::d2i_fixup();
4617     }
4618   } else {
4619     if (src_bt == T_FLOAT) {
4620       cvttss2siq(dst, src);
4621       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4622       slowpath_target = StubRoutines::x86::f2l_fixup();
4623     } else {
4624       cvttsd2siq(dst, src);
4625       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4626       slowpath_target = StubRoutines::x86::d2l_fixup();
4627     }
4628   }
4629 
4630   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4631   jcc(Assembler::equal, stub->entry());
4632   bind(stub->continuation());
4633 }
4634 
4635 #endif // _LP64
4636 
4637 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4638                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4639   switch(ideal_opc) {
4640     case Op_LShiftVS:
4641       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4642     case Op_LShiftVI:
4643       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4644     case Op_LShiftVL:
4645       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4646     case Op_RShiftVS:
4647       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4648     case Op_RShiftVI:
4649       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4650     case Op_RShiftVL:
4651       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4652     case Op_URShiftVS:
4653       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4654     case Op_URShiftVI:
4655       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4656     case Op_URShiftVL:
4657       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4658     case Op_RotateRightV:
4659       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4660     case Op_RotateLeftV:
4661       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4662     default:
4663       fatal("Unsupported masked operation"); break;
4664   }
4665 }
4666 
4667 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4668                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4669                                     bool is_varshift) {
4670   switch (ideal_opc) {
4671     case Op_AddVB:
4672       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4673     case Op_AddVS:
4674       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4675     case Op_AddVI:
4676       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4677     case Op_AddVL:
4678       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4679     case Op_AddVF:
4680       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4681     case Op_AddVD:
4682       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4683     case Op_SubVB:
4684       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4685     case Op_SubVS:
4686       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4687     case Op_SubVI:
4688       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4689     case Op_SubVL:
4690       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4691     case Op_SubVF:
4692       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4693     case Op_SubVD:
4694       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4695     case Op_MulVS:
4696       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4697     case Op_MulVI:
4698       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4699     case Op_MulVL:
4700       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4701     case Op_MulVF:
4702       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4703     case Op_MulVD:
4704       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4705     case Op_DivVF:
4706       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4707     case Op_DivVD:
4708       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4709     case Op_SqrtVF:
4710       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4711     case Op_SqrtVD:
4712       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4713     case Op_AbsVB:
4714       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4715     case Op_AbsVS:
4716       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4717     case Op_AbsVI:
4718       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4719     case Op_AbsVL:
4720       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4721     case Op_FmaVF:
4722       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4723     case Op_FmaVD:
4724       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4725     case Op_VectorRearrange:
4726       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4727     case Op_LShiftVS:
4728       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4729     case Op_LShiftVI:
4730       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4731     case Op_LShiftVL:
4732       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4733     case Op_RShiftVS:
4734       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4735     case Op_RShiftVI:
4736       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4737     case Op_RShiftVL:
4738       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4739     case Op_URShiftVS:
4740       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4741     case Op_URShiftVI:
4742       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4743     case Op_URShiftVL:
4744       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4745     case Op_RotateLeftV:
4746       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_RotateRightV:
4748       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_MaxV:
4750       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_MinV:
4752       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_XorV:
4754       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_OrV:
4756       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_AndV:
4758       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4759     default:
4760       fatal("Unsupported masked operation"); break;
4761   }
4762 }
4763 
4764 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4765                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4766   switch (ideal_opc) {
4767     case Op_AddVB:
4768       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_AddVS:
4770       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_AddVI:
4772       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_AddVL:
4774       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_AddVF:
4776       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_AddVD:
4778       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_SubVB:
4780       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4781     case Op_SubVS:
4782       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4783     case Op_SubVI:
4784       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4785     case Op_SubVL:
4786       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_SubVF:
4788       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_SubVD:
4790       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_MulVS:
4792       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_MulVI:
4794       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_MulVL:
4796       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_MulVF:
4798       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_MulVD:
4800       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_DivVF:
4802       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_DivVD:
4804       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_FmaVF:
4806       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4807     case Op_FmaVD:
4808       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4809     case Op_MaxV:
4810       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_MinV:
4812       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_XorV:
4814       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_OrV:
4816       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_AndV:
4818       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4819     default:
4820       fatal("Unsupported masked operation"); break;
4821   }
4822 }
4823 
4824 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4825                                   KRegister src1, KRegister src2) {
4826   BasicType etype = T_ILLEGAL;
4827   switch(mask_len) {
4828     case 2:
4829     case 4:
4830     case 8:  etype = T_BYTE; break;
4831     case 16: etype = T_SHORT; break;
4832     case 32: etype = T_INT; break;
4833     case 64: etype = T_LONG; break;
4834     default: fatal("Unsupported type"); break;
4835   }
4836   assert(etype != T_ILLEGAL, "");
4837   switch(ideal_opc) {
4838     case Op_AndVMask:
4839       kand(etype, dst, src1, src2); break;
4840     case Op_OrVMask:
4841       kor(etype, dst, src1, src2); break;
4842     case Op_XorVMask:
4843       kxor(etype, dst, src1, src2); break;
4844     default:
4845       fatal("Unsupported masked operation"); break;
4846   }
4847 }
4848 
4849 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4851  * If src is NaN, the result is 0.
4852  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4853  * the result is equal to the value of Integer.MIN_VALUE.
4854  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4855  * the result is equal to the value of Integer.MAX_VALUE.
4856  */
4857 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4858                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4859                                                                    Register rscratch, AddressLiteral float_sign_flip,
4860                                                                    int vec_enc) {
4861   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4862   Label done;
4863   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4864   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4865   vptest(xtmp2, xtmp2, vec_enc);
4866   jccb(Assembler::equal, done);
4867 
4868   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4869   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4870 
4871   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4872   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4873   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4874 
4875   // Recompute the mask for remaining special value.
4876   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4877   // Extract SRC values corresponding to TRUE mask lanes.
4878   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of the MASK lanes corresponding to +ve special
  // values is set.
4881   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4882 
4883   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4884   bind(done);
4885 }
4886 
4887 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4888                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4889                                                                     Register rscratch, AddressLiteral float_sign_flip,
4890                                                                     int vec_enc) {
4891   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4892   Label done;
4893   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4894   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4895   kortestwl(ktmp1, ktmp1);
4896   jccb(Assembler::equal, done);
4897 
4898   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4899   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4900   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4901 
4902   kxorwl(ktmp1, ktmp1, ktmp2);
4903   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4904   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4905   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4906   bind(done);
4907 }
4908 
4909 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4910                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4911                                                                      Register rscratch, AddressLiteral double_sign_flip,
4912                                                                      int vec_enc) {
4913   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4914 
4915   Label done;
4916   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4917   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4918   kortestwl(ktmp1, ktmp1);
4919   jccb(Assembler::equal, done);
4920 
4921   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4922   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4923   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4924 
4925   kxorwl(ktmp1, ktmp1, ktmp2);
4926   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4927   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4928   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4929   bind(done);
4930 }
4931 
4932 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4933                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4934                                                                      Register rscratch, AddressLiteral float_sign_flip,
4935                                                                      int vec_enc) {
4936   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4937   Label done;
4938   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4939   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4940   kortestwl(ktmp1, ktmp1);
4941   jccb(Assembler::equal, done);
4942 
4943   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4944   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4945   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4946 
4947   kxorwl(ktmp1, ktmp1, ktmp2);
4948   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4949   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4950   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4951   bind(done);
4952 }
4953 
4954 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4956  * If src is NaN, the result is 0.
4957  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4958  * the result is equal to the value of Long.MIN_VALUE.
4959  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4960  * the result is equal to the value of Long.MAX_VALUE.
4961  */
4962 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4963                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4964                                                                       Register rscratch, AddressLiteral double_sign_flip,
4965                                                                       int vec_enc) {
4966   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4967 
4968   Label done;
4969   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4970   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4971   kortestwl(ktmp1, ktmp1);
4972   jccb(Assembler::equal, done);
4973 
4974   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4975   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4976   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4977 
4978   kxorwl(ktmp1, ktmp1, ktmp2);
4979   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4980   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4981   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4982   bind(done);
4983 }
4984 
4985 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4986                                                              XMMRegister xtmp, int index, int vec_enc) {
4987    assert(vec_enc < Assembler::AVX_512bit, "");
4988    if (vec_enc == Assembler::AVX_256bit) {
4989      vextractf128_high(xtmp, src);
4990      vshufps(dst, src, xtmp, index, vec_enc);
4991    } else {
4992      vshufps(dst, src, zero, index, vec_enc);
4993    }
4994 }
4995 
4996 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4997                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4998                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4999   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5000 
5001   Label done;
5002   // Compare the destination lanes with float_sign_flip
5003   // value to get mask for all special values.
5004   movdqu(xtmp1, float_sign_flip, rscratch);
5005   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5006   ptest(xtmp2, xtmp2);
5007   jccb(Assembler::equal, done);
5008 
5009   // Flip float_sign_flip to get max integer value.
5010   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5011   pxor(xtmp1, xtmp4);
5012 
  // Set destination lanes corresponding to unordered source lanes to zero.
5014   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5015   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5016 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5018   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5019   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5020 
5021   // Recompute the mask for remaining special value.
5022   pxor(xtmp2, xtmp3);
5023   // Extract mask corresponding to non-negative source lanes.
5024   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5025 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5027   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5028   pand(xtmp3, xtmp2);
5029 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5032   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5033   bind(done);
5034 }
5035 
5036 
5037 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5038                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5039   switch(to_elem_bt) {
5040     case T_SHORT:
5041       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5042       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5043       vpackusdw(dst, dst, zero, vec_enc);
5044       if (vec_enc == Assembler::AVX_256bit) {
5045         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5046       }
5047       break;
5048     case  T_BYTE:
5049       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5050       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5051       vpackusdw(dst, dst, zero, vec_enc);
5052       if (vec_enc == Assembler::AVX_256bit) {
5053         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5054       }
5055       vpackuswb(dst, dst, zero, vec_enc);
5056       break;
5057     default: assert(false, "%s", type2name(to_elem_bt));
5058   }
5059 }
5060 
5061 /*
5062  * Algorithm for vector D2L and F2I conversions:-
5063  * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000,
 *    which would signify that the source value was one of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
5067  * c) Set destination to zero if source is NaN value.
5068  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5069  */
5070 
5071 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5072                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5073                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5074   int to_elem_sz = type2aelembytes(to_elem_bt);
5075   assert(to_elem_sz <= 4, "");
5076   vcvttps2dq(dst, src, vec_enc);
5077   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5078   if (to_elem_sz < 4) {
5079     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5080     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5081   }
5082 }
5083 
5084 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5085                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5086                                             Register rscratch, int vec_enc) {
5087   int to_elem_sz = type2aelembytes(to_elem_bt);
5088   assert(to_elem_sz <= 4, "");
5089   vcvttps2dq(dst, src, vec_enc);
5090   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5091   switch(to_elem_bt) {
5092     case T_INT:
5093       break;
5094     case T_SHORT:
5095       evpmovdw(dst, dst, vec_enc);
5096       break;
5097     case T_BYTE:
5098       evpmovdb(dst, dst, vec_enc);
5099       break;
5100     default: assert(false, "%s", type2name(to_elem_bt));
5101   }
5102 }
5103 
5104 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5105                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5106                                             Register rscratch, int vec_enc) {
5107   evcvttps2qq(dst, src, vec_enc);
5108   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5109 }
5110 
5111 // Handling for downcasting from double to integer or sub-word types on AVX2.
5112 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5113                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5114                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5115   int to_elem_sz = type2aelembytes(to_elem_bt);
5116   assert(to_elem_sz < 8, "");
5117   vcvttpd2dq(dst, src, vec_enc);
5118   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5119                                               float_sign_flip, vec_enc);
5120   if (to_elem_sz < 4) {
5121     // xtmp4 holds all zero lanes.
5122     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5123   }
5124 }
5125 
5126 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5127                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5128                                             KRegister ktmp2, AddressLiteral sign_flip,
5129                                             Register rscratch, int vec_enc) {
5130   if (VM_Version::supports_avx512dq()) {
5131     evcvttpd2qq(dst, src, vec_enc);
5132     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5133     switch(to_elem_bt) {
5134       case T_LONG:
5135         break;
5136       case T_INT:
5137         evpmovsqd(dst, dst, vec_enc);
5138         break;
5139       case T_SHORT:
5140         evpmovsqd(dst, dst, vec_enc);
5141         evpmovdw(dst, dst, vec_enc);
5142         break;
5143       case T_BYTE:
5144         evpmovsqd(dst, dst, vec_enc);
5145         evpmovdb(dst, dst, vec_enc);
5146         break;
5147       default: assert(false, "%s", type2name(to_elem_bt));
5148     }
5149   } else {
5150     assert(type2aelembytes(to_elem_bt) <= 4, "");
5151     vcvttpd2dq(dst, src, vec_enc);
5152     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5153     switch(to_elem_bt) {
5154       case T_INT:
5155         break;
5156       case T_SHORT:
5157         evpmovdw(dst, dst, vec_enc);
5158         break;
5159       case T_BYTE:
5160         evpmovdb(dst, dst, vec_enc);
5161         break;
5162       default: assert(false, "%s", type2name(to_elem_bt));
5163     }
5164   }
5165 }
5166 
5167 #ifdef _LP64
5168 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5169                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5170                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round-towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
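  // e.g. (illustrative) 2.3 + 0.5 = 2.8 -> 2, 2.5 + 0.5 = 3.0 -> 3, and
  // -2.5 + 0.5 = -2.0 -> -2 under round-towards -inf, matching round-half-up
  // (Math.round) semantics.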
5173   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5174 
5175   mov64(tmp, julong_cast(0.5L));
5176   evpbroadcastq(xtmp1, tmp, vec_enc);
5177   vaddpd(xtmp1, src , xtmp1, vec_enc);
5178   evcvtpd2qq(dst, xtmp1, vec_enc);
5179   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5181 
5182   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5183 }
5184 
5185 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5186                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5187                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round-towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5190   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5191 
5192   movl(tmp, jint_cast(0.5));
5193   movq(xtmp1, tmp);
5194   vbroadcastss(xtmp1, xtmp1, vec_enc);
5195   vaddps(xtmp1, src , xtmp1, vec_enc);
5196   vcvtps2dq(dst, xtmp1, vec_enc);
5197   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5198                                               float_sign_flip, vec_enc);
5199 
5200   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5201 }
5202 
5203 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5204                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5205                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round-towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5208   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5209 
5210   movl(tmp, jint_cast(0.5));
5211   movq(xtmp1, tmp);
5212   vbroadcastss(xtmp1, xtmp1, vec_enc);
5213   vaddps(xtmp1, src , xtmp1, vec_enc);
5214   vcvtps2dq(dst, xtmp1, vec_enc);
5215   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5216 
5217   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5218 }
5219 #endif // _LP64
5220 
5221 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5222                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5223   switch (from_elem_bt) {
5224     case T_BYTE:
5225       switch (to_elem_bt) {
5226         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5227         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5228         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5229         default: ShouldNotReachHere();
5230       }
5231       break;
5232     case T_SHORT:
5233       switch (to_elem_bt) {
5234         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5235         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5236         default: ShouldNotReachHere();
5237       }
5238       break;
5239     case T_INT:
5240       assert(to_elem_bt == T_LONG, "");
5241       vpmovzxdq(dst, src, vlen_enc);
5242       break;
5243     default:
5244       ShouldNotReachHere();
5245   }
5246 }
5247 
5248 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5249                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5250   switch (from_elem_bt) {
5251     case T_BYTE:
5252       switch (to_elem_bt) {
5253         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5254         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5255         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5256         default: ShouldNotReachHere();
5257       }
5258       break;
5259     case T_SHORT:
5260       switch (to_elem_bt) {
5261         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5262         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5263         default: ShouldNotReachHere();
5264       }
5265       break;
5266     case T_INT:
5267       assert(to_elem_bt == T_LONG, "");
5268       vpmovsxdq(dst, src, vlen_enc);
5269       break;
5270     default:
5271       ShouldNotReachHere();
5272   }
5273 }
5274 
5275 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5276                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5277   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5278   assert(vlen_enc != AVX_512bit, "");
5279 
5280   int dst_bt_size = type2aelembytes(dst_bt);
5281   int src_bt_size = type2aelembytes(src_bt);
5282   if (dst_bt_size > src_bt_size) {
5283     switch (dst_bt_size / src_bt_size) {
5284       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5285       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5286       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5287       default: ShouldNotReachHere();
5288     }
5289   } else {
5290     assert(dst_bt_size < src_bt_size, "");
5291     switch (src_bt_size / dst_bt_size) {
5292       case 2: {
5293         if (vlen_enc == AVX_128bit) {
5294           vpacksswb(dst, src, src, vlen_enc);
5295         } else {
5296           vpacksswb(dst, src, src, vlen_enc);
5297           vpermq(dst, dst, 0x08, vlen_enc);
5298         }
5299         break;
5300       }
5301       case 4: {
5302         if (vlen_enc == AVX_128bit) {
5303           vpackssdw(dst, src, src, vlen_enc);
5304           vpacksswb(dst, dst, dst, vlen_enc);
5305         } else {
5306           vpackssdw(dst, src, src, vlen_enc);
5307           vpermq(dst, dst, 0x08, vlen_enc);
5308           vpacksswb(dst, dst, dst, AVX_128bit);
5309         }
5310         break;
5311       }
5312       case 8: {
5313         if (vlen_enc == AVX_128bit) {
5314           vpshufd(dst, src, 0x08, vlen_enc);
5315           vpackssdw(dst, dst, dst, vlen_enc);
5316           vpacksswb(dst, dst, dst, vlen_enc);
5317         } else {
5318           vpshufd(dst, src, 0x08, vlen_enc);
5319           vpermq(dst, dst, 0x08, vlen_enc);
5320           vpackssdw(dst, dst, dst, AVX_128bit);
5321           vpacksswb(dst, dst, dst, AVX_128bit);
5322         }
5323         break;
5324       }
5325       default: ShouldNotReachHere();
5326     }
5327   }
5328 }
5329 
5330 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5331                                    bool merge, BasicType bt, int vlen_enc) {
5332   if (bt == T_INT) {
5333     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5334   } else {
5335     assert(bt == T_LONG, "");
5336     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5337   }
5338 }
5339 
5340 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5341                                    bool merge, BasicType bt, int vlen_enc) {
5342   if (bt == T_INT) {
5343     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5344   } else {
5345     assert(bt == T_LONG, "");
5346     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5347   }
5348 }
5349 
5350 #ifdef _LP64
5351 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5352                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5353                                                int vec_enc) {
5354   int index = 0;
5355   int vindex = 0;
5356   mov64(rtmp1, 0x0101010101010101L);
5357   pdepq(rtmp1, src, rtmp1);
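  // PDEP scatters the low mask bits into one byte each, e.g. (illustrative)
  // src == 0b1011 gives rtmp1 == 0x0000000001000101, i.e. lane bytes {1,1,0,1,0,...}.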
5358   if (mask_len > 8) {
5359     movq(rtmp2, src);
5360     vpxor(xtmp, xtmp, xtmp, vec_enc);
5361     movq(xtmp, rtmp1);
5362   }
5363   movq(dst, rtmp1);
5364 
5365   mask_len -= 8;
5366   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be a multiple of 8");
5368     index++;
5369     if ((index % 2) == 0) {
5370       pxor(xtmp, xtmp);
5371     }
5372     mov64(rtmp1, 0x0101010101010101L);
5373     shrq(rtmp2, 8);
5374     pdepq(rtmp1, rtmp2, rtmp1);
5375     pinsrq(xtmp, rtmp1, index % 2);
5376     vindex = index / 2;
5377     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes have been updated, to save redundant instructions.
5380       if (index % 2) {
5381         vinsertf128(dst, dst, xtmp, vindex);
5382       }
5383     } else {
5384       vmovdqu(dst, xtmp);
5385     }
5386     mask_len -= 8;
5387   }
5388 }
5389 
5390 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5391   switch(opc) {
5392     case Op_VectorMaskTrueCount:
5393       popcntq(dst, tmp);
5394       break;
5395     case Op_VectorMaskLastTrue:
5396       if (VM_Version::supports_lzcnt()) {
5397         lzcntq(tmp, tmp);
5398         movl(dst, 63);
5399         subl(dst, tmp);
5400       } else {
5401         movl(dst, -1);
5402         bsrq(tmp, tmp);
5403         cmov32(Assembler::notZero, dst, tmp);
5404       }
5405       break;
5406     case Op_VectorMaskFirstTrue:
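      // Setting a sentinel bit at position masklen (below) makes tzcnt/bsf return
      // masklen when no lane is set, which is the expected firstTrue result for an
      // all-false mask.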
5407       if (VM_Version::supports_bmi1()) {
5408         if (masklen < 32) {
5409           orl(tmp, 1 << masklen);
5410           tzcntl(dst, tmp);
5411         } else if (masklen == 32) {
5412           tzcntl(dst, tmp);
5413         } else {
5414           assert(masklen == 64, "");
5415           tzcntq(dst, tmp);
5416         }
5417       } else {
5418         if (masklen < 32) {
5419           orl(tmp, 1 << masklen);
5420           bsfl(dst, tmp);
5421         } else {
5422           assert(masklen == 32 || masklen == 64, "");
5423           movl(dst, masklen);
5424           if (masklen == 32)  {
5425             bsfl(tmp, tmp);
5426           } else {
5427             bsfq(tmp, tmp);
5428           }
5429           cmov32(Assembler::notZero, dst, tmp);
5430         }
5431       }
5432       break;
5433     case Op_VectorMaskToLong:
5434       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5435       break;
5436     default: assert(false, "Unhandled mask operation");
5437   }
5438 }
5439 
5440 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5441                                               int masklen, int masksize, int vec_enc) {
5442   assert(VM_Version::supports_popcnt(), "");
5443 
  if (VM_Version::supports_avx512bw()) {
5445     kmovql(tmp, mask);
5446   } else {
5447     assert(masklen <= 16, "");
5448     kmovwl(tmp, mask);
5449   }
5450 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
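  // e.g. (illustrative) masklen == 4: tmp &= 0xF drops any stale upper bits left
  // over from the full-width kmov above.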
5453   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5454     andq(tmp, (1 << masklen) - 1);
5455   }
5456 
5457   vector_mask_operation_helper(opc, dst, tmp, masklen);
5458 }
5459 
5460 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5461                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5462   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5463          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5464   assert(VM_Version::supports_popcnt(), "");
5465 
5466   bool need_clip = false;
5467   switch(bt) {
5468     case T_BOOLEAN:
      // Masks of other types contain lane values of 0 or -1; boolean masks
      // contain lane values of 0 or 1.
5470       vpxor(xtmp, xtmp, xtmp, vec_enc);
5471       vpsubb(xtmp, xtmp, mask, vec_enc);
5472       vpmovmskb(tmp, xtmp, vec_enc);
5473       need_clip = masklen < 16;
5474       break;
5475     case T_BYTE:
5476       vpmovmskb(tmp, mask, vec_enc);
5477       need_clip = masklen < 16;
5478       break;
5479     case T_SHORT:
5480       vpacksswb(xtmp, mask, mask, vec_enc);
5481       if (masklen >= 16) {
5482         vpermpd(xtmp, xtmp, 8, vec_enc);
5483       }
5484       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5485       need_clip = masklen < 16;
5486       break;
5487     case T_INT:
5488     case T_FLOAT:
5489       vmovmskps(tmp, mask, vec_enc);
5490       need_clip = masklen < 4;
5491       break;
5492     case T_LONG:
5493     case T_DOUBLE:
5494       vmovmskpd(tmp, mask, vec_enc);
5495       need_clip = masklen < 2;
5496       break;
5497     default: assert(false, "Unhandled type, %s", type2name(bt));
5498   }
5499 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5502   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5503     // need_clip implies masklen < 32
5504     andq(tmp, (1 << masklen) - 1);
5505   }
5506 
5507   vector_mask_operation_helper(opc, dst, tmp, masklen);
5508 }
5509 
5510 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5511                                              Register rtmp2, int mask_len) {
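  // Compress the incoming mask: dst ends up with its lowest popcount(src) bits set
  // (considering only the low mask_len bits of src). The PEXT below extracts one bit
  // from the all-ones value per set bit of the mask and packs them into the low
  // positions. For example, src = 0b1010 with mask_len = 4 yields dst = 0b0011.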
5512   kmov(rtmp1, src);
5513   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5514   mov64(rtmp2, -1L);
5515   pextq(rtmp2, rtmp2, rtmp1);
5516   kmov(dst, rtmp2);
5517 }
5518 
5519 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5520                                                     XMMRegister mask, Register rtmp, Register rscratch,
5521                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5522                                                     int vec_enc) {
5523   assert(type2aelembytes(bt) >= 4, "");
5524   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5525   address compress_perm_table = nullptr;
5526   address expand_perm_table = nullptr;
5527   if (type2aelembytes(bt) == 8) {
5528     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5529     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5530     vmovmskpd(rtmp, mask, vec_enc);
5531   } else {
5532     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5533     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5534     vmovmskps(rtmp, mask, vec_enc);
5535   }
5536   shlq(rtmp, 5); // for 32 byte permute row.
5537   if (opcode == Op_CompressV) {
5538     lea(rscratch, ExternalAddress(compress_perm_table));
5539   } else {
5540     lea(rscratch, ExternalAddress(expand_perm_table));
5541   }
5542   addptr(rtmp, rscratch);
5543   vmovdqu(permv, Address(rtmp));
5544   vpermps(dst, permv, src, Assembler::AVX_256bit);
5545   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask. Each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
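  // For instance (illustrative, assuming the usual compress table layout where a row
  // lists the indices of the selected lanes followed by -1 entries): for an 8 lane
  // compress with mask 0b00000101 the row would be {0, 2, -1, -1, -1, -1, -1, -1}, so
  // lanes 0 and 2 are packed into positions 0 and 1 and the blend below zeroes the
  // remaining positions, whose permute entries have the sign bit set.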
5550   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5551 }
5552 
5553 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5554                                                bool merge, BasicType bt, int vec_enc) {
5555   if (opcode == Op_CompressV) {
5556     switch(bt) {
5557     case T_BYTE:
5558       evpcompressb(dst, mask, src, merge, vec_enc);
5559       break;
5560     case T_CHAR:
5561     case T_SHORT:
5562       evpcompressw(dst, mask, src, merge, vec_enc);
5563       break;
5564     case T_INT:
5565       evpcompressd(dst, mask, src, merge, vec_enc);
5566       break;
5567     case T_FLOAT:
5568       evcompressps(dst, mask, src, merge, vec_enc);
5569       break;
5570     case T_LONG:
5571       evpcompressq(dst, mask, src, merge, vec_enc);
5572       break;
5573     case T_DOUBLE:
5574       evcompresspd(dst, mask, src, merge, vec_enc);
5575       break;
5576     default:
5577       fatal("Unsupported type %s", type2name(bt));
5578       break;
5579     }
5580   } else {
5581     assert(opcode == Op_ExpandV, "");
5582     switch(bt) {
5583     case T_BYTE:
5584       evpexpandb(dst, mask, src, merge, vec_enc);
5585       break;
5586     case T_CHAR:
5587     case T_SHORT:
5588       evpexpandw(dst, mask, src, merge, vec_enc);
5589       break;
5590     case T_INT:
5591       evpexpandd(dst, mask, src, merge, vec_enc);
5592       break;
5593     case T_FLOAT:
5594       evexpandps(dst, mask, src, merge, vec_enc);
5595       break;
5596     case T_LONG:
5597       evpexpandq(dst, mask, src, merge, vec_enc);
5598       break;
5599     case T_DOUBLE:
5600       evexpandpd(dst, mask, src, merge, vec_enc);
5601       break;
5602     default:
5603       fatal("Unsupported type %s", type2name(bt));
5604       break;
5605     }
5606   }
5607 }
5608 #endif
5609 
5610 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5611                                            KRegister ktmp1, int vec_enc) {
5612   if (opcode == Op_SignumVD) {
5613     vsubpd(dst, zero, one, vec_enc);
5614     // if src < 0 ? -1 : 1
5615     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5616     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5618     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5619     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5620   } else {
5621     assert(opcode == Op_SignumVF, "");
5622     vsubps(dst, zero, one, vec_enc);
5623     // if src < 0 ? -1 : 1
5624     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5625     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5627     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5628     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5629   }
5630 }
5631 
5632 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5633                                           XMMRegister xtmp1, int vec_enc) {
5634   if (opcode == Op_SignumVD) {
5635     vsubpd(dst, zero, one, vec_enc);
5636     // if src < 0 ? -1 : 1
5637     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5639     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5640     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5641   } else {
5642     assert(opcode == Op_SignumVF, "");
5643     vsubps(dst, zero, one, vec_enc);
5644     // if src < 0 ? -1 : 1
5645     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5647     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5648     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5649   }
5650 }
5651 
5652 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5653   if (VM_Version::supports_avx512bw()) {
5654     if (mask_len > 32) {
5655       kmovql(dst, src);
5656     } else {
5657       kmovdl(dst, src);
5658       if (mask_len != 32) {
5659         kshiftrdl(dst, dst, 32 - mask_len);
5660       }
5661     }
5662   } else {
5663     assert(mask_len <= 16, "");
5664     kmovwl(dst, src);
5665     if (mask_len != 16) {
5666       kshiftrwl(dst, dst, 16 - mask_len);
5667     }
5668   }
5669 }
5670 
5671 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5672   int lane_size = type2aelembytes(bt);
5673   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5674   if ((is_LP64 || lane_size < 8) &&
5675       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5676        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5677     movptr(rtmp, imm32);
5678     switch(lane_size) {
5679       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5680       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5681       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5682       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5683       fatal("Unsupported lane size %d", lane_size);
5684       break;
5685     }
5686   } else {
5687     movptr(rtmp, imm32);
5688     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5689     switch(lane_size) {
5690       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5691       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5692       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5693       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5694       fatal("Unsupported lane size %d", lane_size);
5695       break;
5696     }
5697   }
5698 }
5699 
5700 //
// Following is the lookup table based popcount computation algorithm:
5702 //       Index   Bit set count
5703 //     [ 0000 ->   0,
5704 //       0001 ->   1,
5705 //       0010 ->   1,
5706 //       0011 ->   2,
5707 //       0100 ->   1,
5708 //       0101 ->   2,
5709 //       0110 ->   2,
5710 //       0111 ->   3,
5711 //       1000 ->   1,
5712 //       1001 ->   2,
5713 //       1010 ->   3,
5714 //       1011 ->   3,
5715 //       1100 ->   2,
5716 //       1101 ->   3,
5717 //       1111 ->   4 ]
5718 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5719 //     shuffle indices for lookup table access.
5720 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5722 //     shuffle indices for lookup table access.
5723 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5724 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5725 //     count of all the bytes of a quadword.
5726 //  f. Perform step e. for upper 128bit vector lane.
5727 //  g. Pack the bitset count of quadwords back to double word.
5728 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
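//
// Worked example for a single byte (steps a-d): for the source byte 0xB6 = 0b10110110,
// the lower nibble 0b0110 looks up a count of 2 and the upper nibble 0b1011 looks up
// a count of 3, so the byte's popcount is 2 + 3 = 5.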
5729 
5730 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5731                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5732   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5733   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5734   vpsrlw(dst, src, 4, vec_enc);
5735   vpand(dst, dst, xtmp1, vec_enc);
5736   vpand(xtmp1, src, xtmp1, vec_enc);
5737   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5738   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5739   vpshufb(dst, xtmp2, dst, vec_enc);
5740   vpaddb(dst, dst, xtmp1, vec_enc);
5741 }
5742 
5743 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5744                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5745   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5746   // Following code is as per steps e,f,g and h of above algorithm.
5747   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5748   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5749   vpsadbw(dst, dst, xtmp2, vec_enc);
5750   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5751   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5752   vpackuswb(dst, xtmp1, dst, vec_enc);
5753 }
5754 
5755 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5756                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5757   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5758   // Add the popcount of upper and lower bytes of word.
5759   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5760   vpsrlw(dst, xtmp1, 8, vec_enc);
5761   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5762   vpaddw(dst, dst, xtmp1, vec_enc);
5763 }
5764 
5765 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5766                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5767   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5768   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5769   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5770 }
5771 
5772 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5773                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5774   switch(bt) {
5775     case T_LONG:
5776       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5777       break;
5778     case T_INT:
5779       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5780       break;
5781     case T_CHAR:
5782     case T_SHORT:
5783       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5784       break;
5785     case T_BYTE:
5786     case T_BOOLEAN:
5787       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5788       break;
5789     default:
5790       fatal("Unsupported type %s", type2name(bt));
5791       break;
5792   }
5793 }
5794 
5795 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5796                                                       KRegister mask, bool merge, int vec_enc) {
5797   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5798   switch(bt) {
5799     case T_LONG:
5800       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5801       evpopcntq(dst, mask, src, merge, vec_enc);
5802       break;
5803     case T_INT:
5804       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5805       evpopcntd(dst, mask, src, merge, vec_enc);
5806       break;
5807     case T_CHAR:
5808     case T_SHORT:
5809       assert(VM_Version::supports_avx512_bitalg(), "");
5810       evpopcntw(dst, mask, src, merge, vec_enc);
5811       break;
5812     case T_BYTE:
5813     case T_BOOLEAN:
5814       assert(VM_Version::supports_avx512_bitalg(), "");
5815       evpopcntb(dst, mask, src, merge, vec_enc);
5816       break;
5817     default:
5818       fatal("Unsupported type %s", type2name(bt));
5819       break;
5820   }
5821 }
5822 
5823 #ifndef _LP64
5824 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5825   assert(VM_Version::supports_avx512bw(), "");
5826   kmovdl(tmp, src);
5827   kunpckdql(dst, tmp, tmp);
5828 }
5829 #endif
5830 
// The bit reversal algorithm first reverses the bits of each byte and then performs
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
// is obtained by swapping the reversed bit sequences of its upper and lower
// nibbles.
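// For example, for the byte 0xA3 = 0b10100011: the lower nibble 0b0011 reverses to
// 0b1100 and is shifted into the upper nibble (0xC0), the upper nibble 0b1010 reverses
// to 0b0101 and lands in the lower nibble (0x05); OR-ing the two gives the bit reversed
// byte 0xC5 = 0b11000101.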
5837 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5838                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5839   if (VM_Version::supports_avx512vlbw()) {
5840 
5841     // Get the reverse bit sequence of lower nibble of each byte.
5842     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5843     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5844     evpandq(dst, xtmp2, src, vec_enc);
5845     vpshufb(dst, xtmp1, dst, vec_enc);
5846     vpsllq(dst, dst, 4, vec_enc);
5847 
5848     // Get the reverse bit sequence of upper nibble of each byte.
5849     vpandn(xtmp2, xtmp2, src, vec_enc);
5850     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5851     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5852 
5853     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5854     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5855     evporq(xtmp2, dst, xtmp2, vec_enc);
5856     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5857 
  } else if (vec_enc == Assembler::AVX_512bit) {
5859     // Shift based bit reversal.
5860     assert(bt == T_LONG || bt == T_INT, "");
5861 
5862     // Swap lower and upper nibble of each byte.
5863     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5864 
5865     // Swap two least and most significant bits of each nibble.
5866     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5867 
5868     // Swap adjacent pair of bits.
5869     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5870     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5871 
5872     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5873     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5874   } else {
5875     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5876     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5877 
5878     // Get the reverse bit sequence of lower nibble of each byte.
5879     vpand(dst, xtmp2, src, vec_enc);
5880     vpshufb(dst, xtmp1, dst, vec_enc);
5881     vpsllq(dst, dst, 4, vec_enc);
5882 
5883     // Get the reverse bit sequence of upper nibble of each byte.
5884     vpandn(xtmp2, xtmp2, src, vec_enc);
5885     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5886     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5887 
5888     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5889     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5890     vpor(xtmp2, dst, xtmp2, vec_enc);
5891     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5892   }
5893 }
5894 
5895 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5896                                                 XMMRegister xtmp, Register rscratch) {
5897   assert(VM_Version::supports_gfni(), "");
5898   assert(rscratch != noreg || always_reachable(mask), "missing");
5899 
  // Galois field instruction based bit reversal, following the algorithm described at
5901   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5902   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5903   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5904   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5905 }
5906 
5907 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5908                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5909   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5910   evpandq(dst, xtmp1, src, vec_enc);
5911   vpsllq(dst, dst, nbits, vec_enc);
5912   vpandn(xtmp1, xtmp1, src, vec_enc);
5913   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5914   evporq(dst, dst, xtmp1, vec_enc);
5915 }
5916 
5917 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5918                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5919   // Shift based bit reversal.
5920   assert(VM_Version::supports_evex(), "");
5921   switch(bt) {
5922     case T_LONG:
5923       // Swap upper and lower double word of each quad word.
5924       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5925       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5926       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5927       break;
5928     case T_INT:
5929       // Swap upper and lower word of each double word.
5930       evprord(xtmp1, k0, src, 16, true, vec_enc);
5931       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5932       break;
5933     case T_CHAR:
5934     case T_SHORT:
5935       // Swap upper and lower byte of each word.
5936       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5937       break;
5938     case T_BYTE:
5939       evmovdquq(dst, k0, src, true, vec_enc);
5940       break;
5941     default:
5942       fatal("Unsupported type %s", type2name(bt));
5943       break;
5944   }
5945 }
5946 
5947 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5948   if (bt == T_BYTE) {
5949     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5950       evmovdquq(dst, k0, src, true, vec_enc);
5951     } else {
5952       vmovdqu(dst, src);
5953     }
5954     return;
5955   }
5956   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5957   // pre-computed shuffle indices.
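  // For example (illustrative, assuming the T_INT shuffle mask swaps the four bytes of
  // each int lane, i.e. control bytes 3, 2, 1, 0 for the first lane), an int lane
  // holding 0x11223344 is shuffled to 0x44332211.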
5958   switch(bt) {
5959     case T_LONG:
5960       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5961       break;
5962     case T_INT:
5963       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5964       break;
5965     case T_CHAR:
5966     case T_SHORT:
5967       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5968       break;
5969     default:
5970       fatal("Unsupported type %s", type2name(bt));
5971       break;
5972   }
5973   vpshufb(dst, src, dst, vec_enc);
5974 }
5975 
5976 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5977                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5978                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5979   assert(is_integral_type(bt), "");
5980   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5981   assert(VM_Version::supports_avx512cd(), "");
5982   switch(bt) {
5983     case T_LONG:
5984       evplzcntq(dst, ktmp, src, merge, vec_enc);
5985       break;
5986     case T_INT:
5987       evplzcntd(dst, ktmp, src, merge, vec_enc);
5988       break;
5989     case T_SHORT:
5990       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5991       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5992       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5993       vpunpckhwd(dst, xtmp1, src, vec_enc);
5994       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5995       vpackusdw(dst, xtmp2, dst, vec_enc);
5996       break;
5997     case T_BYTE:
5998       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5999       // accessing the lookup table.
6000       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6001       // accessing the lookup table.
6002       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
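      // For example (assuming the LUT maps a nibble to its leading zero count within
      // 4 bits, e.g. LUT[0] = 4, LUT[1] = 3, LUT[4] = 1): for the byte 0x04 the MSB
      // nibble is 0 (T2 = 4) and the LSB nibble is 4 (T1 = 1); since the MSB nibble is
      // zero the counts are added, giving the 5 leading zeros of 0x04.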
6003       assert(VM_Version::supports_avx512bw(), "");
6004       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6005       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6006       vpand(xtmp2, dst, src, vec_enc);
6007       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6008       vpsrlw(xtmp3, src, 4, vec_enc);
6009       vpand(xtmp3, dst, xtmp3, vec_enc);
6010       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6011       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6012       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6013       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6014       break;
6015     default:
6016       fatal("Unsupported type %s", type2name(bt));
6017       break;
6018   }
6019 }
6020 
6021 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6022                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6023   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6024   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6025   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6026   // accessing the lookup table.
6027   vpand(dst, xtmp2, src, vec_enc);
6028   vpshufb(dst, xtmp1, dst, vec_enc);
6029   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6030   // accessing the lookup table.
6031   vpsrlw(xtmp3, src, 4, vec_enc);
6032   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6033   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6034   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6035   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6036   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6037   vpaddb(dst, dst, xtmp2, vec_enc);
6038   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6039 }
6040 
6041 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6042                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6043   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6044   // Add zero counts of lower byte and upper byte of a word if
6045   // upper byte holds a zero value.
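  // For example, for the short value 0x0013 the upper byte contributes 8 leading zeros
  // and, since that byte is zero, the lower byte's count of 3 (for 0x13) is added as
  // well, giving a total of 11 leading zeros.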
6046   vpsrlw(xtmp3, src, 8, vec_enc);
6047   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6048   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6049   vpsllw(xtmp2, dst, 8, vec_enc);
6050   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6051   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6052   vpsrlw(dst, dst, 8, vec_enc);
6053 }
6054 
6055 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6056                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent of the converted value can be used to compute the leading
  // zero count as per the following formula:
  // LZCNT = 32 - ((biased_exp - 127) + 1)
  // Special handling has been introduced for zero, max_int and negative source values.
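  // For example, a source lane holding 8 converts to 8.0f with biased exponent 130,
  // so LZCNT = 32 - ((130 - 127) + 1) = 28, matching lzcnt(8) for a 32 bit value.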
6062 
6063   // Broadcast 0xFF
6064   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6065   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6066 
6067   // Extract biased exponent.
6068   vcvtdq2ps(dst, src, vec_enc);
6069   vpsrld(dst, dst, 23, vec_enc);
6070   vpand(dst, dst, xtmp1, vec_enc);
6071 
6072   // Broadcast 127.
6073   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6074   // Exponent = biased_exp - 127
6075   vpsubd(dst, dst, xtmp1, vec_enc);
6076 
6077   // Exponent = Exponent  + 1
6078   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6079   vpaddd(dst, dst, xtmp3, vec_enc);
6080 
6081   // Replace -ve exponent with zero, exponent is -ve when src
6082   // lane contains a zero value.
6083   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6084   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6085 
6086   // Rematerialize broadcast 32.
6087   vpslld(xtmp1, xtmp3, 5, vec_enc);
6088   // Exponent is 32 if corresponding source lane contains max_int value.
6089   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6090   // LZCNT = 32 - exponent
6091   vpsubd(dst, xtmp1, dst, vec_enc);
6092 
6093   // Replace LZCNT with a value 1 if corresponding source lane
6094   // contains max_int value.
6095   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6096 
6097   // Replace biased_exp with 0 if source lane value is less than zero.
6098   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6099   vblendvps(dst, dst, xtmp2, src, vec_enc);
6100 }
6101 
6102 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6103                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6104   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6105   // Add zero counts of lower word and upper word of a double word if
6106   // upper word holds a zero value.
6107   vpsrld(xtmp3, src, 16, vec_enc);
6108   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6109   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6110   vpslld(xtmp2, dst, 16, vec_enc);
6111   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6112   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6113   vpsrld(dst, dst, 16, vec_enc);
6114   // Add zero counts of lower doubleword and upper doubleword of a
6115   // quadword if upper doubleword holds a zero value.
6116   vpsrlq(xtmp3, src, 32, vec_enc);
6117   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6118   vpsllq(xtmp2, dst, 32, vec_enc);
6119   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6120   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6121   vpsrlq(dst, dst, 32, vec_enc);
6122 }
6123 
6124 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6125                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6126                                                        Register rtmp, int vec_enc) {
6127   assert(is_integral_type(bt), "unexpected type");
6128   assert(vec_enc < Assembler::AVX_512bit, "");
6129   switch(bt) {
6130     case T_LONG:
6131       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6132       break;
6133     case T_INT:
6134       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6135       break;
6136     case T_SHORT:
6137       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6138       break;
6139     case T_BYTE:
6140       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6141       break;
6142     default:
6143       fatal("Unsupported type %s", type2name(bt));
6144       break;
6145   }
6146 }
6147 
6148 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6149   switch(bt) {
6150     case T_BYTE:
6151       vpsubb(dst, src1, src2, vec_enc);
6152       break;
6153     case T_SHORT:
6154       vpsubw(dst, src1, src2, vec_enc);
6155       break;
6156     case T_INT:
6157       vpsubd(dst, src1, src2, vec_enc);
6158       break;
6159     case T_LONG:
6160       vpsubq(dst, src1, src2, vec_enc);
6161       break;
6162     default:
6163       fatal("Unsupported type %s", type2name(bt));
6164       break;
6165   }
6166 }
6167 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
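// For example, x = 40 = 0b101000 (as a 32 bit int): (x - 1) & ~x = 0b000111,
// CLZ(0b000111) = 29, so CTZ = 32 - 29 = 3, the number of trailing zeros in 40.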
6172 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6173                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6174                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6175   assert(is_integral_type(bt), "");
6176   // xtmp = -1
6177   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6178   // xtmp = xtmp + src
6179   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6180   // xtmp = xtmp & ~src
6181   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6182   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6183   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6184   vpsub(bt, dst, xtmp4, dst, vec_enc);
6185 }
6186 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation.
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
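// For example, x = 40 = 0b101000 (as a 32 bit int): x | -x sets bit 3 and everything
// above it, so POPC(x | -x) = 29 and CTZ = 32 - 29 = 3.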
6189 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6190                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6191   assert(is_integral_type(bt), "");
6192   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6194   // xtmp = 0 - src
6195   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6196   // xtmp = xtmp | src
6197   vpor(xtmp3, xtmp3, src, vec_enc);
6198   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6199   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6200   vpsub(bt, dst, xtmp1, dst, vec_enc);
6201 }
6202 
6203 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6204   Label done;
6205   Label neg_divisor_fastpath;
6206   cmpl(divisor, 0);
6207   jccb(Assembler::less, neg_divisor_fastpath);
6208   xorl(rdx, rdx);
6209   divl(divisor);
6210   jmpb(done);
6211   bind(neg_divisor_fastpath);
6212   // Fastpath for divisor < 0:
6213   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6214   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
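  // A divisor that is negative as a signed int is >= 2^31 unsigned, so the unsigned
  // quotient can only be 0 or 1. For example, dividend = 0xF0000000 and
  // divisor = 0x90000000: dividend - divisor = 0x60000000, ~0x60000000 = 0x9FFFFFFF,
  // AND with the dividend gives 0x90000000, and the logical shift right by 31 yields
  // the quotient 1.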
6215   movl(rdx, rax);
6216   subl(rdx, divisor);
6217   if (VM_Version::supports_bmi1()) {
6218     andnl(rax, rdx, rax);
6219   } else {
6220     notl(rdx);
6221     andl(rax, rdx);
6222   }
6223   shrl(rax, 31);
6224   bind(done);
6225 }
6226 
6227 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6228   Label done;
6229   Label neg_divisor_fastpath;
6230   cmpl(divisor, 0);
6231   jccb(Assembler::less, neg_divisor_fastpath);
6232   xorl(rdx, rdx);
6233   divl(divisor);
6234   jmpb(done);
6235   bind(neg_divisor_fastpath);
6236   // Fastpath when divisor < 0:
6237   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6238   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
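  // For example, dividend = 0xF0000000 and divisor = 0x90000000: the inner term
  // dividend & ~(dividend - divisor) is 0x90000000; the arithmetic shift by 31 turns
  // it into all ones, AND-ing with the divisor leaves 0x90000000, and
  // dividend - 0x90000000 = 0x60000000 is the unsigned remainder.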
6239   movl(rdx, rax);
6240   subl(rax, divisor);
6241   if (VM_Version::supports_bmi1()) {
6242     andnl(rax, rax, rdx);
6243   } else {
6244     notl(rax);
6245     andl(rax, rdx);
6246   }
6247   sarl(rax, 31);
6248   andl(rax, divisor);
6249   subl(rdx, rax);
6250   bind(done);
6251 }
6252 
6253 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6254   Label done;
6255   Label neg_divisor_fastpath;
6256 
6257   cmpl(divisor, 0);
6258   jccb(Assembler::less, neg_divisor_fastpath);
6259   xorl(rdx, rdx);
6260   divl(divisor);
6261   jmpb(done);
6262   bind(neg_divisor_fastpath);
6263   // Fastpath for divisor < 0:
6264   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6265   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6266   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6267   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6268   movl(rdx, rax);
6269   subl(rax, divisor);
6270   if (VM_Version::supports_bmi1()) {
6271     andnl(rax, rax, rdx);
6272   } else {
6273     notl(rax);
6274     andl(rax, rdx);
6275   }
6276   movl(tmp, rax);
6277   shrl(rax, 31); // quotient
6278   sarl(tmp, 31);
6279   andl(tmp, divisor);
6280   subl(rdx, tmp); // remainder
6281   bind(done);
6282 }
6283 
6284 #ifdef _LP64
6285 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6286                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
6289     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6290     mov64(rtmp, 0x8040201008040201L);
6291     movq(xtmp1, src);
6292     movq(xtmp2, rtmp);
6293     gf2p8affineqb(xtmp1, xtmp2, 0);
6294     movq(dst, xtmp1);
6295   } else {
6296     // Swap even and odd numbered bits.
6297     movl(rtmp, src);
6298     andl(rtmp, 0x55555555);
6299     shll(rtmp, 1);
6300     movl(dst, src);
6301     andl(dst, 0xAAAAAAAA);
6302     shrl(dst, 1);
6303     orl(dst, rtmp);
6304 
6305     // Swap LSB and MSB 2 bits of each nibble.
6306     movl(rtmp, dst);
6307     andl(rtmp, 0x33333333);
6308     shll(rtmp, 2);
6309     andl(dst, 0xCCCCCCCC);
6310     shrl(dst, 2);
6311     orl(dst, rtmp);
6312 
6313     // Swap LSB and MSB 4 bits of each byte.
6314     movl(rtmp, dst);
6315     andl(rtmp, 0x0F0F0F0F);
6316     shll(rtmp, 4);
6317     andl(dst, 0xF0F0F0F0);
6318     shrl(dst, 4);
6319     orl(dst, rtmp);
6320   }
6321   bswapl(dst);
6322 }
6323 
6324 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6325                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
6328     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6329     mov64(rtmp1, 0x8040201008040201L);
6330     movq(xtmp1, src);
6331     movq(xtmp2, rtmp1);
6332     gf2p8affineqb(xtmp1, xtmp2, 0);
6333     movq(dst, xtmp1);
6334   } else {
6335     // Swap even and odd numbered bits.
6336     movq(rtmp1, src);
6337     mov64(rtmp2, 0x5555555555555555L);
6338     andq(rtmp1, rtmp2);
6339     shlq(rtmp1, 1);
6340     movq(dst, src);
6341     notq(rtmp2);
6342     andq(dst, rtmp2);
6343     shrq(dst, 1);
6344     orq(dst, rtmp1);
6345 
6346     // Swap LSB and MSB 2 bits of each nibble.
6347     movq(rtmp1, dst);
6348     mov64(rtmp2, 0x3333333333333333L);
6349     andq(rtmp1, rtmp2);
6350     shlq(rtmp1, 2);
6351     notq(rtmp2);
6352     andq(dst, rtmp2);
6353     shrq(dst, 2);
6354     orq(dst, rtmp1);
6355 
6356     // Swap LSB and MSB 4 bits of each byte.
6357     movq(rtmp1, dst);
6358     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6359     andq(rtmp1, rtmp2);
6360     shlq(rtmp1, 4);
6361     notq(rtmp2);
6362     andq(dst, rtmp2);
6363     shrq(dst, 4);
6364     orq(dst, rtmp1);
6365   }
6366   bswapq(dst);
6367 }
6368 
6369 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6370   Label done;
6371   Label neg_divisor_fastpath;
6372   cmpq(divisor, 0);
6373   jccb(Assembler::less, neg_divisor_fastpath);
6374   xorl(rdx, rdx);
6375   divq(divisor);
6376   jmpb(done);
6377   bind(neg_divisor_fastpath);
6378   // Fastpath for divisor < 0:
6379   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6380   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6381   movq(rdx, rax);
6382   subq(rdx, divisor);
6383   if (VM_Version::supports_bmi1()) {
6384     andnq(rax, rdx, rax);
6385   } else {
6386     notq(rdx);
6387     andq(rax, rdx);
6388   }
6389   shrq(rax, 63);
6390   bind(done);
6391 }
6392 
6393 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6394   Label done;
6395   Label neg_divisor_fastpath;
6396   cmpq(divisor, 0);
6397   jccb(Assembler::less, neg_divisor_fastpath);
6398   xorq(rdx, rdx);
6399   divq(divisor);
6400   jmp(done);
6401   bind(neg_divisor_fastpath);
6402   // Fastpath when divisor < 0:
6403   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6404   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6405   movq(rdx, rax);
6406   subq(rax, divisor);
6407   if (VM_Version::supports_bmi1()) {
6408     andnq(rax, rax, rdx);
6409   } else {
6410     notq(rax);
6411     andq(rax, rdx);
6412   }
6413   sarq(rax, 63);
6414   andq(rax, divisor);
6415   subq(rdx, rax);
6416   bind(done);
6417 }
6418 
6419 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6420   Label done;
6421   Label neg_divisor_fastpath;
6422   cmpq(divisor, 0);
6423   jccb(Assembler::less, neg_divisor_fastpath);
6424   xorq(rdx, rdx);
6425   divq(divisor);
6426   jmp(done);
6427   bind(neg_divisor_fastpath);
6428   // Fastpath for divisor < 0:
6429   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6430   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6431   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6432   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6433   movq(rdx, rax);
6434   subq(rax, divisor);
6435   if (VM_Version::supports_bmi1()) {
6436     andnq(rax, rax, rdx);
6437   } else {
6438     notq(rax);
6439     andq(rax, rdx);
6440   }
6441   movq(tmp, rax);
6442   shrq(rax, 63); // quotient
6443   sarq(tmp, 63);
6444   andq(tmp, divisor);
6445   subq(rdx, tmp); // remainder
6446   bind(done);
6447 }
6448 #endif
6449 
6450 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6451                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6452                                         int vlen_enc) {
6453   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all the multiples
  // of an index value are placed at the same relative position in a 128 bit
  // lane, i.e. shuffle indices 16, 32 and 48 all map to relative position 0
  // within their respective 128 bit lanes.
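  // For example, shuffle index 37 lies in [32, 48), so only the third comparison step
  // selects this lane; the third 128 bit source lane is broadcast there and
  // 37 & 0xF = 5 picks byte 5 of that lane.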
6460   movl(rtmp, 16);
6461   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6462 
6463   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6464   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6465   // original shuffle indices and move the shuffled lanes corresponding to true
6466   // mask to destination vector.
6467   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6468   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6469   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6470 
6471   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6472   // and broadcasting second 128 bit lane.
6473   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6474   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6475   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6476   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6477   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6478 
6479   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6480   // and broadcasting third 128 bit lane.
6481   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6482   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6483   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6484   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6485   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6486 
6487   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6489   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6490   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6491   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6492   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6493   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6494 }
6495 
6496 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6497                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6498   if (vlen_enc == AVX_128bit) {
6499     vpermilps(dst, src, shuffle, vlen_enc);
6500   } else if (bt == T_INT) {
6501     vpermd(dst, shuffle, src, vlen_enc);
6502   } else {
6503     assert(bt == T_FLOAT, "");
6504     vpermps(dst, shuffle, src, vlen_enc);
6505   }
6506 }
6507 
6508 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6509                                                      XMMRegister src2, int vlen_enc) {
6510   switch(elem_bt) {
6511     case T_BYTE:
6512       evpermi2b(dst, src1, src2, vlen_enc);
6513       break;
6514     case T_SHORT:
6515       evpermi2w(dst, src1, src2, vlen_enc);
6516       break;
6517     case T_INT:
6518       evpermi2d(dst, src1, src2, vlen_enc);
6519       break;
6520     case T_LONG:
6521       evpermi2q(dst, src1, src2, vlen_enc);
6522       break;
6523     case T_FLOAT:
6524       evpermi2ps(dst, src1, src2, vlen_enc);
6525       break;
6526     case T_DOUBLE:
6527       evpermi2pd(dst, src1, src2, vlen_enc);
6528       break;
6529     default:
6530       fatal("Unsupported type %s", type2name(elem_bt));
6531       break;
6532   }
6533 }