1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  54   if (C->clinit_barrier_on_entry()) {
  55     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  56     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  57 
  58     Label L_skip_barrier;
  59     Register klass = rscratch1;
  60 
  61     mov_metadata(klass, C->method()->holder()->constant_encoding());
  62     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  63 
  64     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  65 
  66     bind(L_skip_barrier);
  67   }
  68 
  69   int framesize = C->output()->frame_size_in_bytes();
  70   int bangsize = C->output()->bang_size_in_bytes();
  71   bool fp_mode_24b = false;
  72   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  73 
  74   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  75   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // but the frame allocation can be either 3 or 6 bytes. So if we don't do
  78   // stack bang then we must use the 6 byte frame allocation even if
  79   // we have no frame. :-(
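  // Illustrative prologue shapes emitted below (a sketch, not exact encodings):
  // with a stack bang we emit "<bang>; push rbp; sub rsp, #framesize", without
  // one we emit "sub rsp, #imm32; mov [rsp + #off], rbp", so the first
  // instruction is always at least 5 bytes and remains patchable.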
  80   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  81 
  82   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  83   // Remove word for return addr
  84   framesize -= wordSize;
  85   stack_bang_size -= wordSize;
  86 
  87   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  89   // some VM calls (such as call site linkage) can use several kilobytes of
  90   // stack.  But the stack safety zone should account for that.
  91   // See bugs 4446381, 4468289, 4497237.
  92   if (stack_bang_size > 0) {
  93     generate_stack_overflow_check(stack_bang_size);
  94 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  97     push(rbp);
  98     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  99     if (PreserveFramePointer) {
 100       mov(rbp, rsp);
 101     }
 102     // Remove word for ebp
 103     framesize -= wordSize;
 104 
 105     // Create frame
 106     if (framesize) {
 107       subptr(rsp, framesize);
 108     }
 109   } else {
 110     // Create frame (force generation of a 4 byte immediate value)
 111     subptr_imm32(rsp, framesize);
 112 
 113     // Save RBP register now.
 114     framesize -= wordSize;
 115     movptr(Address(rsp, framesize), rbp);
 116     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 117     if (PreserveFramePointer) {
 118       movptr(rbp, rsp);
 119       if (framesize > 0) {
 120         addptr(rbp, framesize);
 121       }
 122     }
 123   }
 124 
 125   if (C->needs_stack_repair()) {
 126     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 127     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 128     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 129   }
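  // Worked example (assuming wordSize == 8): with a fixed frame of 48 bytes
  // (framesize == 32 here, after removing the return address and rbp words) and
  // sp_inc == 16, the value 16 + 32 + 8 == 56 is stored at [rsp + 24], i.e. one
  // word below the saved rbp.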
 130 
 131   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 132     framesize -= wordSize;
 133     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 134   }
 135 
 136 #ifndef _LP64
 137   // If method sets FPU control word do it now
 138   if (fp_mode_24b) {
 139     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 140   }
 141   if (UseSSE >= 2 && VerifyFPU) {
 142     verify_FPU(0, "FPU stack must be clean on entry");
 143   }
 144 #endif
 145 
 146 #ifdef ASSERT
 147   if (VerifyStackAtCalls) {
 148     Label L;
 149     push(rax);
 150     mov(rax, rsp);
 151     andptr(rax, StackAlignmentInBytes-1);
 152     cmpptr(rax, StackAlignmentInBytes-wordSize);
 153     pop(rax);
 154     jcc(Assembler::equal, L);
 155     STOP("Stack is not properly aligned!");
 156     bind(L);
 157   }
 158 #endif
 159 }
 160 
 161 void C2_MacroAssembler::entry_barrier() {
 162   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 163 #ifdef _LP64
 164   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 165     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 166     Label dummy_slow_path;
 167     Label dummy_continuation;
 168     Label* slow_path = &dummy_slow_path;
 169     Label* continuation = &dummy_continuation;
 170     if (!Compile::current()->output()->in_scratch_emit_size()) {
 171       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 172       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 173       Compile::current()->output()->add_stub(stub);
 174       slow_path = &stub->entry();
 175       continuation = &stub->continuation();
 176     }
 177     bs->nmethod_entry_barrier(this, slow_path, continuation);
 178   }
 179 #else
 180   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 181   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 182 #endif
 183 }
 184 
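// Maps a vector length in bytes to the assembler's AVX length encoding: 4, 8 and
// 16 bytes all use AVX_128bit, 32 bytes uses AVX_256bit, and 64 bytes uses
// AVX_512bit.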
 185 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 186   switch (vlen_in_bytes) {
 187     case  4: // fall-through
 188     case  8: // fall-through
 189     case 16: return Assembler::AVX_128bit;
 190     case 32: return Assembler::AVX_256bit;
 191     case 64: return Assembler::AVX_512bit;
 192 
 193     default: {
 194       ShouldNotReachHere();
 195       return Assembler::AVX_NoVec;
 196     }
 197   }
 198 }
 199 
 200 // fast_lock and fast_unlock used by C2
 201 
 202 // Because the transitions from emitted code to the runtime
 203 // monitorenter/exit helper stubs are so slow it's critical that
 204 // we inline both the stack-locking fast path and the inflated fast path.
 205 //
 206 // See also: cmpFastLock and cmpFastUnlock.
 207 //
 208 // What follows is a specialized inline transliteration of the code
 209 // in enter() and exit(). If we're concerned about I$ bloat another
 210 // option would be to emit TrySlowEnter and TrySlowExit methods
 211 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 213 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 214 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 215 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// from branch mispredictions if the processor uses simple bimodal branch
// predictors keyed by EIP, since the helper routines would be called from
// multiple synchronization sites.
 220 //
 221 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 222 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 223 // to those specialized methods.  That'd give us a mostly platform-independent
 224 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 226 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 227 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 228 // (b) explicit barriers or fence operations.
 229 //
 230 // TODO:
 231 //
 232 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 233 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 234 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 235 //    the lock operators would typically be faster than reifying Self.
 236 //
 237 // *  Ideally I'd define the primitives as:
 238 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 239 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 240 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 242 //    Furthermore the register assignments are overconstrained, possibly resulting in
 243 //    sub-optimal code near the synchronization site.
 244 //
 245 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 246 //    Alternately, use a better sp-proximity test.
 247 //
 248 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 249 //    Either one is sufficient to uniquely identify a thread.
 250 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 251 //
 252 // *  Intrinsify notify() and notifyAll() for the common cases where the
 253 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 255 //
 256 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 257 //    But beware of excessive branch density on AMD Opterons.
 258 //
 259 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 260 //    or failure of the fast path.  If the fast path fails then we pass
 261 //    control to the slow path, typically in C.  In fast_lock and
 262 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 263 //    will emit a conditional branch immediately after the node.
 264 //    So we have branches to branches and lots of ICC.ZF games.
 265 //    Instead, it might be better to have C2 pass a "FailureLabel"
 266 //    into fast_lock and fast_unlock.  In the case of success, control
 267 //    will drop through the node.  ICC.ZF is undefined at exit.
 268 //    In the case of failure, the node will branch directly to the
 269 //    FailureLabel
 270 
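// Rough pseudo code for the LM_LEGACY fast path emitted by fast_lock below
// (a simplified sketch; success or failure is reported to C2 via ICC.ZF):
//
//   mark = obj->mark()
//   if (mark is inflated)                       goto IsInflated       // CAS m->owner
//   box->displaced_header = mark | unlocked     // anticipate CAS success
//   if (CAS(&obj->mark, mark | unlocked, box))  ZF = 1                // stack-locked
//   else if (mark - rsp < one page)             box->displaced_header = 0; ZF = 1  // recursive
//   else                                        ZF = 0                // take the slow path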
 271 
 272 // obj: object to lock
 273 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 275 // scr: tmp -- KILLED
 276 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 277                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 278                                  Metadata* method_data) {
 279   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 280   // Ensure the register assignments are disjoint
 281   assert(tmpReg == rax, "");
 282   assert(cx1Reg == noreg, "");
 283   assert(cx2Reg == noreg, "");
 284   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 285 
 286   // Possible cases that we'll encounter in fast_lock
 287   // ------------------------------------------------
 288   // * Inflated
 289   //    -- unlocked
 290   //    -- Locked
 291   //       = by self
 292   //       = by other
 293   // * neutral
 294   // * stack-locked
 295   //    -- by self
 296   //       = sp-proximity test hits
 297   //       = sp-proximity test generates false-negative
 298   //    -- by other
 299   //
 300 
 301   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 302 
 303   if (DiagnoseSyncOnValueBasedClasses != 0) {
 304     load_klass(tmpReg, objReg, scrReg);
 305     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 306     jcc(Assembler::notZero, DONE_LABEL);
 307   }
 308 
 309   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 310   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 311   jcc(Assembler::notZero, IsInflated);
 312 
 313   if (LockingMode == LM_MONITOR) {
 314     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 315     testptr(objReg, objReg);
 316   } else {
 317     assert(LockingMode == LM_LEGACY, "must be");
 318     // Attempt stack-locking ...
 319     orptr (tmpReg, markWord::unlocked_value);
 320     if (EnableValhalla) {
 321       // Mask inline_type bit such that we go to the slow path if object is an inline type
 322       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 323     }
 324     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 325     lock();
 326     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 327     jcc(Assembler::equal, COUNT);           // Success
 328 
 329     // Recursive locking.
 330     // The object is stack-locked: markword contains stack pointer to BasicLock.
 331     // Locked by current thread if difference with current SP is less than one page.
 332     subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 334     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 335     movptr(Address(boxReg, 0), tmpReg);
 336   }
 337   jmp(DONE_LABEL);
 338 
 339   bind(IsInflated);
 340   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 341 
 342 #ifndef _LP64
 343   // The object is inflated.
 344 
 345   // boxReg refers to the on-stack BasicLock in the current frame.
 346   // We'd like to write:
 347   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 349   // additional latency as we have another ST in the store buffer that must drain.
 350 
 351   // avoid ST-before-CAS
 352   // register juggle because we need tmpReg for cmpxchgptr below
 353   movptr(scrReg, boxReg);
 354   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 355 
 356   // Optimistic form: consider XORL tmpReg,tmpReg
 357   movptr(tmpReg, NULL_WORD);
 358 
 359   // Appears unlocked - try to swing _owner from null to non-null.
 360   // Ideally, I'd manifest "Self" with get_thread and then attempt
 361   // to CAS the register containing Self into m->Owner.
 362   // But we don't have enough registers, so instead we can either try to CAS
 363   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 364   // we later store "Self" into m->Owner.  Transiently storing a stack address
 365   // (rsp or the address of the box) into  m->owner is harmless.
 366   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 367   lock();
 368   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 369   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 370   // If we weren't able to swing _owner from null to the BasicLock
 371   // then take the slow path.
 372   jccb  (Assembler::notZero, NO_COUNT);
 373   // update _owner from BasicLock to thread
 374   get_thread (scrReg);                    // beware: clobbers ICCs
 375   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 376   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 377 
 378   // If the CAS fails we can either retry or pass control to the slow path.
 379   // We use the latter tactic.
 380   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 381   // If the CAS was successful ...
 382   //   Self has acquired the lock
 383   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 384   // Intentional fall-through into DONE_LABEL ...
 385 #else // _LP64
 386   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 387   movq(scrReg, tmpReg);
 388   xorq(tmpReg, tmpReg);
 389   lock();
 390   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 391   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 392   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 393   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 394   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 395   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 396 
 397   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 398   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 399   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 400   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 401 #endif // _LP64
 402   bind(DONE_LABEL);
 403 
 404   // ZFlag == 1 count in fast path
 405   // ZFlag == 0 count in slow path
 406   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 407 
 408   bind(COUNT);
 409   // Count monitors in fast path
 410   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 411 
 412   xorl(tmpReg, tmpReg); // Set ZF == 1
 413 
 414   bind(NO_COUNT);
 415 
 416   // At NO_COUNT the icc ZFlag is set as follows ...
 417   // fast_unlock uses the same protocol.
 418   // ZFlag == 1 -> Success
 419   // ZFlag == 0 -> Failure - force control through the slow path
 420 }
 421 
 422 // obj: object to unlock
 423 // box: box address (displaced header location), killed.  Must be EAX.
 424 // tmp: killed, cannot be obj nor box.
 425 //
 426 // Some commentary on balanced locking:
 427 //
 428 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 429 // Methods that don't have provably balanced locking are forced to run in the
 430 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 431 // The interpreter provides two properties:
 432 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 434 //      interpreter maintains an on-stack list of locks currently held by
 435 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 438 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 440 // B() doesn't have provably balanced locking so it runs in the interpreter.
 441 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 442 // is still locked by A().
 443 //
 444 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 445 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 446 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 447 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
 449 // could reasonably *avoid* checking owner in fast_unlock().
 450 // In the interest of performance we elide m->Owner==Self check in unlock.
 451 // A perfectly viable alternative is to elide the owner check except when
 452 // Xcheck:jni is enabled.
 453 
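// Rough pseudo code for the fast path emitted by fast_unlock below (a
// simplified sketch; as in fast_lock, ZF == 1 reports success to C2):
//
//   if (LM_LEGACY && box->displaced_header == 0)  ZF = 1        // recursive stack-lock
//   else if (obj->mark() is not inflated)         ZF = CAS(&obj->mark, box, displaced_header)
//   else  // inflated: try a 1-0 exit
//     release the monitor if recursions == 0 and cxq|EntryList is empty,
//     otherwise attempt the succession protocol below or take the slow path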
 454 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 455   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 456   assert(boxReg == rax, "");
 457   assert_different_registers(objReg, boxReg, tmpReg);
 458 
 459   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 460 
 461   if (LockingMode == LM_LEGACY) {
 462     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 463     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 464   }
 465   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 466   if (LockingMode != LM_MONITOR) {
 467     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 468     jcc(Assembler::zero, Stacked);
 469   }
 470 
 471   // It's inflated.
 472 
 473   // Despite our balanced locking property we still check that m->_owner == Self
 474   // as java routines or native JNI code called by this thread might
 475   // have released the lock.
 476   // Refer to the comments in synchronizer.cpp for how we might encode extra
 477   // state in _succ so we can avoid fetching EntryList|cxq.
 478   //
 479   // If there's no contention try a 1-0 exit.  That is, exit without
 480   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 481   // we detect and recover from the race that the 1-0 exit admits.
 482   //
 483   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 484   // before it STs null into _owner, releasing the lock.  Updates
 485   // to data protected by the critical section must be visible before
 486   // we drop the lock (and thus before any other thread could acquire
 487   // the lock and observe the fields protected by the lock).
 488   // IA32's memory-model is SPO, so STs are ordered with respect to
 489   // each other and there's no need for an explicit barrier (fence).
 490   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 491 #ifndef _LP64
 492   // Note that we could employ various encoding schemes to reduce
 493   // the number of loads below (currently 4) to just 2 or 3.
 494   // Refer to the comments in synchronizer.cpp.
 495   // In practice the chain of fetches doesn't seem to impact performance, however.
 496   xorptr(boxReg, boxReg);
 497   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 498   jccb  (Assembler::notZero, DONE_LABEL);
 499   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 500   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 501   jccb  (Assembler::notZero, DONE_LABEL);
 502   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 503   jmpb  (DONE_LABEL);
 504 #else // _LP64
 505   // It's inflated
 506   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 507 
 508   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 509   jccb(Assembler::equal, LNotRecursive);
 510 
 511   // Recursive inflated unlock
 512   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 513   jmpb(LSuccess);
 514 
 515   bind(LNotRecursive);
 516   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 517   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 518   jccb  (Assembler::notZero, CheckSucc);
 519   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 520   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 521   jmpb  (DONE_LABEL);
 522 
 523   // Try to avoid passing control into the slow_path ...
 524   bind  (CheckSucc);
 525 
 526   // The following optional optimization can be elided if necessary
 527   // Effectively: if (succ == null) goto slow path
 528   // The code reduces the window for a race, however,
 529   // and thus benefits performance.
 530   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 531   jccb  (Assembler::zero, LGoSlowPath);
 532 
 533   xorptr(boxReg, boxReg);
 534   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 535   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 536 
 537   // Memory barrier/fence
 538   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 539   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 540   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 541   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 542   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 543   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 544   lock(); addl(Address(rsp, 0), 0);
 545 
 546   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 547   jccb  (Assembler::notZero, LSuccess);
 548 
 549   // Rare inopportune interleaving - race.
 550   // The successor vanished in the small window above.
 551   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 552   // We need to ensure progress and succession.
 553   // Try to reacquire the lock.
 554   // If that fails then the new owner is responsible for succession and this
 555   // thread needs to take no further action and can exit via the fast path (success).
 556   // If the re-acquire succeeds then pass control into the slow path.
 557   // As implemented, this latter mode is horrible because we generated more
 558   // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.
 560 
 561   // box is really RAX -- the following CMPXCHG depends on that binding
 562   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 563   lock();
 564   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 565   // There's no successor so we tried to regrab the lock.
 566   // If that didn't work, then another thread grabbed the
 567   // lock so we're done (and exit was a success).
 568   jccb  (Assembler::notEqual, LSuccess);
 569   // Intentional fall-through into slow path
 570 
 571   bind  (LGoSlowPath);
 572   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 573   jmpb  (DONE_LABEL);
 574 
 575   bind  (LSuccess);
 576   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 577   jmpb  (DONE_LABEL);
 578 
 579 #endif
 580   if (LockingMode == LM_LEGACY) {
 581     bind  (Stacked);
 582     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 583     lock();
 584     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 585     // Intentional fall-thru into DONE_LABEL
 586   }
 587 
 588   bind(DONE_LABEL);
 589 
 590   // ZFlag == 1 count in fast path
 591   // ZFlag == 0 count in slow path
 592   jccb(Assembler::notZero, NO_COUNT);
 593 
 594   bind(COUNT);
 595   // Count monitors in fast path
 596 #ifndef _LP64
 597   get_thread(tmpReg);
 598   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 599 #else // _LP64
 600   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 601 #endif
 602 
 603   xorl(tmpReg, tmpReg); // Set ZF == 1
 604 
 605   bind(NO_COUNT);
 606 }
 607 
 608 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 609                                               Register t, Register thread) {
 610   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 611   assert(rax_reg == rax, "Used for CAS");
 612   assert_different_registers(obj, box, rax_reg, t, thread);
 613 
 614   // Handle inflated monitor.
 615   Label inflated;
 616   // Finish fast lock successfully. ZF value is irrelevant.
 617   Label locked;
 618   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 619   Label slow_path;
 620 
 621   if (UseObjectMonitorTable) {
 622     // Clear cache in case fast locking succeeds.
 623     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 624   }
 625 
 626   if (DiagnoseSyncOnValueBasedClasses != 0) {
 627     load_klass(rax_reg, obj, t);
 628     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 629     jcc(Assembler::notZero, slow_path);
 630   }
 631 
 632   const Register mark = t;
 633 
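  // Rough pseudo code for the lock-stack fast path below (a simplified sketch):
  //
  //   mark = obj->mark()
  //   if (mark is inflated)                       goto inflated
  //   if (lock-stack is full)                     goto slow_path
  //   if (lock-stack top == obj)                  goto push            // recursive
  //   if (!CAS(&obj->mark, mark | unlocked, mark & ~unlocked)) goto slow_path
  //  push:
  //   lock-stack.push(obj)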
 634   { // Lightweight Lock
 635 
 636     Label push;
 637 
 638     const Register top = UseObjectMonitorTable ? rax_reg : box;
 639 
 640     // Load the mark.
 641     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 642 
 643     // Prefetch top.
 644     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 645 
 646     // Check for monitor (0b10).
 647     testptr(mark, markWord::monitor_value);
 648     jcc(Assembler::notZero, inflated);
 649 
 650     // Check if lock-stack is full.
 651     cmpl(top, LockStack::end_offset() - 1);
 652     jcc(Assembler::greater, slow_path);
 653 
 654     // Check if recursive.
 655     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 656     jccb(Assembler::equal, push);
 657 
 658     // Try to lock. Transition lock bits 0b01 => 0b00
 659     movptr(rax_reg, mark);
 660     orptr(rax_reg, markWord::unlocked_value);
 661     andptr(mark, ~(int32_t)markWord::unlocked_value);
 662     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 663     jcc(Assembler::notEqual, slow_path);
 664 
 665     if (UseObjectMonitorTable) {
 666       // Need to reload top, clobbered by CAS.
 667       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 668     }
 669     bind(push);
 670     // After successful lock, push object on lock-stack.
 671     movptr(Address(thread, top), obj);
 672     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 673     jmpb(locked);
 674   }
 675 
 676   { // Handle inflated monitor.
 677     bind(inflated);
 678 
 679     const Register monitor = t;
 680 
 681     if (!UseObjectMonitorTable) {
 682       assert(mark == monitor, "should be the same here");
 683     } else {
 684       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 685       // Fetch ObjectMonitor* from the cache or take the slow-path.
 686       Label monitor_found;
 687 
 688       // Load cache address
 689       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 690 
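      // The om_cache is a short, null-terminated sequence of recently used oops
      // (each with its ObjectMonitor* reachable at a fixed offset): the first
      // num_unrolled entries are probed with straight-line compares, the rest
      // with the loop below.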
 691       const int num_unrolled = 2;
 692       for (int i = 0; i < num_unrolled; i++) {
 693         cmpptr(obj, Address(t));
 694         jccb(Assembler::equal, monitor_found);
 695         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 696       }
 697 
 698       Label loop;
 699 
 700       // Search for obj in cache.
 701       bind(loop);
 702 
 703       // Check for match.
 704       cmpptr(obj, Address(t));
 705       jccb(Assembler::equal, monitor_found);
 706 
 707       // Search until null encountered, guaranteed _null_sentinel at end.
 708       cmpptr(Address(t), 1);
 709       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 710       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 711       jmpb(loop);
 712 
 713       // Cache hit.
 714       bind(monitor_found);
 715       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 716     }
 717     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 718     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 719     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 720 
 721     Label monitor_locked;
 722     // Lock the monitor.
 723 
 724     // CAS owner (null => current thread).
 725     xorptr(rax_reg, rax_reg);
 726     lock(); cmpxchgptr(thread, owner_address);
 727     jccb(Assembler::equal, monitor_locked);
 728 
 729     // Check if recursive.
 730     cmpptr(thread, rax_reg);
 731     jccb(Assembler::notEqual, slow_path);
 732 
 733     // Recursive.
 734     increment(recursions_address);
 735 
 736     bind(monitor_locked);
 737     if (UseObjectMonitorTable) {
 738       // Cache the monitor for unlock
 739       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 740     }
 741   }
 742 
 743   bind(locked);
 744   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 745   // Set ZF = 1
 746   xorl(rax_reg, rax_reg);
 747 
 748 #ifdef ASSERT
 749   // Check that locked label is reached with ZF set.
 750   Label zf_correct;
 751   Label zf_bad_zero;
 752   jcc(Assembler::zero, zf_correct);
 753   jmp(zf_bad_zero);
 754 #endif
 755 
 756   bind(slow_path);
 757 #ifdef ASSERT
 758   // Check that slow_path label is reached with ZF not set.
 759   jcc(Assembler::notZero, zf_correct);
 760   stop("Fast Lock ZF != 0");
 761   bind(zf_bad_zero);
 762   stop("Fast Lock ZF != 1");
 763   bind(zf_correct);
 764 #endif
 765   // C2 uses the value of ZF to determine the continuation.
 766 }
 767 
 768 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 769   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 770   assert(reg_rax == rax, "Used for CAS");
 771   assert_different_registers(obj, reg_rax, t);
 772 
 773   // Handle inflated monitor.
 774   Label inflated, inflated_check_lock_stack;
 775   // Finish fast unlock successfully.  MUST jump with ZF == 1
 776   Label unlocked;
 777 
 778   // Assume success.
 779   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
 780 
 781   const Register mark = t;
 782   const Register monitor = t;
 783   const Register top = UseObjectMonitorTable ? t : reg_rax;
 784   const Register box = reg_rax;
 785 
 786   Label dummy;
 787   C2FastUnlockLightweightStub* stub = nullptr;
 788 
 789   if (!Compile::current()->output()->in_scratch_emit_size()) {
 790     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 791     Compile::current()->output()->add_stub(stub);
 792   }
 793 
 794   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 795   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
 796   Label& slow_path = stub == nullptr ? dummy : stub->slow_path();
 797 
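  // Rough pseudo code for the lock-stack unlock fast path below (a simplified sketch):
  //
  //   if (lock-stack top != obj)                  goto inflated_check_lock_stack
  //   lock-stack.pop()
  //   if (new lock-stack top == obj)              succeed              // recursive
  //   if (CAS(&obj->mark, mark & ~lock_mask, mark | unlocked)) succeed
  //   else                                        re-push obj and take the slow path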
 798   { // Lightweight Unlock
 799 
 800     // Load top.
 801     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 802 
 803     if (!UseObjectMonitorTable) {
 804       // Prefetch mark.
 805       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 806     }
 807 
 808     // Check if obj is top of lock-stack.
 809     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 810     // Top of lock stack was not obj. Must be monitor.
 811     jcc(Assembler::notEqual, inflated_check_lock_stack);
 812 
 813     // Pop lock-stack.
 814     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 815     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 816 
 817     // Check if recursive.
 818     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 819     jcc(Assembler::equal, unlocked);
 820 
 821     // We elide the monitor check, let the CAS fail instead.
 822 
 823     if (UseObjectMonitorTable) {
 824       // Load mark.
 825       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 826     }
 827 
 828     // Try to unlock. Transition lock bits 0b00 => 0b01
 829     movptr(reg_rax, mark);
 830     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 831     orptr(mark, markWord::unlocked_value);
 832     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 833     jcc(Assembler::notEqual, push_and_slow_path);
 834     jmp(unlocked);
 835   }
 836 
 837 
 838   { // Handle inflated monitor.
 839     bind(inflated_check_lock_stack);
 840 #ifdef ASSERT
 841     Label check_done;
 842     subl(top, oopSize);
 843     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 844     jcc(Assembler::below, check_done);
 845     cmpptr(obj, Address(thread, top));
 846     jccb(Assembler::notEqual, inflated_check_lock_stack);
 847     stop("Fast Unlock lock on stack");
 848     bind(check_done);
 849     if (UseObjectMonitorTable) {
 850       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 851     }
 852     testptr(mark, markWord::monitor_value);
 853     jccb(Assembler::notZero, inflated);
 854     stop("Fast Unlock not monitor");
 855 #endif
 856 
 857     bind(inflated);
 858 
 859     if (!UseObjectMonitorTable) {
 860       assert(mark == monitor, "should be the same here");
 861     } else {
 862       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 863       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 864       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 865       cmpptr(monitor, alignof(ObjectMonitor*));
 866       jcc(Assembler::below, slow_path);
 867     }
 868     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 869     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 870     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 871     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 872     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 873 
 874     Label recursive;
 875 
 876     // Check if recursive.
 877     cmpptr(recursions_address, 0);
 878     jccb(Assembler::notEqual, recursive);
 879 
 880     // Check if the entry lists are empty.
 881     movptr(reg_rax, cxq_address);
 882     orptr(reg_rax, EntryList_address);
 883     jcc(Assembler::notZero, check_successor);
 884 
 885     // Release lock.
 886     movptr(owner_address, NULL_WORD);
 887     jmpb(unlocked);
 888 
 889     // Recursive unlock.
 890     bind(recursive);
 891     decrement(recursions_address);
 892     xorl(t, t);
 893   }
 894 
 895   bind(unlocked);
 896   if (stub != nullptr) {
 897     bind(stub->unlocked_continuation());
 898   }
 899 
 900 #ifdef ASSERT
 901   // Check that unlocked label is reached with ZF set.
 902   Label zf_correct;
 903   jcc(Assembler::zero, zf_correct);
 904   stop("Fast Unlock ZF != 1");
 905 #endif
 906 
 907   if (stub != nullptr) {
 908     bind(stub->slow_path_continuation());
 909   }
 910 #ifdef ASSERT
 911   // Check that stub->continuation() label is reached with ZF not set.
 912   jccb(Assembler::notZero, zf_correct);
 913   stop("Fast Unlock ZF != 0");
 914   bind(zf_correct);
 915 #endif
 916   // C2 uses the value of ZF to determine the continuation.
 917 }
 918 
 919 //-------------------------------------------------------------------------------------------
 920 // Generic instructions support for use in .ad files C2 code generation
 921 
 922 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 923   if (dst != src) {
 924     movdqu(dst, src);
 925   }
 926   if (opcode == Op_AbsVD) {
 927     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 928   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 930     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 931   }
 932 }
 933 
 934 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 935   if (opcode == Op_AbsVD) {
 936     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 937   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 939     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 940   }
 941 }
 942 
 943 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 944   if (dst != src) {
 945     movdqu(dst, src);
 946   }
 947   if (opcode == Op_AbsVF) {
 948     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 949   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 951     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 952   }
 953 }
 954 
 955 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 956   if (opcode == Op_AbsVF) {
 957     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 958   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 960     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 961   }
 962 }
 963 
 964 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 965   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 966   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 967 
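  // Note: there is no SSE packed min/max for T_LONG, so the fallback below builds
  // a mask with pcmpgtq and selects with blendvpd, whose non-AVX (SSE4.1) form
  // reads xmm0 as an implicit mask register; that is why tmp must be xmm0.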
 968   if (opcode == Op_MinV) {
 969     if (elem_bt == T_BYTE) {
 970       pminsb(dst, src);
 971     } else if (elem_bt == T_SHORT) {
 972       pminsw(dst, src);
 973     } else if (elem_bt == T_INT) {
 974       pminsd(dst, src);
 975     } else {
 976       assert(elem_bt == T_LONG, "required");
 977       assert(tmp == xmm0, "required");
 978       assert_different_registers(dst, src, tmp);
 979       movdqu(xmm0, dst);
 980       pcmpgtq(xmm0, src);
 981       blendvpd(dst, src);  // xmm0 as mask
 982     }
 983   } else { // opcode == Op_MaxV
 984     if (elem_bt == T_BYTE) {
 985       pmaxsb(dst, src);
 986     } else if (elem_bt == T_SHORT) {
 987       pmaxsw(dst, src);
 988     } else if (elem_bt == T_INT) {
 989       pmaxsd(dst, src);
 990     } else {
 991       assert(elem_bt == T_LONG, "required");
 992       assert(tmp == xmm0, "required");
 993       assert_different_registers(dst, src, tmp);
 994       movdqu(xmm0, src);
 995       pcmpgtq(xmm0, dst);
 996       blendvpd(dst, src);  // xmm0 as mask
 997     }
 998   }
 999 }
1000 
1001 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1002                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1003                                  int vlen_enc) {
1004   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1005 
1006   if (opcode == Op_MinV) {
1007     if (elem_bt == T_BYTE) {
1008       vpminsb(dst, src1, src2, vlen_enc);
1009     } else if (elem_bt == T_SHORT) {
1010       vpminsw(dst, src1, src2, vlen_enc);
1011     } else if (elem_bt == T_INT) {
1012       vpminsd(dst, src1, src2, vlen_enc);
1013     } else {
1014       assert(elem_bt == T_LONG, "required");
1015       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1016         vpminsq(dst, src1, src2, vlen_enc);
1017       } else {
1018         assert_different_registers(dst, src1, src2);
1019         vpcmpgtq(dst, src1, src2, vlen_enc);
1020         vblendvpd(dst, src1, src2, dst, vlen_enc);
1021       }
1022     }
1023   } else { // opcode == Op_MaxV
1024     if (elem_bt == T_BYTE) {
1025       vpmaxsb(dst, src1, src2, vlen_enc);
1026     } else if (elem_bt == T_SHORT) {
1027       vpmaxsw(dst, src1, src2, vlen_enc);
1028     } else if (elem_bt == T_INT) {
1029       vpmaxsd(dst, src1, src2, vlen_enc);
1030     } else {
1031       assert(elem_bt == T_LONG, "required");
1032       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1033         vpmaxsq(dst, src1, src2, vlen_enc);
1034       } else {
1035         assert_different_registers(dst, src1, src2);
1036         vpcmpgtq(dst, src1, src2, vlen_enc);
1037         vblendvpd(dst, src2, src1, dst, vlen_enc);
1038       }
1039     }
1040   }
1041 }
1042 
1043 // Float/Double min max
1044 
1045 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1046                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1047                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1048                                    int vlen_enc) {
1049   assert(UseAVX > 0, "required");
1050   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1051          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1052   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1053   assert_different_registers(a, tmp, atmp, btmp);
1054   assert_different_registers(b, tmp, atmp, btmp);
1055 
1056   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1057   bool is_double_word = is_double_word_type(elem_bt);
1058 
1059   /* Note on 'non-obvious' assembly sequence:
1060    *
1061    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1062    * and Java on how they handle floats:
1063    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1065    *
1066    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1067    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1068    *                (only useful when signs differ, noop otherwise)
1069    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
1071    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1072    *   btmp = (b < +0.0) ? a : b
1073    *   atmp = (b < +0.0) ? b : a
1074    *   Tmp  = Max_Float(atmp , btmp)
1075    *   Res  = (atmp == NaN) ? atmp : Tmp
1076    */
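  // Worked example (max, T_FLOAT): for a == +0.0f and b == -0.0f the sign of b
  // routes b into atmp and a into btmp, so vmaxps(atmp, btmp) returns its second
  // operand, +0.0f, matching Java's Math.max. A NaN propagates either through
  // vmaxps (NaN in the second operand) or through the final UNORD_Q blend.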
1077 
1078   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1079   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1080   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1081   XMMRegister mask;
1082 
1083   if (!is_double_word && is_min) {
1084     mask = a;
1085     vblend = &MacroAssembler::vblendvps;
1086     vmaxmin = &MacroAssembler::vminps;
1087     vcmp = &MacroAssembler::vcmpps;
1088   } else if (!is_double_word && !is_min) {
1089     mask = b;
1090     vblend = &MacroAssembler::vblendvps;
1091     vmaxmin = &MacroAssembler::vmaxps;
1092     vcmp = &MacroAssembler::vcmpps;
1093   } else if (is_double_word && is_min) {
1094     mask = a;
1095     vblend = &MacroAssembler::vblendvpd;
1096     vmaxmin = &MacroAssembler::vminpd;
1097     vcmp = &MacroAssembler::vcmppd;
1098   } else {
1099     assert(is_double_word && !is_min, "sanity");
1100     mask = b;
1101     vblend = &MacroAssembler::vblendvpd;
1102     vmaxmin = &MacroAssembler::vmaxpd;
1103     vcmp = &MacroAssembler::vcmppd;
1104   }
1105 
1106   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1107   XMMRegister maxmin, scratch;
1108   if (dst == btmp) {
1109     maxmin = btmp;
1110     scratch = tmp;
1111   } else {
1112     maxmin = tmp;
1113     scratch = btmp;
1114   }
1115 
1116   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1117   if (precompute_mask && !is_double_word) {
1118     vpsrad(tmp, mask, 32, vlen_enc);
1119     mask = tmp;
1120   } else if (precompute_mask && is_double_word) {
1121     vpxor(tmp, tmp, tmp, vlen_enc);
1122     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1123     mask = tmp;
1124   }
1125 
1126   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1127   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1128   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1129   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1130   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1131 }
1132 
1133 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1134                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1135                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1136                                     int vlen_enc) {
1137   assert(UseAVX > 2, "required");
1138   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1139          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1140   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1141   assert_different_registers(dst, a, atmp, btmp);
1142   assert_different_registers(dst, b, atmp, btmp);
1143 
1144   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1145   bool is_double_word = is_double_word_type(elem_bt);
1146   bool merge = true;
1147 
1148   if (!is_double_word && is_min) {
1149     evpmovd2m(ktmp, a, vlen_enc);
1150     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1151     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1152     vminps(dst, atmp, btmp, vlen_enc);
1153     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1154     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1155   } else if (!is_double_word && !is_min) {
1156     evpmovd2m(ktmp, b, vlen_enc);
1157     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1158     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1159     vmaxps(dst, atmp, btmp, vlen_enc);
1160     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1161     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1162   } else if (is_double_word && is_min) {
1163     evpmovq2m(ktmp, a, vlen_enc);
1164     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1165     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1166     vminpd(dst, atmp, btmp, vlen_enc);
1167     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1168     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1169   } else {
1170     assert(is_double_word && !is_min, "sanity");
1171     evpmovq2m(ktmp, b, vlen_enc);
1172     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1173     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1174     vmaxpd(dst, atmp, btmp, vlen_enc);
1175     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1176     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1177   }
1178 }
1179 
1180 // Float/Double signum
1181 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1182   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1183 
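  // Math.signum semantics implemented below: return the argument unchanged for
  // +0.0/-0.0 and NaN, otherwise return +1.0 for positive inputs and -1.0 for
  // negative inputs (the -1.0 is produced by flipping the sign bit of +1.0).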
1184   Label DONE_LABEL;
1185 
1186   if (opcode == Op_SignumF) {
1187     assert(UseSSE > 0, "required");
1188     ucomiss(dst, zero);
1189     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1190     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1191     movflt(dst, one);
1192     jcc(Assembler::above, DONE_LABEL);
1193     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1194   } else if (opcode == Op_SignumD) {
1195     assert(UseSSE > 1, "required");
1196     ucomisd(dst, zero);
1197     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1198     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1199     movdbl(dst, one);
1200     jcc(Assembler::above, DONE_LABEL);
1201     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1202   }
1203 
1204   bind(DONE_LABEL);
1205 }
1206 
1207 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1208   if (sign) {
1209     pmovsxbw(dst, src);
1210   } else {
1211     pmovzxbw(dst, src);
1212   }
1213 }
1214 
1215 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1216   if (sign) {
1217     vpmovsxbw(dst, src, vector_len);
1218   } else {
1219     vpmovzxbw(dst, src, vector_len);
1220   }
1221 }
1222 
1223 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1224   if (sign) {
1225     vpmovsxbd(dst, src, vector_len);
1226   } else {
1227     vpmovzxbd(dst, src, vector_len);
1228   }
1229 }
1230 
1231 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1232   if (sign) {
1233     vpmovsxwd(dst, src, vector_len);
1234   } else {
1235     vpmovzxwd(dst, src, vector_len);
1236   }
1237 }
1238 
1239 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1240                                      int shift, int vector_len) {
1241   if (opcode == Op_RotateLeftV) {
1242     if (etype == T_INT) {
1243       evprold(dst, src, shift, vector_len);
1244     } else {
1245       assert(etype == T_LONG, "expected type T_LONG");
1246       evprolq(dst, src, shift, vector_len);
1247     }
1248   } else {
1249     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1250     if (etype == T_INT) {
1251       evprord(dst, src, shift, vector_len);
1252     } else {
1253       assert(etype == T_LONG, "expected type T_LONG");
1254       evprorq(dst, src, shift, vector_len);
1255     }
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1260                                      XMMRegister shift, int vector_len) {
1261   if (opcode == Op_RotateLeftV) {
1262     if (etype == T_INT) {
1263       evprolvd(dst, src, shift, vector_len);
1264     } else {
1265       assert(etype == T_LONG, "expected type T_LONG");
1266       evprolvq(dst, src, shift, vector_len);
1267     }
1268   } else {
1269     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1270     if (etype == T_INT) {
1271       evprorvd(dst, src, shift, vector_len);
1272     } else {
1273       assert(etype == T_LONG, "expected type T_LONG");
1274       evprorvq(dst, src, shift, vector_len);
1275     }
1276   }
1277 }
1278 
1279 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1280   if (opcode == Op_RShiftVI) {
1281     psrad(dst, shift);
1282   } else if (opcode == Op_LShiftVI) {
1283     pslld(dst, shift);
1284   } else {
1285     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1286     psrld(dst, shift);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1291   switch (opcode) {
1292     case Op_RShiftVI:  psrad(dst, shift); break;
1293     case Op_LShiftVI:  pslld(dst, shift); break;
1294     case Op_URShiftVI: psrld(dst, shift); break;
1295 
1296     default: assert(false, "%s", NodeClassNames[opcode]);
1297   }
1298 }
1299 
1300 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1301   if (opcode == Op_RShiftVI) {
1302     vpsrad(dst, nds, shift, vector_len);
1303   } else if (opcode == Op_LShiftVI) {
1304     vpslld(dst, nds, shift, vector_len);
1305   } else {
1306     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1307     vpsrld(dst, nds, shift, vector_len);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1312   switch (opcode) {
1313     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1314     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1315     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1316 
1317     default: assert(false, "%s", NodeClassNames[opcode]);
1318   }
1319 }
1320 
1321 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1322   switch (opcode) {
1323     case Op_RShiftVB:  // fall-through
1324     case Op_RShiftVS:  psraw(dst, shift); break;
1325 
1326     case Op_LShiftVB:  // fall-through
1327     case Op_LShiftVS:  psllw(dst, shift);   break;
1328 
1329     case Op_URShiftVS: // fall-through
1330     case Op_URShiftVB: psrlw(dst, shift);  break;
1331 
1332     default: assert(false, "%s", NodeClassNames[opcode]);
1333   }
1334 }
1335 
1336 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1337   switch (opcode) {
1338     case Op_RShiftVB:  // fall-through
1339     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1340 
1341     case Op_LShiftVB:  // fall-through
1342     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1343 
1344     case Op_URShiftVS: // fall-through
1345     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1352   switch (opcode) {
1353     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1354     case Op_LShiftVL:  psllq(dst, shift); break;
1355     case Op_URShiftVL: psrlq(dst, shift); break;
1356 
1357     default: assert(false, "%s", NodeClassNames[opcode]);
1358   }
1359 }
1360 
1361 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1362   if (opcode == Op_RShiftVL) {
1363     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1364   } else if (opcode == Op_LShiftVL) {
1365     psllq(dst, shift);
1366   } else {
1367     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1368     psrlq(dst, shift);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1373   switch (opcode) {
1374     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1375     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1376     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1377 
1378     default: assert(false, "%s", NodeClassNames[opcode]);
1379   }
1380 }
1381 
1382 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1383   if (opcode == Op_RShiftVL) {
1384     evpsraq(dst, nds, shift, vector_len);
1385   } else if (opcode == Op_LShiftVL) {
1386     vpsllq(dst, nds, shift, vector_len);
1387   } else {
1388     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1389     vpsrlq(dst, nds, shift, vector_len);
1390   }
1391 }
1392 
1393 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1394   switch (opcode) {
1395     case Op_RShiftVB:  // fall-through
1396     case Op_RShiftVS:  // fall-through
1397     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1398 
1399     case Op_LShiftVB:  // fall-through
1400     case Op_LShiftVS:  // fall-through
1401     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1402 
1403     case Op_URShiftVB: // fall-through
1404     case Op_URShiftVS: // fall-through
1405     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1406 
1407     default: assert(false, "%s", NodeClassNames[opcode]);
1408   }
1409 }
1410 
1411 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1412   switch (opcode) {
1413     case Op_RShiftVB:  // fall-through
1414     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1415 
1416     case Op_LShiftVB:  // fall-through
1417     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1418 
1419     case Op_URShiftVB: // fall-through
1420     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1421 
1422     default: assert(false, "%s", NodeClassNames[opcode]);
1423   }
1424 }
1425 
1426 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1427   assert(UseAVX >= 2, "required");
1428   switch (opcode) {
1429     case Op_RShiftVL: {
1430       if (UseAVX > 2) {
1431         assert(tmp == xnoreg, "not used");
1432         if (!VM_Version::supports_avx512vl()) {
1433           vlen_enc = Assembler::AVX_512bit;
1434         }
1435         evpsravq(dst, src, shift, vlen_enc);
1436       } else {
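        // AVX2 has no variable 64-bit arithmetic right shift, so emulate it:
        // with m = (sign_bit_mask >>> shift), ((x >>> shift) ^ m) - m
        // sign-extends the logically shifted value.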
1437         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1438         vpsrlvq(dst, src, shift, vlen_enc);
1439         vpsrlvq(tmp, tmp, shift, vlen_enc);
1440         vpxor(dst, dst, tmp, vlen_enc);
1441         vpsubq(dst, dst, tmp, vlen_enc);
1442       }
1443       break;
1444     }
1445     case Op_LShiftVL: {
1446       assert(tmp == xnoreg, "not used");
1447       vpsllvq(dst, src, shift, vlen_enc);
1448       break;
1449     }
1450     case Op_URShiftVL: {
1451       assert(tmp == xnoreg, "not used");
1452       vpsrlvq(dst, src, shift, vlen_enc);
1453       break;
1454     }
1455     default: assert(false, "%s", NodeClassNames[opcode]);
1456   }
1457 }
1458 
1459 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1460 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1461   assert(opcode == Op_LShiftVB ||
1462          opcode == Op_RShiftVB ||
1463          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1464   bool sign = (opcode != Op_URShiftVB);
1465   assert(vector_len == 0, "required");
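  // Widen bytes to dwords, perform the variable shift on dwords, mask each
  // lane back to a byte value, then pack the dwords down to words.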
1466   vextendbd(sign, dst, src, 1);
1467   vpmovzxbd(vtmp, shift, 1);
1468   varshiftd(opcode, dst, dst, vtmp, 1);
1469   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1470   vextracti128_high(vtmp, dst);
1471   vpackusdw(dst, dst, vtmp, 0);
1472 }
1473 
1474 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1475 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1476   assert(opcode == Op_LShiftVB ||
1477          opcode == Op_RShiftVB ||
1478          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1479   bool sign = (opcode != Op_URShiftVB);
1480   int ext_vector_len = vector_len + 1;
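  // Widen bytes to words, perform the variable shift on words, mask each lane
  // back to a byte value, then pack the words down to bytes (with a vpermq
  // fix-up of the lane order for the wider case).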
1481   vextendbw(sign, dst, src, ext_vector_len);
1482   vpmovzxbw(vtmp, shift, ext_vector_len);
1483   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1484   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1485   if (vector_len == 0) {
1486     vextracti128_high(vtmp, dst);
1487     vpackuswb(dst, dst, vtmp, vector_len);
1488   } else {
1489     vextracti64x4_high(vtmp, dst);
1490     vpackuswb(dst, dst, vtmp, vector_len);
1491     vpermq(dst, dst, 0xD8, vector_len);
1492   }
1493 }
1494 
1495 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1496   switch(typ) {
1497     case T_BYTE:
1498       pinsrb(dst, val, idx);
1499       break;
1500     case T_SHORT:
1501       pinsrw(dst, val, idx);
1502       break;
1503     case T_INT:
1504       pinsrd(dst, val, idx);
1505       break;
1506     case T_LONG:
1507       pinsrq(dst, val, idx);
1508       break;
1509     default:
1510       assert(false,"Should not reach here.");
1511       break;
1512   }
1513 }
1514 
1515 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1516   switch(typ) {
1517     case T_BYTE:
1518       vpinsrb(dst, src, val, idx);
1519       break;
1520     case T_SHORT:
1521       vpinsrw(dst, src, val, idx);
1522       break;
1523     case T_INT:
1524       vpinsrd(dst, src, val, idx);
1525       break;
1526     case T_LONG:
1527       vpinsrq(dst, src, val, idx);
1528       break;
1529     default:
1530       assert(false,"Should not reach here.");
1531       break;
1532   }
1533 }
1534 
1535 #ifdef _LP64
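// Gather one 64-bit slice (8 bytes or 4 shorts) into the low quad-word of dst:
// dst[i] = src[offset + idx_base[i]] for every element whose bit is set in
// 'mask' (starting at bit 'mask_idx'), 0 otherwise.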
1536 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1537                                                 XMMRegister dst, Register base,
1538                                                 Register idx_base,
1539                                                 Register offset, Register mask,
1540                                                 Register mask_idx, Register rtmp,
1541                                                 int vlen_enc) {
1542   vpxor(dst, dst, dst, vlen_enc);
1543   if (elem_bt == T_SHORT) {
1544     for (int i = 0; i < 4; i++) {
1545       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1546       Label skip_load;
1547       btq(mask, mask_idx);
1548       jccb(Assembler::carryClear, skip_load);
1549       movl(rtmp, Address(idx_base, i * 4));
1550       if (offset != noreg) {
1551         addl(rtmp, offset);
1552       }
1553       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1554       bind(skip_load);
1555       incq(mask_idx);
1556     }
1557   } else {
1558     assert(elem_bt == T_BYTE, "");
1559     for (int i = 0; i < 8; i++) {
1560       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1561       Label skip_load;
1562       btq(mask, mask_idx);
1563       jccb(Assembler::carryClear, skip_load);
1564       movl(rtmp, Address(idx_base, i * 4));
1565       if (offset != noreg) {
1566         addl(rtmp, offset);
1567       }
1568       pinsrb(dst, Address(base, rtmp), i);
1569       bind(skip_load);
1570       incq(mask_idx);
1571     }
1572   }
1573 }
1574 #endif // _LP64
1575 
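// Unmasked variant: gather one 64-bit slice (8 bytes or 4 shorts) into the
// low quad-word of dst, with dst[i] = src[offset + idx_base[i]].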
1576 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1577                                          Register base, Register idx_base,
1578                                          Register offset, Register rtmp,
1579                                          int vlen_enc) {
1580   vpxor(dst, dst, dst, vlen_enc);
1581   if (elem_bt == T_SHORT) {
1582     for (int i = 0; i < 4; i++) {
1583       // dst[i] = src[offset + idx_base[i]]
1584       movl(rtmp, Address(idx_base, i * 4));
1585       if (offset != noreg) {
1586         addl(rtmp, offset);
1587       }
1588       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1589     }
1590   } else {
1591     assert(elem_bt == T_BYTE, "");
1592     for (int i = 0; i < 8; i++) {
1593       // dst[i] = src[offset + idx_base[i]]
1594       movl(rtmp, Address(idx_base, i * 4));
1595       if (offset != noreg) {
1596         addl(rtmp, offset);
1597       }
1598       pinsrb(dst, Address(base, rtmp), i);
1599     }
1600   }
1601 }
1602 
1603 /*
1604  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1605  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1606  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1607  * permutation to place the slice into the appropriate vector lane
1608  * locations in the destination vector. The following pseudo code describes the
1609  * algorithm in detail:
1610  *
1611  * DST_VEC = ZERO_VEC
1612  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1613  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1614  * FOREACH_ITER:
1615  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1616  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1617  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1618  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1619  *
1620  * With each iteration, the doubleword permute indices (0, 1) corresponding
1621  * to the gathered quad-word get shifted right by two lane positions.
1622  *
1623  */
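// Example (T_BYTE, 32 elements, 256-bit destination): the loop below runs four
// times, gathering 8 bytes per iteration. At the start of iteration k (counting
// from 0) lanes 2k and 2k+1 of PERM_INDEX hold 0 and 1, so the permute routes
// the freshly gathered quad-word (doublewords 0 and 1 of TMP_VEC_64, the rest
// being zero) into destination doublewords 2k and 2k+1 before it is OR-ed into
// DST_VEC.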
1624 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1625                                         Register base, Register idx_base,
1626                                         Register offset, Register mask,
1627                                         XMMRegister xtmp1, XMMRegister xtmp2,
1628                                         XMMRegister temp_dst, Register rtmp,
1629                                         Register mask_idx, Register length,
1630                                         int vector_len, int vlen_enc) {
1631   Label GATHER8_LOOP;
1632   assert(is_subword_type(elem_ty), "");
1633   movl(length, vector_len);
1634   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1635   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1636   vallones(xtmp2, vlen_enc);
1637   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1638   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1639   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1640 
1641   bind(GATHER8_LOOP);
1642     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1643     if (mask == noreg) {
1644       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1645     } else {
1646       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1647     }
1648     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1649     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1650     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1651     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1652     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1653     vpor(dst, dst, temp_dst, vlen_enc);
1654     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1655     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1656     jcc(Assembler::notEqual, GATHER8_LOOP);
1657 }
1658 
1659 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1660   switch(typ) {
1661     case T_INT:
1662       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1663       break;
1664     case T_FLOAT:
1665       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1666       break;
1667     case T_LONG:
1668       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1669       break;
1670     case T_DOUBLE:
1671       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1672       break;
1673     default:
1674       assert(false,"Should not reach here.");
1675       break;
1676   }
1677 }
1678 
1679 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1680   switch(typ) {
1681     case T_INT:
1682       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1683       break;
1684     case T_FLOAT:
1685       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1686       break;
1687     case T_LONG:
1688       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1689       break;
1690     case T_DOUBLE:
1691       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1692       break;
1693     default:
1694       assert(false,"Should not reach here.");
1695       break;
1696   }
1697 }
1698 
1699 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1700   switch(typ) {
1701     case T_INT:
1702       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1703       break;
1704     case T_FLOAT:
1705       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1706       break;
1707     case T_LONG:
1708       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1709       break;
1710     case T_DOUBLE:
1711       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1712       break;
1713     default:
1714       assert(false,"Should not reach here.");
1715       break;
1716   }
1717 }
1718 
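// Expand a vector of byte-sized booleans (0/1 per element) into a vector mask
// whose elements are all-zero or all-ones at the requested element width:
// negate the bytes, then sign-extend them to elem_bt.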
1719 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1720   if (vlen_in_bytes <= 16) {
1721     pxor (dst, dst);
1722     psubb(dst, src);
1723     switch (elem_bt) {
1724       case T_BYTE:   /* nothing to do */ break;
1725       case T_SHORT:  pmovsxbw(dst, dst); break;
1726       case T_INT:    pmovsxbd(dst, dst); break;
1727       case T_FLOAT:  pmovsxbd(dst, dst); break;
1728       case T_LONG:   pmovsxbq(dst, dst); break;
1729       case T_DOUBLE: pmovsxbq(dst, dst); break;
1730 
1731       default: assert(false, "%s", type2name(elem_bt));
1732     }
1733   } else {
1734     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1735     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1736 
1737     vpxor (dst, dst, dst, vlen_enc);
1738     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1739 
1740     switch (elem_bt) {
1741       case T_BYTE:   /* nothing to do */            break;
1742       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1743       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1744       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1745       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1746       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1747 
1748       default: assert(false, "%s", type2name(elem_bt));
1749     }
1750   }
1751 }
1752 
1753 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1754   if (novlbwdq) {
1755     vpmovsxbd(xtmp, src, vlen_enc);
1756     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1757             Assembler::eq, true, vlen_enc, noreg);
1758   } else {
1759     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1760     vpsubb(xtmp, xtmp, src, vlen_enc);
1761     evpmovb2m(dst, xtmp, vlen_enc);
1762   }
1763 }
1764 
1765 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1766   switch (vlen_in_bytes) {
1767     case 4:  movdl(dst, src);   break;
1768     case 8:  movq(dst, src);    break;
1769     case 16: movdqu(dst, src);  break;
1770     case 32: vmovdqu(dst, src); break;
1771     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1772     default: ShouldNotReachHere();
1773   }
1774 }
1775 
1776 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1777   assert(rscratch != noreg || always_reachable(src), "missing");
1778 
1779   if (reachable(src)) {
1780     load_vector(dst, as_Address(src), vlen_in_bytes);
1781   } else {
1782     lea(rscratch, src);
1783     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1788   int vlen_enc = vector_length_encoding(vlen);
1789   if (VM_Version::supports_avx()) {
1790     if (bt == T_LONG) {
1791       if (VM_Version::supports_avx2()) {
1792         vpbroadcastq(dst, src, vlen_enc);
1793       } else {
1794         vmovddup(dst, src, vlen_enc);
1795       }
1796     } else if (bt == T_DOUBLE) {
1797       if (vlen_enc != Assembler::AVX_128bit) {
1798         vbroadcastsd(dst, src, vlen_enc, noreg);
1799       } else {
1800         vmovddup(dst, src, vlen_enc);
1801       }
1802     } else {
1803       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1804         vpbroadcastd(dst, src, vlen_enc);
1805       } else {
1806         vbroadcastss(dst, src, vlen_enc);
1807       }
1808     }
1809   } else if (VM_Version::supports_sse3()) {
1810     movddup(dst, src);
1811   } else {
1812     movq(dst, src);
1813     if (vlen == 16) {
1814       punpcklqdq(dst, dst);
1815     }
1816   }
1817 }
1818 
1819 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1820   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
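  // For example, T_INT resolves to exact_log2(4) << 6 == 128, and T_FLOAT to
  // 128 + 128 == 256.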
1821   int offset = exact_log2(type2aelembytes(bt)) << 6;
1822   if (is_floating_point_type(bt)) {
1823     offset += 128;
1824   }
1825   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1826   load_vector(dst, addr, vlen_in_bytes);
1827 }
1828 
1829 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
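// The integer reducers combine a scalar input (src1) with a vector (src2) by
// repeatedly folding the vector's upper half onto its lower half using the
// element-wise helpers below, leaving the scalar result in dst.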
1830 
1831 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1832   int vector_len = Assembler::AVX_128bit;
1833 
1834   switch (opcode) {
1835     case Op_AndReductionV:  pand(dst, src); break;
1836     case Op_OrReductionV:   por (dst, src); break;
1837     case Op_XorReductionV:  pxor(dst, src); break;
1838     case Op_MinReductionV:
1839       switch (typ) {
1840         case T_BYTE:        pminsb(dst, src); break;
1841         case T_SHORT:       pminsw(dst, src); break;
1842         case T_INT:         pminsd(dst, src); break;
1843         case T_LONG:        assert(UseAVX > 2, "required");
1844                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1845         default:            assert(false, "wrong type");
1846       }
1847       break;
1848     case Op_MaxReductionV:
1849       switch (typ) {
1850         case T_BYTE:        pmaxsb(dst, src); break;
1851         case T_SHORT:       pmaxsw(dst, src); break;
1852         case T_INT:         pmaxsd(dst, src); break;
1853         case T_LONG:        assert(UseAVX > 2, "required");
1854                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1855         default:            assert(false, "wrong type");
1856       }
1857       break;
1858     case Op_AddReductionVF: addss(dst, src); break;
1859     case Op_AddReductionVD: addsd(dst, src); break;
1860     case Op_AddReductionVI:
1861       switch (typ) {
1862         case T_BYTE:        paddb(dst, src); break;
1863         case T_SHORT:       paddw(dst, src); break;
1864         case T_INT:         paddd(dst, src); break;
1865         default:            assert(false, "wrong type");
1866       }
1867       break;
1868     case Op_AddReductionVL: paddq(dst, src); break;
1869     case Op_MulReductionVF: mulss(dst, src); break;
1870     case Op_MulReductionVD: mulsd(dst, src); break;
1871     case Op_MulReductionVI:
1872       switch (typ) {
1873         case T_SHORT:       pmullw(dst, src); break;
1874         case T_INT:         pmulld(dst, src); break;
1875         default:            assert(false, "wrong type");
1876       }
1877       break;
1878     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1879                             evpmullq(dst, dst, src, vector_len); break;
1880     default:                assert(false, "wrong opcode");
1881   }
1882 }
1883 
1884 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1885   switch (opcode) {
1886     case Op_AddReductionVF: addps(dst, src); break;
1887     case Op_AddReductionVD: addpd(dst, src); break;
1888     case Op_MulReductionVF: mulps(dst, src); break;
1889     case Op_MulReductionVD: mulpd(dst, src); break;
1890     default:                assert(false, "%s", NodeClassNames[opcode]);
1891   }
1892 }
1893 
1894 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1895   int vector_len = Assembler::AVX_256bit;
1896 
1897   switch (opcode) {
1898     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1899     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1900     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1901     case Op_MinReductionV:
1902       switch (typ) {
1903         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1904         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1905         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1906         case T_LONG:        assert(UseAVX > 2, "required");
1907                             vpminsq(dst, src1, src2, vector_len); break;
1908         default:            assert(false, "wrong type");
1909       }
1910       break;
1911     case Op_MaxReductionV:
1912       switch (typ) {
1913         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1914         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1915         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1916         case T_LONG:        assert(UseAVX > 2, "required");
1917                             vpmaxsq(dst, src1, src2, vector_len); break;
1918         default:            assert(false, "wrong type");
1919       }
1920       break;
1921     case Op_AddReductionVI:
1922       switch (typ) {
1923         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1924         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1925         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1926         default:            assert(false, "wrong type");
1927       }
1928       break;
1929     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1930     case Op_MulReductionVI:
1931       switch (typ) {
1932         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1933         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1934         default:            assert(false, "wrong type");
1935       }
1936       break;
1937     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1938     default:                assert(false, "wrong opcode");
1939   }
1940 }
1941 
1942 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1943   int vector_len = Assembler::AVX_256bit;
1944 
1945   switch (opcode) {
1946     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1947     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1948     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1949     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1950     default:                assert(false, "%s", NodeClassNames[opcode]);
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1955                                   XMMRegister dst, XMMRegister src,
1956                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1957   switch (opcode) {
1958     case Op_AddReductionVF:
1959     case Op_MulReductionVF:
1960       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1961       break;
1962 
1963     case Op_AddReductionVD:
1964     case Op_MulReductionVD:
1965       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1966       break;
1967 
1968     default: assert(false, "wrong opcode");
1969   }
1970 }
1971 
1972 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1973                                             XMMRegister dst, XMMRegister src,
1974                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1975   switch (opcode) {
1976     case Op_AddReductionVF:
1977     case Op_MulReductionVF:
1978       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1979       break;
1980 
1981     case Op_AddReductionVD:
1982     case Op_MulReductionVD:
1983       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1984       break;
1985 
1986     default: assert(false, "%s", NodeClassNames[opcode]);
1987   }
1988 }
1989 
1990 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1991                              Register dst, Register src1, XMMRegister src2,
1992                              XMMRegister vtmp1, XMMRegister vtmp2) {
1993   switch (vlen) {
1994     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1995     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1996     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1997     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1998 
1999     default: assert(false, "wrong vector length");
2000   }
2001 }
2002 
2003 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2004                              Register dst, Register src1, XMMRegister src2,
2005                              XMMRegister vtmp1, XMMRegister vtmp2) {
2006   switch (vlen) {
2007     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2008     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2009     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2010     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2011 
2012     default: assert(false, "wrong vector length");
2013   }
2014 }
2015 
2016 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2017                              Register dst, Register src1, XMMRegister src2,
2018                              XMMRegister vtmp1, XMMRegister vtmp2) {
2019   switch (vlen) {
2020     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2021     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2022     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2023     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2024 
2025     default: assert(false, "wrong vector length");
2026   }
2027 }
2028 
2029 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2030                              Register dst, Register src1, XMMRegister src2,
2031                              XMMRegister vtmp1, XMMRegister vtmp2) {
2032   switch (vlen) {
2033     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2034     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2035     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2036     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2037 
2038     default: assert(false, "wrong vector length");
2039   }
2040 }
2041 
2042 #ifdef _LP64
2043 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2044                              Register dst, Register src1, XMMRegister src2,
2045                              XMMRegister vtmp1, XMMRegister vtmp2) {
2046   switch (vlen) {
2047     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2048     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2049     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2050 
2051     default: assert(false, "wrong vector length");
2052   }
2053 }
2054 #endif // _LP64
2055 
2056 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2057   switch (vlen) {
2058     case 2:
2059       assert(vtmp2 == xnoreg, "");
2060       reduce2F(opcode, dst, src, vtmp1);
2061       break;
2062     case 4:
2063       assert(vtmp2 == xnoreg, "");
2064       reduce4F(opcode, dst, src, vtmp1);
2065       break;
2066     case 8:
2067       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2068       break;
2069     case 16:
2070       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2071       break;
2072     default: assert(false, "wrong vector length");
2073   }
2074 }
2075 
2076 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2077   switch (vlen) {
2078     case 2:
2079       assert(vtmp2 == xnoreg, "");
2080       reduce2D(opcode, dst, src, vtmp1);
2081       break;
2082     case 4:
2083       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2084       break;
2085     case 8:
2086       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2087       break;
2088     default: assert(false, "wrong vector length");
2089   }
2090 }
2091 
2092 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2093   switch (vlen) {
2094     case 2:
2095       assert(vtmp1 == xnoreg, "");
2096       assert(vtmp2 == xnoreg, "");
2097       unorderedReduce2F(opcode, dst, src);
2098       break;
2099     case 4:
2100       assert(vtmp2 == xnoreg, "");
2101       unorderedReduce4F(opcode, dst, src, vtmp1);
2102       break;
2103     case 8:
2104       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2105       break;
2106     case 16:
2107       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2108       break;
2109     default: assert(false, "wrong vector length");
2110   }
2111 }
2112 
2113 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2114   switch (vlen) {
2115     case 2:
2116       assert(vtmp1 == xnoreg, "");
2117       assert(vtmp2 == xnoreg, "");
2118       unorderedReduce2D(opcode, dst, src);
2119       break;
2120     case 4:
2121       assert(vtmp2 == xnoreg, "");
2122       unorderedReduce4D(opcode, dst, src, vtmp1);
2123       break;
2124     case 8:
2125       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2126       break;
2127     default: assert(false, "wrong vector length");
2128   }
2129 }
2130 
2131 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   if (opcode == Op_AddReductionVI) {
2133     if (vtmp1 != src2) {
2134       movdqu(vtmp1, src2);
2135     }
2136     phaddd(vtmp1, vtmp1);
2137   } else {
2138     pshufd(vtmp1, src2, 0x1);
2139     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2140   }
2141   movdl(vtmp2, src1);
2142   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2143   movdl(dst, vtmp1);
2144 }
2145 
2146 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2147   if (opcode == Op_AddReductionVI) {
2148     if (vtmp1 != src2) {
2149       movdqu(vtmp1, src2);
2150     }
2151     phaddd(vtmp1, src2);
2152     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2153   } else {
2154     pshufd(vtmp2, src2, 0xE);
2155     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2156     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2157   }
2158 }
2159 
2160 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2161   if (opcode == Op_AddReductionVI) {
2162     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2163     vextracti128_high(vtmp2, vtmp1);
2164     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2165     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2166   } else {
2167     vextracti128_high(vtmp1, src2);
2168     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2169     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2170   }
2171 }
2172 
2173 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2174   vextracti64x4_high(vtmp2, src2);
2175   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2176   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2177 }
2178 
2179 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2180   pshufd(vtmp2, src2, 0x1);
2181   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2182   movdqu(vtmp1, vtmp2);
2183   psrldq(vtmp1, 2);
2184   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2185   movdqu(vtmp2, vtmp1);
2186   psrldq(vtmp2, 1);
2187   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2188   movdl(vtmp2, src1);
2189   pmovsxbd(vtmp1, vtmp1);
2190   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2191   pextrb(dst, vtmp1, 0x0);
2192   movsbl(dst, dst);
2193 }
2194 
2195 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2196   pshufd(vtmp1, src2, 0xE);
2197   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2198   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2199 }
2200 
2201 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2202   vextracti128_high(vtmp2, src2);
2203   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2204   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2205 }
2206 
2207 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2208   vextracti64x4_high(vtmp1, src2);
2209   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2210   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2211 }
2212 
2213 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   pmovsxbw(vtmp2, src2);
2215   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2216 }
2217 
2218 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2219   if (UseAVX > 1) {
2220     int vector_len = Assembler::AVX_256bit;
2221     vpmovsxbw(vtmp1, src2, vector_len);
2222     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2223   } else {
2224     pmovsxbw(vtmp2, src2);
2225     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2226     pshufd(vtmp2, src2, 0xE);
2227     pmovsxbw(vtmp2, vtmp2);
2228     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2229   }
2230 }
2231 
2232 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2233   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2234     int vector_len = Assembler::AVX_512bit;
2235     vpmovsxbw(vtmp1, src2, vector_len);
2236     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2237   } else {
2238     assert(UseAVX >= 2,"Should not reach here.");
2239     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2240     vextracti128_high(vtmp2, src2);
2241     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2242   }
2243 }
2244 
2245 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2246   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2247   vextracti64x4_high(vtmp2, src2);
2248   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2249 }
2250 
2251 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   if (opcode == Op_AddReductionVI) {
2253     if (vtmp1 != src2) {
2254       movdqu(vtmp1, src2);
2255     }
2256     phaddw(vtmp1, vtmp1);
2257     phaddw(vtmp1, vtmp1);
2258   } else {
2259     pshufd(vtmp2, src2, 0x1);
2260     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2261     movdqu(vtmp1, vtmp2);
2262     psrldq(vtmp1, 2);
2263     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2264   }
2265   movdl(vtmp2, src1);
2266   pmovsxwd(vtmp1, vtmp1);
2267   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2268   pextrw(dst, vtmp1, 0x0);
2269   movswl(dst, dst);
2270 }
2271 
2272 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2273   if (opcode == Op_AddReductionVI) {
2274     if (vtmp1 != src2) {
2275       movdqu(vtmp1, src2);
2276     }
2277     phaddw(vtmp1, src2);
2278   } else {
2279     pshufd(vtmp1, src2, 0xE);
2280     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2281   }
2282   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2283 }
2284 
2285 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2286   if (opcode == Op_AddReductionVI) {
2287     int vector_len = Assembler::AVX_256bit;
2288     vphaddw(vtmp2, src2, src2, vector_len);
2289     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2290   } else {
2291     vextracti128_high(vtmp2, src2);
2292     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2293   }
2294   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2295 }
2296 
2297 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2299   vextracti64x4_high(vtmp1, src2);
2300   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2301   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2302 }
2303 
2304 #ifdef _LP64
2305 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2306   pshufd(vtmp2, src2, 0xE);
2307   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2308   movdq(vtmp1, src1);
2309   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2310   movdq(dst, vtmp1);
2311 }
2312 
2313 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2314   vextracti128_high(vtmp1, src2);
2315   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2316   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2317 }
2318 
2319 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   vextracti64x4_high(vtmp2, src2);
2321   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2322   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2323 }
2324 
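// Build a k-register mask with the low 'len' bits set.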
2325 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2326   mov64(temp, -1L);
2327   bzhiq(temp, temp, len);
2328   kmovql(dst, temp);
2329 }
2330 #endif // _LP64
2331 
2332 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2333   reduce_operation_128(T_FLOAT, opcode, dst, src);
2334   pshufd(vtmp, src, 0x1);
2335   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2336 }
2337 
2338 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2339   reduce2F(opcode, dst, src, vtmp);
2340   pshufd(vtmp, src, 0x2);
2341   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2342   pshufd(vtmp, src, 0x3);
2343   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2344 }
2345 
2346 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2347   reduce4F(opcode, dst, src, vtmp2);
2348   vextractf128_high(vtmp2, src);
2349   reduce4F(opcode, dst, vtmp2, vtmp1);
2350 }
2351 
2352 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2353   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2354   vextracti64x4_high(vtmp1, src);
2355   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2356 }
2357 
2358 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2359   pshufd(dst, src, 0x1);
2360   reduce_operation_128(T_FLOAT, opcode, dst, src);
2361 }
2362 
2363 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2364   pshufd(vtmp, src, 0xE);
2365   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2366   unorderedReduce2F(opcode, dst, vtmp);
2367 }
2368 
2369 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2370   vextractf128_high(vtmp1, src);
2371   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2372   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2373 }
2374 
2375 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2376   vextractf64x4_high(vtmp2, src);
2377   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2378   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2379 }
2380 
2381 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2382   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2383   pshufd(vtmp, src, 0xE);
2384   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2385 }
2386 
2387 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2388   reduce2D(opcode, dst, src, vtmp2);
2389   vextractf128_high(vtmp2, src);
2390   reduce2D(opcode, dst, vtmp2, vtmp1);
2391 }
2392 
2393 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2394   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2395   vextracti64x4_high(vtmp1, src);
2396   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2397 }
2398 
2399 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2400   pshufd(dst, src, 0xE);
2401   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2402 }
2403 
2404 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2405   vextractf128_high(vtmp, src);
2406   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2407   unorderedReduce2D(opcode, dst, vtmp);
2408 }
2409 
2410 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2411   vextractf64x4_high(vtmp2, src);
2412   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2413   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2414 }
2415 
2416 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2417   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2418 }
2419 
2420 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2421   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2422 }
2423 
2424 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2425                                  int vec_enc) {
2426   switch(elem_bt) {
2427     case T_INT:
2428     case T_FLOAT:
2429       vmaskmovps(dst, src, mask, vec_enc);
2430       break;
2431     case T_LONG:
2432     case T_DOUBLE:
2433       vmaskmovpd(dst, src, mask, vec_enc);
2434       break;
2435     default:
2436       fatal("Unsupported type %s", type2name(elem_bt));
2437       break;
2438   }
2439 }
2440 
2441 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2442                                  int vec_enc) {
2443   switch(elem_bt) {
2444     case T_INT:
2445     case T_FLOAT:
2446       vmaskmovps(dst, src, mask, vec_enc);
2447       break;
2448     case T_LONG:
2449     case T_DOUBLE:
2450       vmaskmovpd(dst, src, mask, vec_enc);
2451       break;
2452     default:
2453       fatal("Unsupported type %s", type2name(elem_bt));
2454       break;
2455   }
2456 }
2457 
2458 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2459                                           XMMRegister dst, XMMRegister src,
2460                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2461                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2462   const int permconst[] = {1, 14};
2463   XMMRegister wsrc = src;
2464   XMMRegister wdst = xmm_0;
2465   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2466 
2467   int vlen_enc = Assembler::AVX_128bit;
2468   if (vlen == 16) {
2469     vlen_enc = Assembler::AVX_256bit;
2470   }
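  // Successively fold the upper half of the live elements onto the lower half:
  // cross-lane extracts handle the 512-bit and 256-bit steps, in-lane vpermilps
  // shuffles (permconst) handle the final two steps.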
2471 
2472   for (int i = log2(vlen) - 1; i >=0; i--) {
2473     if (i == 0 && !is_dst_valid) {
2474       wdst = dst;
2475     }
2476     if (i == 3) {
2477       vextracti64x4_high(wtmp, wsrc);
2478     } else if (i == 2) {
2479       vextracti128_high(wtmp, wsrc);
2480     } else { // i = [0,1]
2481       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2482     }
2483     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2484     wsrc = wdst;
2485     vlen_enc = Assembler::AVX_128bit;
2486   }
2487   if (is_dst_valid) {
2488     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2489   }
2490 }
2491 
2492 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2493                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2494                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2495   XMMRegister wsrc = src;
2496   XMMRegister wdst = xmm_0;
2497   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2498   int vlen_enc = Assembler::AVX_128bit;
2499   if (vlen == 8) {
2500     vlen_enc = Assembler::AVX_256bit;
2501   }
2502   for (int i = log2(vlen) - 1; i >=0; i--) {
2503     if (i == 0 && !is_dst_valid) {
2504       wdst = dst;
2505     }
2506     if (i == 1) {
2507       vextracti128_high(wtmp, wsrc);
2508     } else if (i == 2) {
2509       vextracti64x4_high(wtmp, wsrc);
2510     } else {
2511       assert(i == 0, "%d", i);
2512       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2513     }
2514     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2515     wsrc = wdst;
2516     vlen_enc = Assembler::AVX_128bit;
2517   }
2518   if (is_dst_valid) {
2519     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2520   }
2521 }
2522 
2523 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2524   switch (bt) {
2525     case T_BYTE:  pextrb(dst, src, idx); break;
2526     case T_SHORT: pextrw(dst, src, idx); break;
2527     case T_INT:   pextrd(dst, src, idx); break;
2528     case T_LONG:  pextrq(dst, src, idx); break;
2529 
2530     default:
2531       assert(false,"Should not reach here.");
2532       break;
2533   }
2534 }
2535 
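// Return the XMM register holding the 128-bit lane that contains 'elemindex':
// src itself for lane 0, otherwise the lane extracted into dst.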
2536 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2537   int esize =  type2aelembytes(typ);
2538   int elem_per_lane = 16/esize;
2539   int lane = elemindex / elem_per_lane;
2540   int eindex = elemindex % elem_per_lane;
2541 
2542   if (lane >= 2) {
2543     assert(UseAVX > 2, "required");
2544     vextractf32x4(dst, src, lane & 3);
2545     return dst;
2546   } else if (lane > 0) {
2547     assert(UseAVX > 0, "required");
2548     vextractf128(dst, src, lane);
2549     return dst;
2550   } else {
2551     return src;
2552   }
2553 }
2554 
2555 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2556   if (typ == T_BYTE) {
2557     movsbl(dst, dst);
2558   } else if (typ == T_SHORT) {
2559     movswl(dst, dst);
2560   }
2561 }
2562 
2563 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2564   int esize =  type2aelembytes(typ);
2565   int elem_per_lane = 16/esize;
2566   int eindex = elemindex % elem_per_lane;
2567   assert(is_integral_type(typ),"required");
2568 
2569   if (eindex == 0) {
2570     if (typ == T_LONG) {
2571       movq(dst, src);
2572     } else {
2573       movdl(dst, src);
2574       movsxl(typ, dst);
2575     }
2576   } else {
2577     extract(typ, dst, src, eindex);
2578     movsxl(typ, dst);
2579   }
2580 }
2581 
2582 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2583   int esize =  type2aelembytes(typ);
2584   int elem_per_lane = 16/esize;
2585   int eindex = elemindex % elem_per_lane;
2586   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2587 
2588   if (eindex == 0) {
2589     movq(dst, src);
2590   } else {
2591     if (typ == T_FLOAT) {
2592       if (UseAVX == 0) {
2593         movdqu(dst, src);
2594         shufps(dst, dst, eindex);
2595       } else {
2596         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2597       }
2598     } else {
2599       if (UseAVX == 0) {
2600         movdqu(dst, src);
2601         psrldq(dst, eindex*esize);
2602       } else {
2603         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2604       }
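      // movq clears the upper 64 bits of dst.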
2605       movq(dst, dst);
2606     }
2607   }
2608   // Zero upper bits
2609   if (typ == T_FLOAT) {
2610     if (UseAVX == 0) {
2611       assert(vtmp != xnoreg, "required.");
2612       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2613       pand(dst, vtmp);
2614     } else {
2615       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2616     }
2617   }
2618 }
2619 
2620 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2621   switch(typ) {
2622     case T_BYTE:
2623     case T_BOOLEAN:
2624       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2625       break;
2626     case T_SHORT:
2627     case T_CHAR:
2628       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2629       break;
2630     case T_INT:
2631     case T_FLOAT:
2632       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2633       break;
2634     case T_LONG:
2635     case T_DOUBLE:
2636       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2637       break;
2638     default:
2639       assert(false,"Should not reach here.");
2640       break;
2641   }
2642 }
2643 
2644 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2645   assert(rscratch != noreg || always_reachable(src2), "missing");
2646 
2647   switch(typ) {
2648     case T_BOOLEAN:
2649     case T_BYTE:
2650       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2651       break;
2652     case T_CHAR:
2653     case T_SHORT:
2654       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2655       break;
2656     case T_INT:
2657     case T_FLOAT:
2658       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2659       break;
2660     case T_LONG:
2661     case T_DOUBLE:
2662       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2663       break;
2664     default:
2665       assert(false,"Should not reach here.");
2666       break;
2667   }
2668 }
2669 
2670 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2671   switch(typ) {
2672     case T_BYTE:
2673       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2674       break;
2675     case T_SHORT:
2676       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2677       break;
2678     case T_INT:
2679     case T_FLOAT:
2680       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2681       break;
2682     case T_LONG:
2683     case T_DOUBLE:
2684       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2685       break;
2686     default:
2687       assert(false,"Should not reach here.");
2688       break;
2689   }
2690 }
2691 
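// Emit a vector test of src1 against src2, leaving the outcome in the condition
// flags (ptest/vptest/vtestps). For vectors shorter than 16 bytes the low part
// of src1 is duplicated into vtmp so stale upper bytes cannot affect the result.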
2692 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2693   assert(vlen_in_bytes <= 32, "");
2694   int esize = type2aelembytes(bt);
2695   if (vlen_in_bytes == 32) {
2696     assert(vtmp == xnoreg, "required.");
2697     if (esize >= 4) {
2698       vtestps(src1, src2, AVX_256bit);
2699     } else {
2700       vptest(src1, src2, AVX_256bit);
2701     }
2702     return;
2703   }
2704   if (vlen_in_bytes < 16) {
2705     // Duplicate the lower part to fill the whole register;
2706     // there is no need to do so for src2.
2707     assert(vtmp != xnoreg, "required");
2708     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2709     pshufd(vtmp, src1, shuffle_imm);
2710   } else {
2711     assert(vtmp == xnoreg, "required");
2712     vtmp = src1;
2713   }
2714   if (esize >= 4 && VM_Version::supports_avx()) {
2715     vtestps(vtmp, src2, AVX_128bit);
2716   } else {
2717     ptest(vtmp, src2);
2718   }
2719 }
2720 
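     // Element-type dispatch for vector addition. The ASSERT block documents the
     // constraint that byte/short additions without AVX512BW must not use 512-bit
     // vectors and must stay within xmm0-xmm15 (no EVEX-only registers).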
2721 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2722   assert(UseAVX >= 2, "required");
2723 #ifdef ASSERT
2724   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2725   bool is_bw_supported = VM_Version::supports_avx512bw();
2726   if (is_bw && !is_bw_supported) {
2727     assert(vlen_enc != Assembler::AVX_512bit, "required");
2728     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2729            "XMM register should be 0-15");
2730   }
2731 #endif // ASSERT
2732   switch (elem_bt) {
2733     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2734     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2735     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2736     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2737     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2738     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2739     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2740   }
2741 }
2742 
2743 #ifdef _LP64
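     // Broadcast a scalar held in a general-purpose register to all vector lanes.
     // With AVX-512 (plus BW/VL where the element type and vector length require it)
     // the GPR-source evpbroadcast forms are used directly; otherwise the value is
     // moved into the low lanes with movdl/movdq and then broadcast lane-wise.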
2744 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2745   assert(UseAVX >= 2, "required");
2746   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2747   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2748   if ((UseAVX > 2) &&
2749       (!is_bw || VM_Version::supports_avx512bw()) &&
2750       (!is_vl || VM_Version::supports_avx512vl())) {
2751     switch (elem_bt) {
2752       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2753       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2754       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2755       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2756       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2757     }
2758   } else {
2759     assert(vlen_enc != Assembler::AVX_512bit, "required");
2760     assert((dst->encoding() < 16),"XMM register should be 0-15");
2761     switch (elem_bt) {
2762       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2763       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2764       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2765       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2766       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2767       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2768       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2769     }
2770   }
2771 }
2772 #endif
2773 
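     // Sign-extend byte lanes to the requested element type; T_FLOAT and T_DOUBLE
     // first widen to ints (the double case at half the vector length) and then
     // convert with vcvtdq2ps/vcvtdq2pd.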
2774 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2775   switch (to_elem_bt) {
2776     case T_SHORT:
2777       vpmovsxbw(dst, src, vlen_enc);
2778       break;
2779     case T_INT:
2780       vpmovsxbd(dst, src, vlen_enc);
2781       break;
2782     case T_FLOAT:
2783       vpmovsxbd(dst, src, vlen_enc);
2784       vcvtdq2ps(dst, dst, vlen_enc);
2785       break;
2786     case T_LONG:
2787       vpmovsxbq(dst, src, vlen_enc);
2788       break;
2789     case T_DOUBLE: {
2790       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2791       vpmovsxbd(dst, src, mid_vlen_enc);
2792       vcvtdq2pd(dst, dst, vlen_enc);
2793       break;
2794     }
2795     default:
2796       fatal("Unsupported type %s", type2name(to_elem_bt));
2797       break;
2798   }
2799 }
2800 
2801 //-------------------------------------------------------------------------------------------
2802 
2803 // IndexOf for constant substrings with size >= 8 chars
2804 // which don't need to be loaded through the stack.
2805 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2806                                          Register cnt1, Register cnt2,
2807                                          int int_cnt2,  Register result,
2808                                          XMMRegister vec, Register tmp,
2809                                          int ae) {
2810   ShortBranchVerifier sbv(this);
2811   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2812   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2813 
2814   // This method uses the pcmpestri instruction with bound registers
2815   //   inputs:
2816   //     xmm - substring
2817   //     rax - substring length (elements count)
2818   //     mem - scanned string
2819   //     rdx - string length (elements count)
2820   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2821   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2822   //   outputs:
2823   //     rcx - matched index in string
2824   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2825   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2826   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2827   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2828   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2829 
2830   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2831         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2832         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2833 
2834   // Note, inline_string_indexOf() generates checks:
2835   // if (substr.count > string.count) return -1;
2836   // if (substr.count == 0) return 0;
2837   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2838 
2839   // Load substring.
2840   if (ae == StrIntrinsicNode::UL) {
2841     pmovzxbw(vec, Address(str2, 0));
2842   } else {
2843     movdqu(vec, Address(str2, 0));
2844   }
2845   movl(cnt2, int_cnt2);
2846   movptr(result, str1); // string addr
2847 
2848   if (int_cnt2 > stride) {
2849     jmpb(SCAN_TO_SUBSTR);
2850 
2851     // Reload substr for rescan; this code
2852     // is executed only for large substrings (> 8 chars).
2853     bind(RELOAD_SUBSTR);
2854     if (ae == StrIntrinsicNode::UL) {
2855       pmovzxbw(vec, Address(str2, 0));
2856     } else {
2857       movdqu(vec, Address(str2, 0));
2858     }
2859     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2860 
2861     bind(RELOAD_STR);
2862     // We came here after the beginning of the substring was
2863     // matched but the rest of it was not, so we need to search
2864     // again. Start from the next element after the previous match.
2865 
2866     // cnt2 is the number of remaining substring elements and
2867     // cnt1 is the number of remaining string elements when the compare failed.
2868     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2869     subl(cnt1, cnt2);
2870     addl(cnt1, int_cnt2);
2871     movl(cnt2, int_cnt2); // Now restore cnt2
2872 
2873     decrementl(cnt1);     // Shift to next element
2874     cmpl(cnt1, cnt2);
2875     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2876 
2877     addptr(result, (1<<scale1));
2878 
2879   } // (int_cnt2 > 8)
2880 
2881   // Scan string for start of substr in 16-byte vectors
2882   bind(SCAN_TO_SUBSTR);
2883   pcmpestri(vec, Address(result, 0), mode);
2884   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2885   subl(cnt1, stride);
2886   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2887   cmpl(cnt1, cnt2);
2888   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2889   addptr(result, 16);
2890   jmpb(SCAN_TO_SUBSTR);
2891 
2892   // Found a potential substr
2893   bind(FOUND_CANDIDATE);
2894   // Matched whole vector if first element matched (tmp(rcx) == 0).
2895   if (int_cnt2 == stride) {
2896     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2897   } else { // int_cnt2 > 8
2898     jccb(Assembler::overflow, FOUND_SUBSTR);
2899   }
2900   // After pcmpestri tmp(rcx) contains matched element index
2901   // Compute start addr of substr
2902   lea(result, Address(result, tmp, scale1));
2903 
2904   // Make sure string is still long enough
2905   subl(cnt1, tmp);
2906   cmpl(cnt1, cnt2);
2907   if (int_cnt2 == stride) {
2908     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2909   } else { // int_cnt2 > 8
2910     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2911   }
2912   // Left less than substring.
2913 
2914   bind(RET_NOT_FOUND);
2915   movl(result, -1);
2916   jmp(EXIT);
2917 
2918   if (int_cnt2 > stride) {
2919     // This code is optimized for the case when the whole substring
2920     // is matched once its head is matched.
2921     bind(MATCH_SUBSTR_HEAD);
2922     pcmpestri(vec, Address(result, 0), mode);
2923     // Reload only the string if it does not match
2924     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2925 
2926     Label CONT_SCAN_SUBSTR;
2927     // Compare the rest of substring (> 8 chars).
2928     bind(FOUND_SUBSTR);
2929     // First 8 chars are already matched.
2930     negptr(cnt2);
2931     addptr(cnt2, stride);
2932 
2933     bind(SCAN_SUBSTR);
2934     subl(cnt1, stride);
2935     cmpl(cnt2, -stride); // Do not read beyond substring
2936     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2937     // Back-up strings to avoid reading beyond substring:
2938     // cnt1 = cnt1 - cnt2 + 8
2939     addl(cnt1, cnt2); // cnt2 is negative
2940     addl(cnt1, stride);
2941     movl(cnt2, stride); negptr(cnt2);
2942     bind(CONT_SCAN_SUBSTR);
2943     if (int_cnt2 < (int)G) {
2944       int tail_off1 = int_cnt2<<scale1;
2945       int tail_off2 = int_cnt2<<scale2;
2946       if (ae == StrIntrinsicNode::UL) {
2947         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2948       } else {
2949         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2950       }
2951       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2952     } else {
2953       // calculate index in register to avoid integer overflow (int_cnt2*2)
2954       movl(tmp, int_cnt2);
2955       addptr(tmp, cnt2);
2956       if (ae == StrIntrinsicNode::UL) {
2957         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2958       } else {
2959         movdqu(vec, Address(str2, tmp, scale2, 0));
2960       }
2961       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2962     }
2963     // Need to reload string pointers if we did not match the whole vector
2964     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2965     addptr(cnt2, stride);
2966     jcc(Assembler::negative, SCAN_SUBSTR);
2967     // Fall through if found full substring
2968 
2969   } // (int_cnt2 > 8)
2970 
2971   bind(RET_FOUND);
2972   // Found result if we matched full small substring.
2973   // Compute substr offset
2974   subptr(result, str1);
2975   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2976     shrl(result, 1); // index
2977   }
2978   bind(EXIT);
2979 
2980 } // string_indexofC8
2981 
2982 // Small strings are loaded through the stack if they cross a page boundary.
2983 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2984                                        Register cnt1, Register cnt2,
2985                                        int int_cnt2,  Register result,
2986                                        XMMRegister vec, Register tmp,
2987                                        int ae) {
2988   ShortBranchVerifier sbv(this);
2989   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2990   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2991 
2992   //
2993   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2994   // or (-1) for a non-constant substring, in which case its length
2995   // is in the cnt2 register.
2996   //
2997   // Note, inline_string_indexOf() generates checks:
2998   // if (substr.count > string.count) return -1;
2999   // if (substr.count == 0) return 0;
3000   //
3001   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3002   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3003   // This method uses the pcmpestri instruction with bound registers
3004   //   inputs:
3005   //     xmm - substring
3006   //     rax - substring length (elements count)
3007   //     mem - scanned string
3008   //     rdx - string length (elements count)
3009   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3010   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3011   //   outputs:
3012   //     rcx - matched index in string
3013   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3014   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3015   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3016   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3017 
3018   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3019         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3020         FOUND_CANDIDATE;
3021 
3022   { //========================================================
3023     // We don't know where these strings are located
3024     // and we can't read beyond them. Load them through the stack.
3025     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3026 
3027     movptr(tmp, rsp); // save old SP
3028 
3029     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3030       if (int_cnt2 == (1>>scale2)) { // One byte
3031         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3032         load_unsigned_byte(result, Address(str2, 0));
3033         movdl(vec, result); // move 32 bits
3034       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3035         // Not enough header space in 32-bit VM: 12+3 = 15.
3036         movl(result, Address(str2, -1));
3037         shrl(result, 8);
3038         movdl(vec, result); // move 32 bits
3039       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3040         load_unsigned_short(result, Address(str2, 0));
3041         movdl(vec, result); // move 32 bits
3042       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3043         movdl(vec, Address(str2, 0)); // move 32 bits
3044       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3045         movq(vec, Address(str2, 0));  // move 64 bits
3046       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3047         // Array header size is 12 bytes in 32-bit VM
3048         // + 6 bytes for 3 chars == 18 bytes,
3049         // enough space to load vec and shift.
3050         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3051         if (ae == StrIntrinsicNode::UL) {
3052           int tail_off = int_cnt2-8;
3053           pmovzxbw(vec, Address(str2, tail_off));
3054           psrldq(vec, -2*tail_off);
3055         }
3056         else {
3057           int tail_off = int_cnt2*(1<<scale2);
3058           movdqu(vec, Address(str2, tail_off-16));
3059           psrldq(vec, 16-tail_off);
3060         }
3061       }
3062     } else { // not constant substring
3063       cmpl(cnt2, stride);
3064       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3065 
3066       // We can read beyond the string if str+16 does not cross a page boundary
3067       // since heaps are aligned and mapped by pages.
3068       assert(os::vm_page_size() < (int)G, "default page should be small");
3069       movl(result, str2); // We need only low 32 bits
3070       andl(result, ((int)os::vm_page_size()-1));
3071       cmpl(result, ((int)os::vm_page_size()-16));
3072       jccb(Assembler::belowEqual, CHECK_STR);
3073 
3074       // Move small strings to the stack to allow loading 16 bytes into vec.
3075       subptr(rsp, 16);
3076       int stk_offset = wordSize-(1<<scale2);
3077       push(cnt2);
3078 
3079       bind(COPY_SUBSTR);
3080       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3081         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3082         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3083       } else if (ae == StrIntrinsicNode::UU) {
3084         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3085         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3086       }
3087       decrement(cnt2);
3088       jccb(Assembler::notZero, COPY_SUBSTR);
3089 
3090       pop(cnt2);
3091       movptr(str2, rsp);  // New substring address
3092     } // non constant
3093 
3094     bind(CHECK_STR);
3095     cmpl(cnt1, stride);
3096     jccb(Assembler::aboveEqual, BIG_STRINGS);
3097 
3098     // Check cross page boundary.
3099     movl(result, str1); // We need only low 32 bits
3100     andl(result, ((int)os::vm_page_size()-1));
3101     cmpl(result, ((int)os::vm_page_size()-16));
3102     jccb(Assembler::belowEqual, BIG_STRINGS);
3103 
3104     subptr(rsp, 16);
3105     int stk_offset = -(1<<scale1);
3106     if (int_cnt2 < 0) { // not constant
3107       push(cnt2);
3108       stk_offset += wordSize;
3109     }
3110     movl(cnt2, cnt1);
3111 
3112     bind(COPY_STR);
3113     if (ae == StrIntrinsicNode::LL) {
3114       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3115       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3116     } else {
3117       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3118       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3119     }
3120     decrement(cnt2);
3121     jccb(Assembler::notZero, COPY_STR);
3122 
3123     if (int_cnt2 < 0) { // not constant
3124       pop(cnt2);
3125     }
3126     movptr(str1, rsp);  // New string address
3127 
3128     bind(BIG_STRINGS);
3129     // Load substring.
3130     if (int_cnt2 < 0) { // -1
3131       if (ae == StrIntrinsicNode::UL) {
3132         pmovzxbw(vec, Address(str2, 0));
3133       } else {
3134         movdqu(vec, Address(str2, 0));
3135       }
3136       push(cnt2);       // substr count
3137       push(str2);       // substr addr
3138       push(str1);       // string addr
3139     } else {
3140       // Small (< 8 chars) constant substrings are loaded already.
3141       movl(cnt2, int_cnt2);
3142     }
3143     push(tmp);  // original SP
3144 
3145   } // Finished loading
3146 
3147   //========================================================
3148   // Start search
3149   //
3150 
3151   movptr(result, str1); // string addr
3152 
3153   if (int_cnt2  < 0) {  // Only for non constant substring
3154     jmpb(SCAN_TO_SUBSTR);
3155 
3156     // SP saved at sp+0
3157     // String saved at sp+1*wordSize
3158     // Substr saved at sp+2*wordSize
3159     // Substr count saved at sp+3*wordSize
3160 
3161     // Reload substr for rescan; this code
3162     // is executed only for large substrings (> 8 chars).
3163     bind(RELOAD_SUBSTR);
3164     movptr(str2, Address(rsp, 2*wordSize));
3165     movl(cnt2, Address(rsp, 3*wordSize));
3166     if (ae == StrIntrinsicNode::UL) {
3167       pmovzxbw(vec, Address(str2, 0));
3168     } else {
3169       movdqu(vec, Address(str2, 0));
3170     }
3171     // We came here after the beginning of the substring was
3172     // matched but the rest of it was not, so we need to search
3173     // again. Start from the next element after the previous match.
3174     subptr(str1, result); // Restore counter
3175     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3176       shrl(str1, 1);
3177     }
3178     addl(cnt1, str1);
3179     decrementl(cnt1);   // Shift to next element
3180     cmpl(cnt1, cnt2);
3181     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3182 
3183     addptr(result, (1<<scale1));
3184   } // non constant
3185 
3186   // Scan string for start of substr in 16-byte vectors
3187   bind(SCAN_TO_SUBSTR);
3188   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3189   pcmpestri(vec, Address(result, 0), mode);
3190   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3191   subl(cnt1, stride);
3192   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3193   cmpl(cnt1, cnt2);
3194   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3195   addptr(result, 16);
3196 
3197   bind(ADJUST_STR);
3198   cmpl(cnt1, stride); // Do not read beyond string
3199   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3200   // Back-up string to avoid reading beyond string.
3201   lea(result, Address(result, cnt1, scale1, -16));
3202   movl(cnt1, stride);
3203   jmpb(SCAN_TO_SUBSTR);
3204 
3205   // Found a potential substr
3206   bind(FOUND_CANDIDATE);
3207   // After pcmpestri tmp(rcx) contains matched element index
3208 
3209   // Make sure string is still long enough
3210   subl(cnt1, tmp);
3211   cmpl(cnt1, cnt2);
3212   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3213   // Left less than substring.
3214 
3215   bind(RET_NOT_FOUND);
3216   movl(result, -1);
3217   jmp(CLEANUP);
3218 
3219   bind(FOUND_SUBSTR);
3220   // Compute start addr of substr
3221   lea(result, Address(result, tmp, scale1));
3222   if (int_cnt2 > 0) { // Constant substring
3223     // Repeat search for small substring (< 8 chars)
3224     // from new point without reloading substring.
3225     // Have to check that we don't read beyond string.
3226     cmpl(tmp, stride-int_cnt2);
3227     jccb(Assembler::greater, ADJUST_STR);
3228     // Fall through if matched whole substring.
3229   } else { // non constant
3230     assert(int_cnt2 == -1, "should be != 0");
3231 
3232     addl(tmp, cnt2);
3233     // Found result if we matched whole substring.
3234     cmpl(tmp, stride);
3235     jcc(Assembler::lessEqual, RET_FOUND);
3236 
3237     // Repeat search for small substring (<= 8 chars)
3238     // from new point 'str1' without reloading substring.
3239     cmpl(cnt2, stride);
3240     // Have to check that we don't read beyond string.
3241     jccb(Assembler::lessEqual, ADJUST_STR);
3242 
3243     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3244     // Compare the rest of substring (> 8 chars).
3245     movptr(str1, result);
3246 
3247     cmpl(tmp, cnt2);
3248     // First 8 chars are already matched.
3249     jccb(Assembler::equal, CHECK_NEXT);
3250 
3251     bind(SCAN_SUBSTR);
3252     pcmpestri(vec, Address(str1, 0), mode);
3253     // Need to reload string pointers if we did not match the whole vector
3254     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3255 
3256     bind(CHECK_NEXT);
3257     subl(cnt2, stride);
3258     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3259     addptr(str1, 16);
3260     if (ae == StrIntrinsicNode::UL) {
3261       addptr(str2, 8);
3262     } else {
3263       addptr(str2, 16);
3264     }
3265     subl(cnt1, stride);
3266     cmpl(cnt2, stride); // Do not read beyond substring
3267     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3268     // Back-up strings to avoid reading beyond substring.
3269 
3270     if (ae == StrIntrinsicNode::UL) {
3271       lea(str2, Address(str2, cnt2, scale2, -8));
3272       lea(str1, Address(str1, cnt2, scale1, -16));
3273     } else {
3274       lea(str2, Address(str2, cnt2, scale2, -16));
3275       lea(str1, Address(str1, cnt2, scale1, -16));
3276     }
3277     subl(cnt1, cnt2);
3278     movl(cnt2, stride);
3279     addl(cnt1, stride);
3280     bind(CONT_SCAN_SUBSTR);
3281     if (ae == StrIntrinsicNode::UL) {
3282       pmovzxbw(vec, Address(str2, 0));
3283     } else {
3284       movdqu(vec, Address(str2, 0));
3285     }
3286     jmp(SCAN_SUBSTR);
3287 
3288     bind(RET_FOUND_LONG);
3289     movptr(str1, Address(rsp, wordSize));
3290   } // non constant
3291 
3292   bind(RET_FOUND);
3293   // Compute substr offset
3294   subptr(result, str1);
3295   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3296     shrl(result, 1); // index
3297   }
3298   bind(CLEANUP);
3299   pop(rsp); // restore SP
3300 
3301 } // string_indexof
3302 
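     // Scan a UTF-16 string for a single char: broadcast the char and compare
     // 16 chars per AVX2 iteration, then 8 chars per SSE iteration, then finish
     // with a scalar tail loop. 'result' receives the char index or -1.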
3303 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3304                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3305   ShortBranchVerifier sbv(this);
3306   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3307 
3308   int stride = 8;
3309 
3310   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3311         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3312         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3313         FOUND_SEQ_CHAR, DONE_LABEL;
3314 
3315   movptr(result, str1);
3316   if (UseAVX >= 2) {
3317     cmpl(cnt1, stride);
3318     jcc(Assembler::less, SCAN_TO_CHAR);
3319     cmpl(cnt1, 2*stride);
3320     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3321     movdl(vec1, ch);
3322     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3323     vpxor(vec2, vec2);
3324     movl(tmp, cnt1);
3325     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3326     andl(cnt1,0x0000000F);  //tail count (in chars)
3327 
3328     bind(SCAN_TO_16_CHAR_LOOP);
3329     vmovdqu(vec3, Address(result, 0));
3330     vpcmpeqw(vec3, vec3, vec1, 1);
3331     vptest(vec2, vec3);
3332     jcc(Assembler::carryClear, FOUND_CHAR);
3333     addptr(result, 32);
3334     subl(tmp, 2*stride);
3335     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3336     jmp(SCAN_TO_8_CHAR);
3337     bind(SCAN_TO_8_CHAR_INIT);
3338     movdl(vec1, ch);
3339     pshuflw(vec1, vec1, 0x00);
3340     pshufd(vec1, vec1, 0);
3341     pxor(vec2, vec2);
3342   }
3343   bind(SCAN_TO_8_CHAR);
3344   cmpl(cnt1, stride);
3345   jcc(Assembler::less, SCAN_TO_CHAR);
3346   if (UseAVX < 2) {
3347     movdl(vec1, ch);
3348     pshuflw(vec1, vec1, 0x00);
3349     pshufd(vec1, vec1, 0);
3350     pxor(vec2, vec2);
3351   }
3352   movl(tmp, cnt1);
3353   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3354   andl(cnt1,0x00000007);  //tail count (in chars)
3355 
3356   bind(SCAN_TO_8_CHAR_LOOP);
3357   movdqu(vec3, Address(result, 0));
3358   pcmpeqw(vec3, vec1);
3359   ptest(vec2, vec3);
3360   jcc(Assembler::carryClear, FOUND_CHAR);
3361   addptr(result, 16);
3362   subl(tmp, stride);
3363   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3364   bind(SCAN_TO_CHAR);
3365   testl(cnt1, cnt1);
3366   jcc(Assembler::zero, RET_NOT_FOUND);
3367   bind(SCAN_TO_CHAR_LOOP);
3368   load_unsigned_short(tmp, Address(result, 0));
3369   cmpl(ch, tmp);
3370   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3371   addptr(result, 2);
3372   subl(cnt1, 1);
3373   jccb(Assembler::zero, RET_NOT_FOUND);
3374   jmp(SCAN_TO_CHAR_LOOP);
3375 
3376   bind(RET_NOT_FOUND);
3377   movl(result, -1);
3378   jmpb(DONE_LABEL);
3379 
3380   bind(FOUND_CHAR);
3381   if (UseAVX >= 2) {
3382     vpmovmskb(tmp, vec3);
3383   } else {
3384     pmovmskb(tmp, vec3);
3385   }
3386   bsfl(ch, tmp);
3387   addptr(result, ch);
3388 
3389   bind(FOUND_SEQ_CHAR);
3390   subptr(result, str1);
3391   shrl(result, 1);
3392 
3393   bind(DONE_LABEL);
3394 } // string_indexof_char
3395 
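     // Latin-1 variant of the single-char search: broadcast the byte and compare
     // 32 bytes per AVX2 iteration, then 16 bytes per SSE iteration, then finish
     // with a scalar tail loop. 'result' receives the byte index or -1.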
3396 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3397                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3398   ShortBranchVerifier sbv(this);
3399   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3400 
3401   int stride = 16;
3402 
3403   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3404         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3405         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3406         FOUND_SEQ_CHAR, DONE_LABEL;
3407 
3408   movptr(result, str1);
3409   if (UseAVX >= 2) {
3410     cmpl(cnt1, stride);
3411     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3412     cmpl(cnt1, stride*2);
3413     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3414     movdl(vec1, ch);
3415     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3416     vpxor(vec2, vec2);
3417     movl(tmp, cnt1);
3418     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3419     andl(cnt1,0x0000001F);  //tail count (in chars)
3420 
3421     bind(SCAN_TO_32_CHAR_LOOP);
3422     vmovdqu(vec3, Address(result, 0));
3423     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3424     vptest(vec2, vec3);
3425     jcc(Assembler::carryClear, FOUND_CHAR);
3426     addptr(result, 32);
3427     subl(tmp, stride*2);
3428     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3429     jmp(SCAN_TO_16_CHAR);
3430 
3431     bind(SCAN_TO_16_CHAR_INIT);
3432     movdl(vec1, ch);
3433     pxor(vec2, vec2);
3434     pshufb(vec1, vec2);
3435   }
3436 
3437   bind(SCAN_TO_16_CHAR);
3438   cmpl(cnt1, stride);
3439   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3440   if (UseAVX < 2) {
3441     movdl(vec1, ch);
3442     pxor(vec2, vec2);
3443     pshufb(vec1, vec2);
3444   }
3445   movl(tmp, cnt1);
3446   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3447   andl(cnt1,0x0000000F);  //tail count (in bytes)
3448 
3449   bind(SCAN_TO_16_CHAR_LOOP);
3450   movdqu(vec3, Address(result, 0));
3451   pcmpeqb(vec3, vec1);
3452   ptest(vec2, vec3);
3453   jcc(Assembler::carryClear, FOUND_CHAR);
3454   addptr(result, 16);
3455   subl(tmp, stride);
3456   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3457 
3458   bind(SCAN_TO_CHAR_INIT);
3459   testl(cnt1, cnt1);
3460   jcc(Assembler::zero, RET_NOT_FOUND);
3461   bind(SCAN_TO_CHAR_LOOP);
3462   load_unsigned_byte(tmp, Address(result, 0));
3463   cmpl(ch, tmp);
3464   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3465   addptr(result, 1);
3466   subl(cnt1, 1);
3467   jccb(Assembler::zero, RET_NOT_FOUND);
3468   jmp(SCAN_TO_CHAR_LOOP);
3469 
3470   bind(RET_NOT_FOUND);
3471   movl(result, -1);
3472   jmpb(DONE_LABEL);
3473 
3474   bind(FOUND_CHAR);
3475   if (UseAVX >= 2) {
3476     vpmovmskb(tmp, vec3);
3477   } else {
3478     pmovmskb(tmp, vec3);
3479   }
3480   bsfl(ch, tmp);
3481   addptr(result, ch);
3482 
3483   bind(FOUND_SEQ_CHAR);
3484   subptr(result, str1);
3485 
3486   bind(DONE_LABEL);
3487 } // stringL_indexof_char
3488 
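     // Size in bytes of one array element for the element types supported by the
     // array hash-code intrinsic.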
3489 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3490   switch (eltype) {
3491   case T_BOOLEAN: return sizeof(jboolean);
3492   case T_BYTE:  return sizeof(jbyte);
3493   case T_SHORT: return sizeof(jshort);
3494   case T_CHAR:  return sizeof(jchar);
3495   case T_INT:   return sizeof(jint);
3496   default:
3497     ShouldNotReachHere();
3498     return -1;
3499   }
3500 }
3501 
3502 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3503   switch (eltype) {
3504   // T_BOOLEAN used as surrogate for unsigned byte
3505   case T_BOOLEAN: movzbl(dst, src);   break;
3506   case T_BYTE:    movsbl(dst, src);   break;
3507   case T_SHORT:   movswl(dst, src);   break;
3508   case T_CHAR:    movzwl(dst, src);   break;
3509   case T_INT:     movl(dst, src);     break;
3510   default:
3511     ShouldNotReachHere();
3512   }
3513 }
3514 
3515 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3516   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3517 }
3518 
3519 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3520   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3521 }
3522 
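     // Widen the freshly loaded lanes to 32-bit ints: zero-extending for the
     // unsigned types (boolean, char) and sign-extending for byte and short.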
3523 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3524   const int vlen = Assembler::AVX_256bit;
3525   switch (eltype) {
3526   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3527   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3528   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3529   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3530   case T_INT:
3531     // do nothing
3532     break;
3533   default:
3534     ShouldNotReachHere();
3535   }
3536 }
3537 
3538 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3539                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3540                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3541                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3542                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3543                                         BasicType eltype) {
3544   ShortBranchVerifier sbv(this);
3545   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3546   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3547   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3548 
3549   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3550         SHORT_UNROLLED_LOOP_EXIT,
3551         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3552         UNROLLED_VECTOR_LOOP_BEGIN,
3553         END;
3554   switch (eltype) {
3555   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3556   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3557   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3558   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3559   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3560   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3561   }
3562 
3563   // "Renaming" for readability of the code
3564   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3565                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3566                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3567 
3568   const int elsize = arrays_hashcode_elsize(eltype);
3569 
3570   /*
3571     if (cnt1 >= 2) {
3572       if (cnt1 >= 32) {
3573         UNROLLED VECTOR LOOP
3574       }
3575       UNROLLED SCALAR LOOP
3576     }
3577     SINGLE SCALAR
3578    */
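       // The underlying recurrence is result = 31 * result + element (the usual
       // polynomial hash). The vector loop consumes 32 elements per iteration: the
       // scalar 'result' and each 8-lane accumulator are multiplied by the first
       // arrays_hashcode_powers_of_31 entry ('next' / its broadcast 'vnext'), 8
       // widened elements are added to each accumulator, and after the loop the
       // accumulators are scaled by the remaining table entries and reduced into
       // 'result'.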
3579 
3580   cmpl(cnt1, 32);
3581   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3582 
3583   // cnt1 >= 32 && generate_vectorized_loop
3584   xorl(index, index);
3585 
3586   // vresult = IntVector.zero(I256);
3587   for (int idx = 0; idx < 4; idx++) {
3588     vpxor(vresult[idx], vresult[idx]);
3589   }
3590   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3591   Register bound = tmp2;
3592   Register next = tmp3;
3593   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3594   movl(next, Address(tmp2, 0));
3595   movdl(vnext, next);
3596   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3597 
3598   // index = 0;
3599   // bound = cnt1 & ~(32 - 1);
3600   movl(bound, cnt1);
3601   andl(bound, ~(32 - 1));
3602   // for (; index < bound; index += 32) {
3603   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3604   // result *= next;
3605   imull(result, next);
3606   // Loop fission to front-load the cost of fetching from memory; OOO execution
3607   // can then hopefully do a better job of prefetching.
3608   for (int idx = 0; idx < 4; idx++) {
3609     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3610   }
3611   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3612   for (int idx = 0; idx < 4; idx++) {
3613     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3614     arrays_hashcode_elvcast(vtmp[idx], eltype);
3615     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3616   }
3617   // index += 32;
3618   addl(index, 32);
3619   // index < bound;
3620   cmpl(index, bound);
3621   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3622   // }
3623 
3624   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3625   subl(cnt1, bound);
3626   // release bound
3627 
3628   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3629   for (int idx = 0; idx < 4; idx++) {
3630     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3631     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3632     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3633   }
3634   // result += vresult.reduceLanes(ADD);
3635   for (int idx = 0; idx < 4; idx++) {
3636     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3637   }
3638 
3639   // } else if (cnt1 < 32) {
3640 
3641   bind(SHORT_UNROLLED_BEGIN);
3642   // int i = 1;
3643   movl(index, 1);
3644   cmpl(index, cnt1);
3645   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3646 
3647   // for (; i < cnt1 ; i += 2) {
3648   bind(SHORT_UNROLLED_LOOP_BEGIN);
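       // Unrolled-by-two step: result = result * 31^2 + e[i-1] * 31 + e[i];
       // 961 == 31 * 31, and 31 * x is computed as (x << 5) - x.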
3649   movl(tmp3, 961);
3650   imull(result, tmp3);
3651   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3652   movl(tmp3, tmp2);
3653   shll(tmp3, 5);
3654   subl(tmp3, tmp2);
3655   addl(result, tmp3);
3656   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3657   addl(result, tmp3);
3658   addl(index, 2);
3659   cmpl(index, cnt1);
3660   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3661 
3662   // }
3663   // if (i >= cnt1) {
3664   bind(SHORT_UNROLLED_LOOP_EXIT);
3665   jccb(Assembler::greater, END);
3666   movl(tmp2, result);
3667   shll(result, 5);
3668   subl(result, tmp2);
3669   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3670   addl(result, tmp3);
3671   // }
3672   bind(END);
3673 
3674   BLOCK_COMMENT("} // arrays_hashcode");
3675 
3676 } // arrays_hashcode
3677 
3678 // helper function for string_compare
3679 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3680                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3681                                            Address::ScaleFactor scale2, Register index, int ae) {
3682   if (ae == StrIntrinsicNode::LL) {
3683     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3684     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3685   } else if (ae == StrIntrinsicNode::UU) {
3686     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3687     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3688   } else {
3689     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3690     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3691   }
3692 }
3693 
3694 // Compare strings, used for char[] and byte[].
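     // Returns a compareTo-style value in 'result': the difference of the first
     // mismatching elements, or the difference of the string lengths when one
     // string is a prefix of the other.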
3695 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3696                                        Register cnt1, Register cnt2, Register result,
3697                                        XMMRegister vec1, int ae, KRegister mask) {
3698   ShortBranchVerifier sbv(this);
3699   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3700   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3701   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3702   int stride2x2 = 0x40;
3703   Address::ScaleFactor scale = Address::no_scale;
3704   Address::ScaleFactor scale1 = Address::no_scale;
3705   Address::ScaleFactor scale2 = Address::no_scale;
3706 
3707   if (ae != StrIntrinsicNode::LL) {
3708     stride2x2 = 0x20;
3709   }
3710 
3711   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3712     shrl(cnt2, 1);
3713   }
3714   // Compute the minimum of the string lengths and the
3715   // difference of the string lengths (stack).
3716   // Do the conditional move stuff
3717   movl(result, cnt1);
3718   subl(cnt1, cnt2);
3719   push(cnt1);
3720   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3721 
3722   // Is the minimum length zero?
3723   testl(cnt2, cnt2);
3724   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3725   if (ae == StrIntrinsicNode::LL) {
3726     // Load first bytes
3727     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3728     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3729   } else if (ae == StrIntrinsicNode::UU) {
3730     // Load first characters
3731     load_unsigned_short(result, Address(str1, 0));
3732     load_unsigned_short(cnt1, Address(str2, 0));
3733   } else {
3734     load_unsigned_byte(result, Address(str1, 0));
3735     load_unsigned_short(cnt1, Address(str2, 0));
3736   }
3737   subl(result, cnt1);
3738   jcc(Assembler::notZero,  POP_LABEL);
3739 
3740   if (ae == StrIntrinsicNode::UU) {
3741     // Divide length by 2 to get number of chars
3742     shrl(cnt2, 1);
3743   }
3744   cmpl(cnt2, 1);
3745   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3746 
3747   // Check if the strings start at the same location and setup scale and stride
3748   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3749     cmpptr(str1, str2);
3750     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3751     if (ae == StrIntrinsicNode::LL) {
3752       scale = Address::times_1;
3753       stride = 16;
3754     } else {
3755       scale = Address::times_2;
3756       stride = 8;
3757     }
3758   } else {
3759     scale1 = Address::times_1;
3760     scale2 = Address::times_2;
3761     // scale not used
3762     stride = 8;
3763   }
3764 
3765   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3766     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3767     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3768     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3769     Label COMPARE_TAIL_LONG;
3770     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3771 
3772     int pcmpmask = 0x19;
3773     if (ae == StrIntrinsicNode::LL) {
3774       pcmpmask &= ~0x01;
3775     }
3776 
3777     // Setup to compare 16-chars (32-bytes) vectors,
3778     // start from first character again because it has aligned address.
3779     if (ae == StrIntrinsicNode::LL) {
3780       stride2 = 32;
3781     } else {
3782       stride2 = 16;
3783     }
3784     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3785       adr_stride = stride << scale;
3786     } else {
3787       adr_stride1 = 8;  //stride << scale1;
3788       adr_stride2 = 16; //stride << scale2;
3789     }
3790 
3791     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3792     // rax and rdx are used by pcmpestri as element counters
3793     movl(result, cnt2);
3794     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3795     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3796 
3797     // fast path : compare first 2 8-char vectors.
3798     bind(COMPARE_16_CHARS);
3799     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3800       movdqu(vec1, Address(str1, 0));
3801     } else {
3802       pmovzxbw(vec1, Address(str1, 0));
3803     }
3804     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3805     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3806 
3807     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3808       movdqu(vec1, Address(str1, adr_stride));
3809       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3810     } else {
3811       pmovzxbw(vec1, Address(str1, adr_stride1));
3812       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3813     }
3814     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3815     addl(cnt1, stride);
3816 
3817     // Compare the characters at index in cnt1
3818     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3819     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3820     subl(result, cnt2);
3821     jmp(POP_LABEL);
3822 
3823     // Setup the registers to start vector comparison loop
3824     bind(COMPARE_WIDE_VECTORS);
3825     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3826       lea(str1, Address(str1, result, scale));
3827       lea(str2, Address(str2, result, scale));
3828     } else {
3829       lea(str1, Address(str1, result, scale1));
3830       lea(str2, Address(str2, result, scale2));
3831     }
3832     subl(result, stride2);
3833     subl(cnt2, stride2);
3834     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3835     negptr(result);
3836 
3837     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3838     bind(COMPARE_WIDE_VECTORS_LOOP);
3839 
3840 #ifdef _LP64
3841     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3842       cmpl(cnt2, stride2x2);
3843       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3844       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3845       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3846 
3847       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3848       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3849         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3850         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3851       } else {
3852         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3853         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3854       }
3855       kortestql(mask, mask);
3856       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3857       addptr(result, stride2x2);  // update since we already compared at this addr
3858       subl(cnt2, stride2x2);      // and sub the size too
3859       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3860 
3861       vpxor(vec1, vec1);
3862       jmpb(COMPARE_WIDE_TAIL);
3863     }//if (VM_Version::supports_avx512vlbw())
3864 #endif // _LP64
3865 
3866 
3867     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3868     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3869       vmovdqu(vec1, Address(str1, result, scale));
3870       vpxor(vec1, Address(str2, result, scale));
3871     } else {
3872       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3873       vpxor(vec1, Address(str2, result, scale2));
3874     }
3875     vptest(vec1, vec1);
3876     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3877     addptr(result, stride2);
3878     subl(cnt2, stride2);
3879     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3880     // clean upper bits of YMM registers
3881     vpxor(vec1, vec1);
3882 
3883     // compare wide vectors tail
3884     bind(COMPARE_WIDE_TAIL);
3885     testptr(result, result);
3886     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3887 
3888     movl(result, stride2);
3889     movl(cnt2, result);
3890     negptr(result);
3891     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3892 
3893     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3894     bind(VECTOR_NOT_EQUAL);
3895     // clean upper bits of YMM registers
3896     vpxor(vec1, vec1);
3897     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3898       lea(str1, Address(str1, result, scale));
3899       lea(str2, Address(str2, result, scale));
3900     } else {
3901       lea(str1, Address(str1, result, scale1));
3902       lea(str2, Address(str2, result, scale2));
3903     }
3904     jmp(COMPARE_16_CHARS);
3905 
3906     // Compare tail chars, length between 1 and 15 chars
3907     bind(COMPARE_TAIL_LONG);
3908     movl(cnt2, result);
3909     cmpl(cnt2, stride);
3910     jcc(Assembler::less, COMPARE_SMALL_STR);
3911 
3912     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3913       movdqu(vec1, Address(str1, 0));
3914     } else {
3915       pmovzxbw(vec1, Address(str1, 0));
3916     }
3917     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3918     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3919     subptr(cnt2, stride);
3920     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3921     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3922       lea(str1, Address(str1, result, scale));
3923       lea(str2, Address(str2, result, scale));
3924     } else {
3925       lea(str1, Address(str1, result, scale1));
3926       lea(str2, Address(str2, result, scale2));
3927     }
3928     negptr(cnt2);
3929     jmpb(WHILE_HEAD_LABEL);
3930 
3931     bind(COMPARE_SMALL_STR);
3932   } else if (UseSSE42Intrinsics) {
3933     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3934     int pcmpmask = 0x19;
3935     // Setup to compare 8-char (16-byte) vectors,
3936     // start from first character again because it has aligned address.
3937     movl(result, cnt2);
3938     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3939     if (ae == StrIntrinsicNode::LL) {
3940       pcmpmask &= ~0x01;
3941     }
3942     jcc(Assembler::zero, COMPARE_TAIL);
3943     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3944       lea(str1, Address(str1, result, scale));
3945       lea(str2, Address(str2, result, scale));
3946     } else {
3947       lea(str1, Address(str1, result, scale1));
3948       lea(str2, Address(str2, result, scale2));
3949     }
3950     negptr(result);
3951 
3952     // pcmpestri
3953     //   inputs:
3954     //     vec1- substring
3955     //     rax - negative string length (elements count)
3956     //     mem - scanned string
3957     //     rdx - string length (elements count)
3958     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3959     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3960     //   outputs:
3961     //     rcx - first mismatched element index
3962     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3963 
3964     bind(COMPARE_WIDE_VECTORS);
3965     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3966       movdqu(vec1, Address(str1, result, scale));
3967       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3968     } else {
3969       pmovzxbw(vec1, Address(str1, result, scale1));
3970       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3971     }
3972     // After pcmpestri cnt1(rcx) contains mismatched element index
3973 
3974     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3975     addptr(result, stride);
3976     subptr(cnt2, stride);
3977     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3978 
3979     // compare wide vectors tail
3980     testptr(result, result);
3981     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3982 
3983     movl(cnt2, stride);
3984     movl(result, stride);
3985     negptr(result);
3986     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3987       movdqu(vec1, Address(str1, result, scale));
3988       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3989     } else {
3990       pmovzxbw(vec1, Address(str1, result, scale1));
3991       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3992     }
3993     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3994 
3995     // Mismatched characters in the vectors
3996     bind(VECTOR_NOT_EQUAL);
3997     addptr(cnt1, result);
3998     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3999     subl(result, cnt2);
4000     jmpb(POP_LABEL);
4001 
4002     bind(COMPARE_TAIL); // limit is zero
4003     movl(cnt2, result);
4004     // Fallthru to tail compare
4005   }
4006   // Shift str2 and str1 to the end of the arrays, negate min
4007   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4008     lea(str1, Address(str1, cnt2, scale));
4009     lea(str2, Address(str2, cnt2, scale));
4010   } else {
4011     lea(str1, Address(str1, cnt2, scale1));
4012     lea(str2, Address(str2, cnt2, scale2));
4013   }
4014   decrementl(cnt2);  // first character was compared already
4015   negptr(cnt2);
4016 
4017   // Compare the rest of the elements
4018   bind(WHILE_HEAD_LABEL);
4019   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4020   subl(result, cnt1);
4021   jccb(Assembler::notZero, POP_LABEL);
4022   increment(cnt2);
4023   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4024 
4025   // Strings are equal up to min length.  Return the length difference.
4026   bind(LENGTH_DIFF_LABEL);
4027   pop(result);
4028   if (ae == StrIntrinsicNode::UU) {
4029     // Divide diff by 2 to get number of chars
4030     sarl(result, 1);
4031   }
4032   jmpb(DONE_LABEL);
4033 
4034 #ifdef _LP64
4035   if (VM_Version::supports_avx512vlbw()) {
4036 
4037     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4038 
4039     kmovql(cnt1, mask);
4040     notq(cnt1);
4041     bsfq(cnt2, cnt1);
4042     if (ae != StrIntrinsicNode::LL) {
4043       // Divide diff by 2 to get number of chars
4044       sarl(cnt2, 1);
4045     }
4046     addq(result, cnt2);
4047     if (ae == StrIntrinsicNode::LL) {
4048       load_unsigned_byte(cnt1, Address(str2, result));
4049       load_unsigned_byte(result, Address(str1, result));
4050     } else if (ae == StrIntrinsicNode::UU) {
4051       load_unsigned_short(cnt1, Address(str2, result, scale));
4052       load_unsigned_short(result, Address(str1, result, scale));
4053     } else {
4054       load_unsigned_short(cnt1, Address(str2, result, scale2));
4055       load_unsigned_byte(result, Address(str1, result, scale1));
4056     }
4057     subl(result, cnt1);
4058     jmpb(POP_LABEL);
4059   }//if (VM_Version::supports_avx512vlbw())
4060 #endif // _LP64
4061 
4062   // Discard the stored length difference
4063   bind(POP_LABEL);
4064   pop(cnt1);
4065 
4066   // That's it
4067   bind(DONE_LABEL);
4068   if (ae == StrIntrinsicNode::UL) {
4069     negl(result);
4070   }
4071 
4072 }
4073 
4074 // Search for a non-ASCII character (negative byte value) in a byte array,
4075 // return the index of the first such character, otherwise the length
4076 // of the array segment searched.
4077 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4078 //   @IntrinsicCandidate
4079 //   public static int countPositives(byte[] ba, int off, int len) {
4080 //     for (int i = off; i < off + len; i++) {
4081 //       if (ba[i] < 0) {
4082 //         return i - off;
4083 //       }
4084 //     }
4085 //     return len;
4086 //   }
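     // Vector strategy: with AVX-512 (and AVX3Threshold == 0) the loop tests 64
     // bytes per iteration with evpcmpgtb against a zero vector; otherwise an AVX2
     // loop vptests 32 bytes at a time against a broadcast 0x80808080 sign-bit mask.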
4087 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4088   Register result, Register tmp1,
4089   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4090   // rsi: byte array
4091   // rcx: len
4092   // rax: result
4093   ShortBranchVerifier sbv(this);
4094   assert_different_registers(ary1, len, result, tmp1);
4095   assert_different_registers(vec1, vec2);
4096   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4097 
4098   movl(result, len); // copy
4099   // len == 0
4100   testl(len, len);
4101   jcc(Assembler::zero, DONE);
4102 
4103   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4104     VM_Version::supports_avx512vlbw() &&
4105     VM_Version::supports_bmi2()) {
4106 
4107     Label test_64_loop, test_tail, BREAK_LOOP;
4108     movl(tmp1, len);
4109     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4110 
4111     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4112     andl(len,  0xffffffc0); // vector count (in chars)
4113     jccb(Assembler::zero, test_tail);
4114 
4115     lea(ary1, Address(ary1, len, Address::times_1));
4116     negptr(len);
4117 
4118     bind(test_64_loop);
4119     // Check whether any of these 64 byte-sized elements is negative
4120     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4121     kortestql(mask1, mask1);
4122     jcc(Assembler::notZero, BREAK_LOOP);
4123 
4124     addptr(len, 64);
4125     jccb(Assembler::notZero, test_64_loop);
4126 
4127     bind(test_tail);
4128     // bail out when there is nothing to be done
4129     testl(tmp1, -1);
4130     jcc(Assembler::zero, DONE);
4131 
4132 
4133     // check the tail for absence of negatives
4134     // ~(~0 << len) applied up to two times (for 32-bit scenario)
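         // For example (illustration only): with tmp1 == 3, ~0 << 3 == ...11111000,
         // so ~(~0 << 3) == 0b111, a mask selecting exactly the three tail bytes.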
4135 #ifdef _LP64
4136     {
4137       Register tmp3_aliased = len;
4138       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4139       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4140       notq(tmp3_aliased);
4141       kmovql(mask2, tmp3_aliased);
4142     }
4143 #else
4144     Label k_init;
4145     jmp(k_init);
4146 
4147     // On 32-bit we cannot build a 64-bit mask in a general purpose register, so we
4148     // move the data required to compose 64 1's into the instruction stream:
4149     // a 64-byte wide series of the elements 0..63, which is later used as the
4150     // compare target against the tail count contained in the tmp1 register.
4151     // The result is a k register holding tmp1 consecutive 1's, counting from
4152     // the least significant bit.
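         // Illustration only: with tmp1 == 5, broadcasting 5 and computing the signed
         // byte compare 5 > {0, 1, 2, ..., 63} sets exactly the five lowest mask bits,
         // i.e. mask2 == 0b11111.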
4153     address tmp = pc();
4154     emit_int64(0x0706050403020100);
4155     emit_int64(0x0F0E0D0C0B0A0908);
4156     emit_int64(0x1716151413121110);
4157     emit_int64(0x1F1E1D1C1B1A1918);
4158     emit_int64(0x2726252423222120);
4159     emit_int64(0x2F2E2D2C2B2A2928);
4160     emit_int64(0x3736353433323130);
4161     emit_int64(0x3F3E3D3C3B3A3938);
4162 
4163     bind(k_init);
4164     lea(len, InternalAddress(tmp));
4165     // create mask to test for negative byte inside a vector
4166     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4167     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4168 
4169 #endif
4170     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4171     ktestq(mask1, mask2);
4172     jcc(Assembler::zero, DONE);
4173 
4174     // do a full check for negative bytes in the tail
4175     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4176                      // ary1 already pointing to the right place
4177     jmpb(TAIL_START);
4178 
4179     bind(BREAK_LOOP);
4180     // At least one byte in the last 64 byte block was negative.
4181     // Set up to look at the last 64 bytes as if they were a tail
4182     lea(ary1, Address(ary1, len, Address::times_1));
4183     addptr(result, len);
4184     // Ignore the very last byte: if all others are positive,
4185     // it must be negative, so we can skip right to the 2+1 byte
4186     // end comparison at this point
4187     orl(result, 63);
4188     movl(len, 63);
4189     // Fallthru to tail compare
4190   } else {
4191 
4192     if (UseAVX >= 2 && UseSSE >= 2) {
4193       // With AVX2, use 32-byte vector compare
4194       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4195 
4196       // Compare 32-byte vectors
4197       testl(len, 0xffffffe0);   // vector count (in bytes)
4198       jccb(Assembler::zero, TAIL_START);
4199 
4200       andl(len, 0xffffffe0);
4201       lea(ary1, Address(ary1, len, Address::times_1));
4202       negptr(len);
4203 
4204       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4205       movdl(vec2, tmp1);
4206       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4207 
4208       bind(COMPARE_WIDE_VECTORS);
4209       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4210       vptest(vec1, vec2);
4211       jccb(Assembler::notZero, BREAK_LOOP);
4212       addptr(len, 32);
4213       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4214 
4215       testl(result, 0x0000001f);   // any bytes remaining?
4216       jcc(Assembler::zero, DONE);
4217 
4218       // Quick test using the already prepared vector mask
4219       movl(len, result);
4220       andl(len, 0x0000001f);
4221       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4222       vptest(vec1, vec2);
4223       jcc(Assembler::zero, DONE);
4224       // There are zeros, jump to the tail to determine exactly where
4225       jmpb(TAIL_START);
4226 
4227       bind(BREAK_LOOP);
4228       // At least one byte in the last 32-byte vector is negative.
4229       // Set up to look at the last 32 bytes as if they were a tail
4230       lea(ary1, Address(ary1, len, Address::times_1));
4231       addptr(result, len);
4232       // Ignore the very last byte: if all others are positive,
4233       // it must be negative, so we can skip right to the 2+1 byte
4234       // end comparison at this point
4235       orl(result, 31);
4236       movl(len, 31);
4237       // Fallthru to tail compare
4238     } else if (UseSSE42Intrinsics) {
4239       // With SSE4.2, use double quad vector compare
4240       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4241 
4242       // Compare 16-byte vectors
4243       testl(len, 0xfffffff0);   // vector count (in bytes)
4244       jcc(Assembler::zero, TAIL_START);
4245 
4246       andl(len, 0xfffffff0);
4247       lea(ary1, Address(ary1, len, Address::times_1));
4248       negptr(len);
4249 
4250       movl(tmp1, 0x80808080);
4251       movdl(vec2, tmp1);
4252       pshufd(vec2, vec2, 0);
4253 
4254       bind(COMPARE_WIDE_VECTORS);
4255       movdqu(vec1, Address(ary1, len, Address::times_1));
4256       ptest(vec1, vec2);
4257       jccb(Assembler::notZero, BREAK_LOOP);
4258       addptr(len, 16);
4259       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4260 
4261       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4262       jcc(Assembler::zero, DONE);
4263 
4264       // Quick test using the already prepared vector mask
4265       movl(len, result);
4266       andl(len, 0x0000000f);   // tail count (in bytes)
4267       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4268       ptest(vec1, vec2);
4269       jcc(Assembler::zero, DONE);
4270       jmpb(TAIL_START);
4271 
4272       bind(BREAK_LOOP);
4273       // At least one byte in the last 16-byte vector is negative.
4274       // Set up and look at the last 16 bytes as if they were a tail
4275       lea(ary1, Address(ary1, len, Address::times_1));
4276       addptr(result, len);
4277       // Ignore the very last byte: if all others are positive,
4278       // it must be negative, so we can skip right to the 2+1 byte
4279       // end comparison at this point
4280       orl(result, 15);
4281       movl(len, 15);
4282       // Fallthru to tail compare
4283     }
4284   }
4285 
4286   bind(TAIL_START);
4287   // Compare 4-byte vectors
4288   andl(len, 0xfffffffc); // vector count (in bytes)
4289   jccb(Assembler::zero, COMPARE_CHAR);
4290 
4291   lea(ary1, Address(ary1, len, Address::times_1));
4292   negptr(len);
4293 
4294   bind(COMPARE_VECTORS);
4295   movl(tmp1, Address(ary1, len, Address::times_1));
4296   andl(tmp1, 0x80808080);
4297   jccb(Assembler::notZero, TAIL_ADJUST);
4298   addptr(len, 4);
4299   jccb(Assembler::notZero, COMPARE_VECTORS);
4300 
4301   // Compare trailing char (final 2-3 bytes), if any
4302   bind(COMPARE_CHAR);
4303 
4304   testl(result, 0x2);   // tail  char
4305   jccb(Assembler::zero, COMPARE_BYTE);
4306   load_unsigned_short(tmp1, Address(ary1, 0));
4307   andl(tmp1, 0x00008080);
4308   jccb(Assembler::notZero, CHAR_ADJUST);
4309   lea(ary1, Address(ary1, 2));
4310 
4311   bind(COMPARE_BYTE);
4312   testl(result, 0x1);   // tail  byte
4313   jccb(Assembler::zero, DONE);
4314   load_unsigned_byte(tmp1, Address(ary1, 0));
4315   testl(tmp1, 0x00000080);
4316   jccb(Assembler::zero, DONE);
4317   subptr(result, 1);
4318   jmpb(DONE);
4319 
4320   bind(TAIL_ADJUST);
4321   // there are negative bits in the last 4 byte block.
4322   // Adjust result and check the next three bytes
4323   addptr(result, len);
4324   orl(result, 3);
4325   lea(ary1, Address(ary1, len, Address::times_1));
4326   jmpb(COMPARE_CHAR);
4327 
4328   bind(CHAR_ADJUST);
4329   // We are looking at a char + optional byte tail, and found that one
4330   // of the bytes in the char is negative. Adjust the result, check the
4331   // first byte and readjust if needed.
4332   andl(result, 0xfffffffc);
4333   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4334   jccb(Assembler::notZero, DONE);
4335   addptr(result, 1);
4336 
4337   // That's it
4338   bind(DONE);
4339   if (UseAVX >= 2 && UseSSE >= 2) {
4340     // clean upper bits of YMM registers
4341     vpxor(vec1, vec1);
4342     vpxor(vec2, vec2);
4343   }
4344 }
4345 
4346 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4347 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4348                                       Register limit, Register result, Register chr,
4349                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4350                                       KRegister mask, bool expand_ary2) {
4351   // for expand_ary2, limit is the (smaller) size of the second array.
4352   ShortBranchVerifier sbv(this);
4353   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4354 
4355   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4356          "Expansion only implemented for AVX2");
4357 
4358   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4359   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4360 
4361   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4362   int scaleIncr = expand_ary2 ? 8 : 16;
4363 
4364   if (is_array_equ) {
4365     // Check the input args
4366     cmpoop(ary1, ary2);
4367     jcc(Assembler::equal, TRUE_LABEL);
4368 
4369     // Need additional checks for arrays_equals.
4370     testptr(ary1, ary1);
4371     jcc(Assembler::zero, FALSE_LABEL);
4372     testptr(ary2, ary2);
4373     jcc(Assembler::zero, FALSE_LABEL);
4374 
4375     // Check the lengths
4376     movl(limit, Address(ary1, length_offset));
4377     cmpl(limit, Address(ary2, length_offset));
4378     jcc(Assembler::notEqual, FALSE_LABEL);
4379   }
4380 
4381   // count == 0
4382   testl(limit, limit);
4383   jcc(Assembler::zero, TRUE_LABEL);
4384 
4385   if (is_array_equ) {
4386     // Load array address
4387     lea(ary1, Address(ary1, base_offset));
4388     lea(ary2, Address(ary2, base_offset));
4389   }
4390 
4391   if (is_array_equ && is_char) {
4392     // arrays_equals when used for char[].
4393     shll(limit, 1);      // byte count != 0
4394   }
4395   movl(result, limit); // copy
4396 
4397   if (UseAVX >= 2) {
4398     // With AVX2, use 32-byte vector compare
4399     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4400 
4401     // Compare 32-byte vectors
4402     if (expand_ary2) {
4403       andl(result, 0x0000000f);  //   tail count (in bytes)
4404       andl(limit, 0xfffffff0);   // vector count (in bytes)
4405       jcc(Assembler::zero, COMPARE_TAIL);
4406     } else {
4407       andl(result, 0x0000001f);  //   tail count (in bytes)
4408       andl(limit, 0xffffffe0);   // vector count (in bytes)
4409       jcc(Assembler::zero, COMPARE_TAIL_16);
4410     }
4411 
4412     lea(ary1, Address(ary1, limit, scaleFactor));
4413     lea(ary2, Address(ary2, limit, Address::times_1));
4414     negptr(limit);
4415 
4416 #ifdef _LP64
4417     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4418       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4419 
4420       cmpl(limit, -64);
4421       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4422 
4423       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4424 
4425       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4426       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4427       kortestql(mask, mask);
4428       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4429       addptr(limit, 64);  // update since we already compared at this addr
4430       cmpl(limit, -64);
4431       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4432 
4433       // At this point we may still need to compare -limit+result bytes.
4434       // We could execute the next two instructions and just continue via the non-wide path:
4435       //  cmpl(limit, 0);
4436       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4437       // But since we stopped at the points ary{1,2}+limit which are
4438       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4439       // (|limit| <= 32 and result < 32),
4440       // we may just compare the last 64 bytes.
4441       //
4442       addptr(result, -64);   // it is safe, because we just came from this area
4443       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4444       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4445       kortestql(mask, mask);
4446       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4447 
4448       jmp(TRUE_LABEL);
4449 
4450       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4451 
4452     }//if (VM_Version::supports_avx512vlbw())
4453 #endif //_LP64
4454     bind(COMPARE_WIDE_VECTORS);
4455     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4456     if (expand_ary2) {
4457       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4458     } else {
4459       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4460     }
4461     vpxor(vec1, vec2);
4462 
4463     vptest(vec1, vec1);
4464     jcc(Assembler::notZero, FALSE_LABEL);
4465     addptr(limit, scaleIncr * 2);
4466     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4467 
4468     testl(result, result);
4469     jcc(Assembler::zero, TRUE_LABEL);
4470 
4471     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4472     if (expand_ary2) {
4473       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4474     } else {
4475       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4476     }
4477     vpxor(vec1, vec2);
4478 
4479     vptest(vec1, vec1);
4480     jcc(Assembler::notZero, FALSE_LABEL);
4481     jmp(TRUE_LABEL);
4482 
4483     bind(COMPARE_TAIL_16); // limit is zero
4484     movl(limit, result);
4485 
4486     // Compare 16-byte chunks
4487     andl(result, 0x0000000f);  //   tail count (in bytes)
4488     andl(limit, 0xfffffff0);   // vector count (in bytes)
4489     jcc(Assembler::zero, COMPARE_TAIL);
4490 
4491     lea(ary1, Address(ary1, limit, scaleFactor));
4492     lea(ary2, Address(ary2, limit, Address::times_1));
4493     negptr(limit);
4494 
4495     bind(COMPARE_WIDE_VECTORS_16);
4496     movdqu(vec1, Address(ary1, limit, scaleFactor));
4497     if (expand_ary2) {
4498       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4499     } else {
4500       movdqu(vec2, Address(ary2, limit, Address::times_1));
4501     }
4502     pxor(vec1, vec2);
4503 
4504     ptest(vec1, vec1);
4505     jcc(Assembler::notZero, FALSE_LABEL);
4506     addptr(limit, scaleIncr);
4507     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4508 
4509     bind(COMPARE_TAIL); // limit is zero
4510     movl(limit, result);
4511     // Fallthru to tail compare
4512   } else if (UseSSE42Intrinsics) {
4513     // With SSE4.2, use double quad vector compare
4514     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4515 
4516     // Compare 16-byte vectors
4517     andl(result, 0x0000000f);  //   tail count (in bytes)
4518     andl(limit, 0xfffffff0);   // vector count (in bytes)
4519     jcc(Assembler::zero, COMPARE_TAIL);
4520 
4521     lea(ary1, Address(ary1, limit, Address::times_1));
4522     lea(ary2, Address(ary2, limit, Address::times_1));
4523     negptr(limit);
4524 
4525     bind(COMPARE_WIDE_VECTORS);
4526     movdqu(vec1, Address(ary1, limit, Address::times_1));
4527     movdqu(vec2, Address(ary2, limit, Address::times_1));
4528     pxor(vec1, vec2);
4529 
4530     ptest(vec1, vec1);
4531     jcc(Assembler::notZero, FALSE_LABEL);
4532     addptr(limit, 16);
4533     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4534 
4535     testl(result, result);
4536     jcc(Assembler::zero, TRUE_LABEL);
4537 
4538     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4539     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4540     pxor(vec1, vec2);
4541 
4542     ptest(vec1, vec1);
4543     jccb(Assembler::notZero, FALSE_LABEL);
4544     jmpb(TRUE_LABEL);
4545 
4546     bind(COMPARE_TAIL); // limit is zero
4547     movl(limit, result);
4548     // Fallthru to tail compare
4549   }
4550 
4551   // Compare 4-byte vectors
4552   if (expand_ary2) {
4553     testl(result, result);
4554     jccb(Assembler::zero, TRUE_LABEL);
4555   } else {
4556     andl(limit, 0xfffffffc); // vector count (in bytes)
4557     jccb(Assembler::zero, COMPARE_CHAR);
4558   }
4559 
4560   lea(ary1, Address(ary1, limit, scaleFactor));
4561   lea(ary2, Address(ary2, limit, Address::times_1));
4562   negptr(limit);
4563 
4564   bind(COMPARE_VECTORS);
4565   if (expand_ary2) {
4566     // There are no "vector" operations for bytes to shorts
4567     movzbl(chr, Address(ary2, limit, Address::times_1));
4568     cmpw(Address(ary1, limit, Address::times_2), chr);
4569     jccb(Assembler::notEqual, FALSE_LABEL);
4570     addptr(limit, 1);
4571     jcc(Assembler::notZero, COMPARE_VECTORS);
4572     jmp(TRUE_LABEL);
4573   } else {
4574     movl(chr, Address(ary1, limit, Address::times_1));
4575     cmpl(chr, Address(ary2, limit, Address::times_1));
4576     jccb(Assembler::notEqual, FALSE_LABEL);
4577     addptr(limit, 4);
4578     jcc(Assembler::notZero, COMPARE_VECTORS);
4579   }
4580 
4581   // Compare trailing char (final 2 bytes), if any
4582   bind(COMPARE_CHAR);
4583   testl(result, 0x2);   // tail  char
4584   jccb(Assembler::zero, COMPARE_BYTE);
4585   load_unsigned_short(chr, Address(ary1, 0));
4586   load_unsigned_short(limit, Address(ary2, 0));
4587   cmpl(chr, limit);
4588   jccb(Assembler::notEqual, FALSE_LABEL);
4589 
4590   if (is_array_equ && is_char) {
4591     bind(COMPARE_BYTE);
4592   } else {
4593     lea(ary1, Address(ary1, 2));
4594     lea(ary2, Address(ary2, 2));
4595 
4596     bind(COMPARE_BYTE);
4597     testl(result, 0x1);   // tail  byte
4598     jccb(Assembler::zero, TRUE_LABEL);
4599     load_unsigned_byte(chr, Address(ary1, 0));
4600     load_unsigned_byte(limit, Address(ary2, 0));
4601     cmpl(chr, limit);
4602     jccb(Assembler::notEqual, FALSE_LABEL);
4603   }
4604   bind(TRUE_LABEL);
4605   movl(result, 1);   // return true
4606   jmpb(DONE);
4607 
4608   bind(FALSE_LABEL);
4609   xorl(result, result); // return false
4610 
4611   // That's it
4612   bind(DONE);
4613   if (UseAVX >= 2) {
4614     // clean upper bits of YMM registers
4615     vpxor(vec1, vec1);
4616     vpxor(vec2, vec2);
4617   }
4618 }
4619 
4620 #ifdef _LP64
4621 
4622 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4623 #define __ masm.
4624   Register dst = stub.data<0>();
4625   XMMRegister src = stub.data<1>();
4626   address target = stub.data<2>();
4627   __ bind(stub.entry());
4628   __ subptr(rsp, 8);
4629   __ movdbl(Address(rsp), src);
4630   __ call(RuntimeAddress(target));
4631   __ pop(dst);
4632   __ jmp(stub.continuation());
4633 #undef __
4634 }
4635 
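     // Convert the float/double in 'src' to an int/long in 'dst'. The truncating
     // cvtt* conversions return the sign-bit sentinel (0x80000000 or
     // 0x8000000000000000) for NaN and out-of-range inputs; when the result equals
     // that sentinel we take the out-of-line C2 stub above, which calls the matching
     // f2i/f2l/d2i/d2l fixup routine to produce the Java-defined result.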
4636 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4637   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4638   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4639 
4640   address slowpath_target;
4641   if (dst_bt == T_INT) {
4642     if (src_bt == T_FLOAT) {
4643       cvttss2sil(dst, src);
4644       cmpl(dst, 0x80000000);
4645       slowpath_target = StubRoutines::x86::f2i_fixup();
4646     } else {
4647       cvttsd2sil(dst, src);
4648       cmpl(dst, 0x80000000);
4649       slowpath_target = StubRoutines::x86::d2i_fixup();
4650     }
4651   } else {
4652     if (src_bt == T_FLOAT) {
4653       cvttss2siq(dst, src);
4654       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4655       slowpath_target = StubRoutines::x86::f2l_fixup();
4656     } else {
4657       cvttsd2siq(dst, src);
4658       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4659       slowpath_target = StubRoutines::x86::d2l_fixup();
4660     }
4661   }
4662 
4663   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4664   jcc(Assembler::equal, stub->entry());
4665   bind(stub->continuation());
4666 }
4667 
4668 #endif // _LP64
4669 
4670 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4671                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4672   switch(ideal_opc) {
4673     case Op_LShiftVS:
4674       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4675     case Op_LShiftVI:
4676       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4677     case Op_LShiftVL:
4678       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4679     case Op_RShiftVS:
4680       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4681     case Op_RShiftVI:
4682       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4683     case Op_RShiftVL:
4684       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4685     case Op_URShiftVS:
4686       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4687     case Op_URShiftVI:
4688       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4689     case Op_URShiftVL:
4690       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4691     case Op_RotateRightV:
4692       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4693     case Op_RotateLeftV:
4694       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4695     default:
4696       fatal("Unsupported masked operation"); break;
4697   }
4698 }
4699 
4700 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4701                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4702                                     bool is_varshift) {
4703   switch (ideal_opc) {
4704     case Op_AddVB:
4705       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_AddVS:
4707       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_AddVI:
4709       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_AddVL:
4711       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_AddVF:
4713       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_AddVD:
4715       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_SubVB:
4717       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_SubVS:
4719       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_SubVI:
4721       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_SubVL:
4723       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_SubVF:
4725       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4726     case Op_SubVD:
4727       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4728     case Op_MulVS:
4729       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4730     case Op_MulVI:
4731       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4732     case Op_MulVL:
4733       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4734     case Op_MulVF:
4735       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4736     case Op_MulVD:
4737       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4738     case Op_DivVF:
4739       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_DivVD:
4741       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_SqrtVF:
4743       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4744     case Op_SqrtVD:
4745       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_AbsVB:
4747       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4748     case Op_AbsVS:
4749       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4750     case Op_AbsVI:
4751       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4752     case Op_AbsVL:
4753       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4754     case Op_FmaVF:
4755       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4756     case Op_FmaVD:
4757       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4758     case Op_VectorRearrange:
4759       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4760     case Op_LShiftVS:
4761       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4762     case Op_LShiftVI:
4763       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4764     case Op_LShiftVL:
4765       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4766     case Op_RShiftVS:
4767       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4768     case Op_RShiftVI:
4769       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4770     case Op_RShiftVL:
4771       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4772     case Op_URShiftVS:
4773       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4774     case Op_URShiftVI:
4775       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4776     case Op_URShiftVL:
4777       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4778     case Op_RotateLeftV:
4779       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_RotateRightV:
4781       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4782     case Op_MaxV:
4783       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4784     case Op_MinV:
4785       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_XorV:
4787       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_OrV:
4789       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_AndV:
4791       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4792     default:
4793       fatal("Unsupported masked operation"); break;
4794   }
4795 }
4796 
4797 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4798                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4799   switch (ideal_opc) {
4800     case Op_AddVB:
4801       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_AddVS:
4803       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_AddVI:
4805       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_AddVL:
4807       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4808     case Op_AddVF:
4809       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4810     case Op_AddVD:
4811       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4812     case Op_SubVB:
4813       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4814     case Op_SubVS:
4815       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4816     case Op_SubVI:
4817       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4818     case Op_SubVL:
4819       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4820     case Op_SubVF:
4821       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4822     case Op_SubVD:
4823       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4824     case Op_MulVS:
4825       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4826     case Op_MulVI:
4827       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4828     case Op_MulVL:
4829       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4830     case Op_MulVF:
4831       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4832     case Op_MulVD:
4833       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4834     case Op_DivVF:
4835       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4836     case Op_DivVD:
4837       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4838     case Op_FmaVF:
4839       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_FmaVD:
4841       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_MaxV:
4843       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_MinV:
4845       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_XorV:
4847       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_OrV:
4849       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_AndV:
4851       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4852     default:
4853       fatal("Unsupported masked operation"); break;
4854   }
4855 }
4856 
4857 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4858                                   KRegister src1, KRegister src2) {
4859   BasicType etype = T_ILLEGAL;
4860   switch(mask_len) {
4861     case 2:
4862     case 4:
4863     case 8:  etype = T_BYTE; break;
4864     case 16: etype = T_SHORT; break;
4865     case 32: etype = T_INT; break;
4866     case 64: etype = T_LONG; break;
4867     default: fatal("Unsupported type"); break;
4868   }
4869   assert(etype != T_ILLEGAL, "");
4870   switch(ideal_opc) {
4871     case Op_AndVMask:
4872       kand(etype, dst, src1, src2); break;
4873     case Op_OrVMask:
4874       kor(etype, dst, src1, src2); break;
4875     case Op_XorVMask:
4876       kxor(etype, dst, src1, src2); break;
4877     default:
4878       fatal("Unsupported masked operation"); break;
4879   }
4880 }
4881 
4882 /*
4883  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4884  * If src is NaN, the result is 0.
4885  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4886  * the result is equal to the value of Integer.MIN_VALUE.
4887  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4888  * the result is equal to the value of Integer.MAX_VALUE.
4889  */
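     // A scalar C++ sketch of the semantics described above (illustration only,
     // not emitted code; names are ad hoc):
     //
     //   int32_t f2i_ref(float f) {
     //     if (f != f)                return 0;          // NaN maps to zero
     //     if (f <= (float)INT32_MIN) return INT32_MIN;  // includes -Inf
     //     if (f >= (float)INT32_MAX) return INT32_MAX;  // includes +Inf
     //     return (int32_t)f;                            // in-range truncation
     //   }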
4890 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4891                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4892                                                                    Register rscratch, AddressLiteral float_sign_flip,
4893                                                                    int vec_enc) {
4894   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4895   Label done;
4896   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4897   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4898   vptest(xtmp2, xtmp2, vec_enc);
4899   jccb(Assembler::equal, done);
4900 
4901   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4902   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4903 
4904   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4905   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4906   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4907 
4908   // Recompute the mask for the remaining special values.
4909   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4910   // Extract SRC values corresponding to TRUE mask lanes.
4911   vpand(xtmp4, xtmp2, src, vec_enc);
4912   // Flip the mask bits so that the MSB of the mask lanes corresponding to +ve
4913   // special values is set.
4914   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4915 
4916   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4917   bind(done);
4918 }
4919 
4920 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4921                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4922                                                                     Register rscratch, AddressLiteral float_sign_flip,
4923                                                                     int vec_enc) {
4924   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4925   Label done;
4926   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4927   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4928   kortestwl(ktmp1, ktmp1);
4929   jccb(Assembler::equal, done);
4930 
4931   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4932   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4933   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4934 
4935   kxorwl(ktmp1, ktmp1, ktmp2);
4936   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4937   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4938   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4939   bind(done);
4940 }
4941 
4942 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4943                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4944                                                                      Register rscratch, AddressLiteral double_sign_flip,
4945                                                                      int vec_enc) {
4946   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4947 
4948   Label done;
4949   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4950   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4951   kortestwl(ktmp1, ktmp1);
4952   jccb(Assembler::equal, done);
4953 
4954   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4955   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4956   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4957 
4958   kxorwl(ktmp1, ktmp1, ktmp2);
4959   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4960   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4961   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4962   bind(done);
4963 }
4964 
4965 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4966                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4967                                                                      Register rscratch, AddressLiteral float_sign_flip,
4968                                                                      int vec_enc) {
4969   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4970   Label done;
4971   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4972   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4973   kortestwl(ktmp1, ktmp1);
4974   jccb(Assembler::equal, done);
4975 
4976   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4977   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4978   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4979 
4980   kxorwl(ktmp1, ktmp1, ktmp2);
4981   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4982   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4983   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4984   bind(done);
4985 }
4986 
4987 /*
4988  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4989  * If src is NaN, the result is 0.
4990  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4991  * the result is equal to the value of Long.MIN_VALUE.
4992  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4993  * the result is equal to the value of Long.MAX_VALUE.
4994  */
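     // A scalar C++ sketch of the semantics described above (illustration only,
     // not emitted code; names are ad hoc):
     //
     //   int64_t d2l_ref(double d) {
     //     if (d != d)                 return 0;          // NaN maps to zero
     //     if (d <= (double)INT64_MIN) return INT64_MIN;  // includes -Inf
     //     if (d >= (double)INT64_MAX) return INT64_MAX;  // includes +Inf
     //     return (int64_t)d;                             // in-range truncation
     //   }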
4995 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4996                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4997                                                                       Register rscratch, AddressLiteral double_sign_flip,
4998                                                                       int vec_enc) {
4999   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5000 
5001   Label done;
5002   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5003   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5004   kortestwl(ktmp1, ktmp1);
5005   jccb(Assembler::equal, done);
5006 
5007   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5008   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5009   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5010 
5011   kxorwl(ktmp1, ktmp1, ktmp2);
5012   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5013   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5014   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5015   bind(done);
5016 }
5017 
5018 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5019                                                              XMMRegister xtmp, int index, int vec_enc) {
5020   assert(vec_enc < Assembler::AVX_512bit, "");
5021   if (vec_enc == Assembler::AVX_256bit) {
5022     vextractf128_high(xtmp, src);
5023     vshufps(dst, src, xtmp, index, vec_enc);
5024   } else {
5025     vshufps(dst, src, zero, index, vec_enc);
5026   }
5027 }
5028 
5029 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5030                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5031                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5032   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5033 
5034   Label done;
5035   // Compare the destination lanes with float_sign_flip
5036   // value to get mask for all special values.
5037   movdqu(xtmp1, float_sign_flip, rscratch);
5038   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5039   ptest(xtmp2, xtmp2);
5040   jccb(Assembler::equal, done);
5041 
5042   // Flip float_sign_flip to get max integer value.
5043   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5044   pxor(xtmp1, xtmp4);
5045 
5046   // Set destination lanes corresponding to unordered source lanes to zero.
5047   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5048   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5049 
5050   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5051   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5052   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5053 
5054   // Recompute the mask for the remaining special values.
5055   pxor(xtmp2, xtmp3);
5056   // Extract mask corresponding to non-negative source lanes.
5057   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5058 
5059   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5060   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5061   pand(xtmp3, xtmp2);
5062 
5063   // Replace destination lanes holding the special value (0x80000000) with max int
5064   // if the corresponding source lane holds a +ve value.
5065   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5066   bind(done);
5067 }
5068 
5069 
5070 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5071                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5072   switch(to_elem_bt) {
5073     case T_SHORT:
5074       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5075       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5076       vpackusdw(dst, dst, zero, vec_enc);
5077       if (vec_enc == Assembler::AVX_256bit) {
5078         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5079       }
5080       break;
5081     case  T_BYTE:
5082       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5083       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5084       vpackusdw(dst, dst, zero, vec_enc);
5085       if (vec_enc == Assembler::AVX_256bit) {
5086         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5087       }
5088       vpackuswb(dst, dst, zero, vec_enc);
5089       break;
5090     default: assert(false, "%s", type2name(to_elem_bt));
5091   }
5092 }
5093 
5094 /*
5095  * Algorithm for vector D2L and F2I conversions:
5096  * a) Perform the vector D2L/F2I cast.
5097  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
5098  *    That value signifies that the source could be any of the special floating point
5099  *    values (NaN, -Inf, Inf, Max, -Min).
5100  * c) Set the destination to zero if the source is NaN.
5101  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5102  */
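     // Per-lane sketch of the fixup performed after the vector cast (illustration
     // only; 'casted' stands for the raw truncating-conversion result of a lane):
     //
     //   int32_t fixup_lane(float src, int32_t casted) {
     //     if (casted != INT32_MIN) return casted;     // fast path: not a special value
     //     if (src != src)          return 0;          // NaN
     //     if (src > 0.0f)          return INT32_MAX;  // +Inf or too large
     //     return casted;                              // -Inf or too small: keep MinInt
     //   }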
5103 
5104 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5105                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5106                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5107   int to_elem_sz = type2aelembytes(to_elem_bt);
5108   assert(to_elem_sz <= 4, "");
5109   vcvttps2dq(dst, src, vec_enc);
5110   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5111   if (to_elem_sz < 4) {
5112     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5113     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5114   }
5115 }
5116 
5117 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5118                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5119                                             Register rscratch, int vec_enc) {
5120   int to_elem_sz = type2aelembytes(to_elem_bt);
5121   assert(to_elem_sz <= 4, "");
5122   vcvttps2dq(dst, src, vec_enc);
5123   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5124   switch(to_elem_bt) {
5125     case T_INT:
5126       break;
5127     case T_SHORT:
5128       evpmovdw(dst, dst, vec_enc);
5129       break;
5130     case T_BYTE:
5131       evpmovdb(dst, dst, vec_enc);
5132       break;
5133     default: assert(false, "%s", type2name(to_elem_bt));
5134   }
5135 }
5136 
5137 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5138                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5139                                             Register rscratch, int vec_enc) {
5140   evcvttps2qq(dst, src, vec_enc);
5141   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5142 }
5143 
5144 // Handling for downcasting from double to integer or sub-word types on AVX2.
5145 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5146                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5147                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5148   int to_elem_sz = type2aelembytes(to_elem_bt);
5149   assert(to_elem_sz < 8, "");
5150   vcvttpd2dq(dst, src, vec_enc);
5151   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5152                                               float_sign_flip, vec_enc);
5153   if (to_elem_sz < 4) {
5154     // xtmp4 holds all zero lanes.
5155     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5156   }
5157 }
5158 
5159 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5160                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5161                                             KRegister ktmp2, AddressLiteral sign_flip,
5162                                             Register rscratch, int vec_enc) {
5163   if (VM_Version::supports_avx512dq()) {
5164     evcvttpd2qq(dst, src, vec_enc);
5165     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5166     switch(to_elem_bt) {
5167       case T_LONG:
5168         break;
5169       case T_INT:
5170         evpmovsqd(dst, dst, vec_enc);
5171         break;
5172       case T_SHORT:
5173         evpmovsqd(dst, dst, vec_enc);
5174         evpmovdw(dst, dst, vec_enc);
5175         break;
5176       case T_BYTE:
5177         evpmovsqd(dst, dst, vec_enc);
5178         evpmovdb(dst, dst, vec_enc);
5179         break;
5180       default: assert(false, "%s", type2name(to_elem_bt));
5181     }
5182   } else {
5183     assert(type2aelembytes(to_elem_bt) <= 4, "");
5184     vcvttpd2dq(dst, src, vec_enc);
5185     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5186     switch(to_elem_bt) {
5187       case T_INT:
5188         break;
5189       case T_SHORT:
5190         evpmovdw(dst, dst, vec_enc);
5191         break;
5192       case T_BYTE:
5193         evpmovdb(dst, dst, vec_enc);
5194         break;
5195       default: assert(false, "%s", type2name(to_elem_bt));
5196     }
5197   }
5198 }
5199 
5200 #ifdef _LP64
5201 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5202                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5203                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5204   // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
5205   // and restore the original MXCSR.RC mode afterwards.
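       // Scalar sketch of the rounding step (assuming Math.round-style semantics):
       //   round(x) == (int64_t)floor(x + 0.5); the conversion below rounds towards
       //   -inf because MXCSR.RC has been switched above, and the standard MXCSR is
       //   restored at the end so later code is unaffected.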
5206   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5207 
5208   mov64(tmp, julong_cast(0.5L));
5209   evpbroadcastq(xtmp1, tmp, vec_enc);
5210   vaddpd(xtmp1, src, xtmp1, vec_enc);
5211   evcvtpd2qq(dst, xtmp1, vec_enc);
5212   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5213                                                 double_sign_flip, vec_enc);
5214 
5215   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5216 }
5217 
5218 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5219                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5220                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5221   // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
5222   // and restore the original MXCSR.RC mode afterwards.
5223   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5224 
5225   movl(tmp, jint_cast(0.5));
5226   movq(xtmp1, tmp);
5227   vbroadcastss(xtmp1, xtmp1, vec_enc);
5228   vaddps(xtmp1, src, xtmp1, vec_enc);
5229   vcvtps2dq(dst, xtmp1, vec_enc);
5230   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5231                                               float_sign_flip, vec_enc);
5232 
5233   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5234 }
5235 
5236 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5237                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5238                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5239   // Perform the floor(val + 0.5) operation with the MXCSR.RC mode set to round towards -inf,
5240   // and restore the original MXCSR.RC mode afterwards.
5241   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5242 
5243   movl(tmp, jint_cast(0.5));
5244   movq(xtmp1, tmp);
5245   vbroadcastss(xtmp1, xtmp1, vec_enc);
5246   vaddps(xtmp1, src, xtmp1, vec_enc);
5247   vcvtps2dq(dst, xtmp1, vec_enc);
5248   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5249 
5250   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5251 }
5252 #endif // _LP64
5253 
5254 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5255                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5256   switch (from_elem_bt) {
5257     case T_BYTE:
5258       switch (to_elem_bt) {
5259         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5260         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5261         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5262         default: ShouldNotReachHere();
5263       }
5264       break;
5265     case T_SHORT:
5266       switch (to_elem_bt) {
5267         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5268         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5269         default: ShouldNotReachHere();
5270       }
5271       break;
5272     case T_INT:
5273       assert(to_elem_bt == T_LONG, "");
5274       vpmovzxdq(dst, src, vlen_enc);
5275       break;
5276     default:
5277       ShouldNotReachHere();
5278   }
5279 }
5280 
5281 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5282                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5283   switch (from_elem_bt) {
5284     case T_BYTE:
5285       switch (to_elem_bt) {
5286         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5287         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5288         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5289         default: ShouldNotReachHere();
5290       }
5291       break;
5292     case T_SHORT:
5293       switch (to_elem_bt) {
5294         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5295         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5296         default: ShouldNotReachHere();
5297       }
5298       break;
5299     case T_INT:
5300       assert(to_elem_bt == T_LONG, "");
5301       vpmovsxdq(dst, src, vlen_enc);
5302       break;
5303     default:
5304       ShouldNotReachHere();
5305   }
5306 }
5307 
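     // Convert a vector mask (lanes holding 0 or -1) between element sizes: widening
     // sign-extends, narrowing uses signed-saturating packs (plus a cross-lane permute
     // for 256-bit vectors); both preserve the 0/-1 lane values.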
5308 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5309                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5310   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5311   assert(vlen_enc != AVX_512bit, "");
5312 
5313   int dst_bt_size = type2aelembytes(dst_bt);
5314   int src_bt_size = type2aelembytes(src_bt);
5315   if (dst_bt_size > src_bt_size) {
5316     switch (dst_bt_size / src_bt_size) {
5317       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5318       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5319       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5320       default: ShouldNotReachHere();
5321     }
5322   } else {
5323     assert(dst_bt_size < src_bt_size, "");
5324     switch (src_bt_size / dst_bt_size) {
5325       case 2: {
5326         if (vlen_enc == AVX_128bit) {
5327           vpacksswb(dst, src, src, vlen_enc);
5328         } else {
5329           vpacksswb(dst, src, src, vlen_enc);
5330           vpermq(dst, dst, 0x08, vlen_enc);
5331         }
5332         break;
5333       }
5334       case 4: {
5335         if (vlen_enc == AVX_128bit) {
5336           vpackssdw(dst, src, src, vlen_enc);
5337           vpacksswb(dst, dst, dst, vlen_enc);
5338         } else {
5339           vpackssdw(dst, src, src, vlen_enc);
5340           vpermq(dst, dst, 0x08, vlen_enc);
5341           vpacksswb(dst, dst, dst, AVX_128bit);
5342         }
5343         break;
5344       }
5345       case 8: {
5346         if (vlen_enc == AVX_128bit) {
5347           vpshufd(dst, src, 0x08, vlen_enc);
5348           vpackssdw(dst, dst, dst, vlen_enc);
5349           vpacksswb(dst, dst, dst, vlen_enc);
5350         } else {
5351           vpshufd(dst, src, 0x08, vlen_enc);
5352           vpermq(dst, dst, 0x08, vlen_enc);
5353           vpackssdw(dst, dst, dst, AVX_128bit);
5354           vpacksswb(dst, dst, dst, AVX_128bit);
5355         }
5356         break;
5357       }
5358       default: ShouldNotReachHere();
5359     }
5360   }
5361 }
5362 
5363 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5364                                    bool merge, BasicType bt, int vlen_enc) {
5365   if (bt == T_INT) {
5366     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5367   } else {
5368     assert(bt == T_LONG, "");
5369     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5370   }
5371 }
5372 
5373 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5374                                    bool merge, BasicType bt, int vlen_enc) {
5375   if (bt == T_INT) {
5376     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5377   } else {
5378     assert(bt == T_LONG, "");
5379     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5380   }
5381 }
5382 
5383 #ifdef _LP64
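     // Expand the low 'mask_len' bits of 'src' into a vector of byte lanes holding
     // 0 or 1, eight bits at a time, by depositing them with PDEP against the
     // 0x0101010101010101 pattern.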
5384 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5385                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5386                                                int vec_enc) {
5387   int index = 0;
5388   int vindex = 0;
5389   mov64(rtmp1, 0x0101010101010101L);
5390   pdepq(rtmp1, src, rtmp1);
5391   if (mask_len > 8) {
5392     movq(rtmp2, src);
5393     vpxor(xtmp, xtmp, xtmp, vec_enc);
5394     movq(xtmp, rtmp1);
5395   }
5396   movq(dst, rtmp1);
5397 
5398   mask_len -= 8;
5399   while (mask_len > 0) {
5400     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5401     index++;
5402     if ((index % 2) == 0) {
5403       pxor(xtmp, xtmp);
5404     }
5405     mov64(rtmp1, 0x0101010101010101L);
5406     shrq(rtmp2, 8);
5407     pdepq(rtmp1, rtmp2, rtmp1);
5408     pinsrq(xtmp, rtmp1, index % 2);
5409     vindex = index / 2;
5410     if (vindex) {
5411       // Write the entire 16-byte vector only when both 64-bit
5412       // lanes have been updated, to save redundant instructions.
5413       if (index % 2) {
5414         vinsertf128(dst, dst, xtmp, vindex);
5415       }
5416     } else {
5417       vmovdqu(dst, xtmp);
5418     }
5419     mask_len -= 8;
5420   }
5421 }
5422 
5423 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5424   switch(opc) {
5425     case Op_VectorMaskTrueCount:
5426       popcntq(dst, tmp);
5427       break;
5428     case Op_VectorMaskLastTrue:
5429       if (VM_Version::supports_lzcnt()) {
5430         lzcntq(tmp, tmp);
5431         movl(dst, 63);
5432         subl(dst, tmp);
5433       } else {
5434         movl(dst, -1);
5435         bsrq(tmp, tmp);
5436         cmov32(Assembler::notZero, dst, tmp);
5437       }
5438       break;
5439     case Op_VectorMaskFirstTrue:
5440       if (VM_Version::supports_bmi1()) {
5441         if (masklen < 32) {
5442           orl(tmp, 1 << masklen);
5443           tzcntl(dst, tmp);
5444         } else if (masklen == 32) {
5445           tzcntl(dst, tmp);
5446         } else {
5447           assert(masklen == 64, "");
5448           tzcntq(dst, tmp);
5449         }
5450       } else {
5451         if (masklen < 32) {
5452           orl(tmp, 1 << masklen);
5453           bsfl(dst, tmp);
5454         } else {
5455           assert(masklen == 32 || masklen == 64, "");
5456           movl(dst, masklen);
5457           if (masklen == 32)  {
5458             bsfl(tmp, tmp);
5459           } else {
5460             bsfq(tmp, tmp);
5461           }
5462           cmov32(Assembler::notZero, dst, tmp);
5463         }
5464       }
5465       break;
5466     case Op_VectorMaskToLong:
5467       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5468       break;
5469     default: assert(false, "Unhandled mask operation");
5470   }
5471 }
5472 
5473 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5474                                               int masklen, int masksize, int vec_enc) {
5475   assert(VM_Version::supports_popcnt(), "");
5476 
  if (VM_Version::supports_avx512bw()) {
5478     kmovql(tmp, mask);
5479   } else {
5480     assert(masklen <= 16, "");
5481     kmovwl(tmp, mask);
5482   }
5483 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped to the mask length.
5486   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5487     andq(tmp, (1 << masklen) - 1);
5488   }
5489 
5490   vector_mask_operation_helper(opc, dst, tmp, masklen);
5491 }
5492 
5493 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5494                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5495   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5496          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5497   assert(VM_Version::supports_popcnt(), "");
5498 
5499   bool need_clip = false;
5500   switch(bt) {
5501     case T_BOOLEAN:
      // While masks of other types hold lane values of 0 or -1, boolean masks hold lane
      // values of 0 or 1, so negate them (0 - mask) before extracting the sign bits.
5503       vpxor(xtmp, xtmp, xtmp, vec_enc);
5504       vpsubb(xtmp, xtmp, mask, vec_enc);
5505       vpmovmskb(tmp, xtmp, vec_enc);
5506       need_clip = masklen < 16;
5507       break;
5508     case T_BYTE:
5509       vpmovmskb(tmp, mask, vec_enc);
5510       need_clip = masklen < 16;
5511       break;
5512     case T_SHORT:
5513       vpacksswb(xtmp, mask, mask, vec_enc);
5514       if (masklen >= 16) {
5515         vpermpd(xtmp, xtmp, 8, vec_enc);
5516       }
5517       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5518       need_clip = masklen < 16;
5519       break;
5520     case T_INT:
5521     case T_FLOAT:
5522       vmovmskps(tmp, mask, vec_enc);
5523       need_clip = masklen < 4;
5524       break;
5525     case T_LONG:
5526     case T_DOUBLE:
5527       vmovmskpd(tmp, mask, vec_enc);
5528       need_clip = masklen < 2;
5529       break;
5530     default: assert(false, "Unhandled type, %s", type2name(bt));
5531   }
5532 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped to the mask length.
5535   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5536     // need_clip implies masklen < 32
5537     andq(tmp, (1 << masklen) - 1);
5538   }
5539 
5540   vector_mask_operation_helper(opc, dst, tmp, masklen);
5541 }
5542 
5543 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5544                                              Register rtmp2, int mask_len) {
5545   kmov(rtmp1, src);
5546   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5547   mov64(rtmp2, -1L);
5548   pextq(rtmp2, rtmp2, rtmp1);
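  // pext gathers the bits of rtmp2 (-1, all ones) at the positions selected by
  // rtmp1 (the clipped mask bits) into the low bits of the result, i.e. it yields
  // popcount(mask) consecutive ones. For example, a source mask of 0b10110010
  // (4 set bits) compresses to 0b00001111.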
5549   kmov(dst, rtmp2);
5550 }
5551 
5552 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5553                                                     XMMRegister mask, Register rtmp, Register rscratch,
5554                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5555                                                     int vec_enc) {
5556   assert(type2aelembytes(bt) >= 4, "");
5557   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5558   address compress_perm_table = nullptr;
5559   address expand_perm_table = nullptr;
5560   if (type2aelembytes(bt) == 8) {
5561     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5562     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5563     vmovmskpd(rtmp, mask, vec_enc);
5564   } else {
5565     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5566     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5567     vmovmskps(rtmp, mask, vec_enc);
5568   }
5569   shlq(rtmp, 5); // for 32 byte permute row.
5570   if (opcode == Op_CompressV) {
5571     lea(rscratch, ExternalAddress(compress_perm_table));
5572   } else {
5573     lea(rscratch, ExternalAddress(expand_perm_table));
5574   }
5575   addptr(rtmp, rscratch);
5576   vmovdqu(permv, Address(rtmp));
5577   vpermps(dst, permv, src, Assembler::AVX_256bit);
5578   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can double as a blending mask after
  // compressing/expanding the source vector lanes.
5583   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
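  // Illustrative sketch (hypothetical row contents, following the convention stated
  // above): for a 256 bit compress of 32 bit lanes with mask 0b00100110, the selected
  // permute row would read
  //   { 1, 2, 5, -1, -1, -1, -1, -1 }
  // so lanes 1, 2 and 5 of src are packed into the first three destination lanes and
  // the remaining lanes are zeroed by the blend above.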
5584 }
5585 
5586 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5587                                                bool merge, BasicType bt, int vec_enc) {
5588   if (opcode == Op_CompressV) {
5589     switch(bt) {
5590     case T_BYTE:
5591       evpcompressb(dst, mask, src, merge, vec_enc);
5592       break;
5593     case T_CHAR:
5594     case T_SHORT:
5595       evpcompressw(dst, mask, src, merge, vec_enc);
5596       break;
5597     case T_INT:
5598       evpcompressd(dst, mask, src, merge, vec_enc);
5599       break;
5600     case T_FLOAT:
5601       evcompressps(dst, mask, src, merge, vec_enc);
5602       break;
5603     case T_LONG:
5604       evpcompressq(dst, mask, src, merge, vec_enc);
5605       break;
5606     case T_DOUBLE:
5607       evcompresspd(dst, mask, src, merge, vec_enc);
5608       break;
5609     default:
5610       fatal("Unsupported type %s", type2name(bt));
5611       break;
5612     }
5613   } else {
5614     assert(opcode == Op_ExpandV, "");
5615     switch(bt) {
5616     case T_BYTE:
5617       evpexpandb(dst, mask, src, merge, vec_enc);
5618       break;
5619     case T_CHAR:
5620     case T_SHORT:
5621       evpexpandw(dst, mask, src, merge, vec_enc);
5622       break;
5623     case T_INT:
5624       evpexpandd(dst, mask, src, merge, vec_enc);
5625       break;
5626     case T_FLOAT:
5627       evexpandps(dst, mask, src, merge, vec_enc);
5628       break;
5629     case T_LONG:
5630       evpexpandq(dst, mask, src, merge, vec_enc);
5631       break;
5632     case T_DOUBLE:
5633       evexpandpd(dst, mask, src, merge, vec_enc);
5634       break;
5635     default:
5636       fatal("Unsupported type %s", type2name(bt));
5637       break;
5638     }
5639   }
5640 }
5641 #endif
5642 
5643 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5644                                            KRegister ktmp1, int vec_enc) {
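  // Math.signum semantics: return +1.0 for positive inputs, -1.0 for negative inputs,
  // and the input itself for +0.0, -0.0 and NaN; the compares and blends below build
  // this result branchlessly from the provided zero and one constants.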
5645   if (opcode == Op_SignumVD) {
5646     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5648     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5649     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5651     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5652     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5653   } else {
5654     assert(opcode == Op_SignumVF, "");
5655     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5657     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5658     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5660     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5661     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5662   }
5663 }
5664 
5665 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5666                                           XMMRegister xtmp1, int vec_enc) {
5667   if (opcode == Op_SignumVD) {
5668     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5670     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5672     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5673     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5674   } else {
5675     assert(opcode == Op_SignumVF, "");
5676     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5678     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5680     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5681     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5682   }
5683 }
5684 
5685 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5686   if (VM_Version::supports_avx512bw()) {
5687     if (mask_len > 32) {
5688       kmovql(dst, src);
5689     } else {
5690       kmovdl(dst, src);
5691       if (mask_len != 32) {
5692         kshiftrdl(dst, dst, 32 - mask_len);
5693       }
5694     }
5695   } else {
5696     assert(mask_len <= 16, "");
5697     kmovwl(dst, src);
5698     if (mask_len != 16) {
5699       kshiftrwl(dst, dst, 16 - mask_len);
5700     }
5701   }
5702 }
5703 
5704 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5705   int lane_size = type2aelembytes(bt);
5706   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5707   if ((is_LP64 || lane_size < 8) &&
5708       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5709        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5710     movptr(rtmp, imm32);
5711     switch(lane_size) {
5712       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5713       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5714       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5715       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5718     }
5719   } else {
5720     movptr(rtmp, imm32);
5721     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5722     switch(lane_size) {
5723       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5724       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5725       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5726       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5729     }
5730   }
5731 }
5732 
5733 //
// The following lookup table based algorithm computes the population count:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  d. Add the bit set counts of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences of the
//     bit set counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128 bit vector lane.
//  g. Pack the bit set counts of the quadwords back to double words.
//  h. The unpacking and packing operations are not needed for 64 bit vector lanes.
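//
// Scalar sketch of the same LUT approach (illustrative only, not the emitted code;
// the vector routines below apply the same idea per byte lane, with vpshufb acting
// as a 16 entry table lookup):
//
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint32_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[b >> 4];
//   }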
5762 
5763 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5764                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5765   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5766   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5767   vpsrlw(dst, src, 4, vec_enc);
5768   vpand(dst, dst, xtmp1, vec_enc);
5769   vpand(xtmp1, src, xtmp1, vec_enc);
5770   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5771   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5772   vpshufb(dst, xtmp2, dst, vec_enc);
5773   vpaddb(dst, dst, xtmp1, vec_enc);
5774 }
5775 
5776 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5777                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5778   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5779   // Following code is as per steps e,f,g and h of above algorithm.
5780   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5781   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5782   vpsadbw(dst, dst, xtmp2, vec_enc);
5783   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5784   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5785   vpackuswb(dst, xtmp1, dst, vec_enc);
5786 }
5787 
5788 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5789                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5790   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5791   // Add the popcount of upper and lower bytes of word.
5792   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5793   vpsrlw(dst, xtmp1, 8, vec_enc);
5794   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5795   vpaddw(dst, dst, xtmp1, vec_enc);
5796 }
5797 
5798 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5799                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5800   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5801   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5802   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5803 }
5804 
5805 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5806                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5807   switch(bt) {
5808     case T_LONG:
5809       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5810       break;
5811     case T_INT:
5812       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5813       break;
5814     case T_CHAR:
5815     case T_SHORT:
5816       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5817       break;
5818     case T_BYTE:
5819     case T_BOOLEAN:
5820       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5821       break;
5822     default:
5823       fatal("Unsupported type %s", type2name(bt));
5824       break;
5825   }
5826 }
5827 
5828 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5829                                                       KRegister mask, bool merge, int vec_enc) {
5830   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5831   switch(bt) {
5832     case T_LONG:
5833       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5834       evpopcntq(dst, mask, src, merge, vec_enc);
5835       break;
5836     case T_INT:
5837       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5838       evpopcntd(dst, mask, src, merge, vec_enc);
5839       break;
5840     case T_CHAR:
5841     case T_SHORT:
5842       assert(VM_Version::supports_avx512_bitalg(), "");
5843       evpopcntw(dst, mask, src, merge, vec_enc);
5844       break;
5845     case T_BYTE:
5846     case T_BOOLEAN:
5847       assert(VM_Version::supports_avx512_bitalg(), "");
5848       evpopcntb(dst, mask, src, merge, vec_enc);
5849       break;
5850     default:
5851       fatal("Unsupported type %s", type2name(bt));
5852       break;
5853   }
5854 }
5855 
5856 #ifndef _LP64
5857 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5858   assert(VM_Version::supports_avx512bw(), "");
5859   kmovdl(tmp, src);
5860   kunpckdql(dst, tmp, tmp);
5861 }
5862 #endif
5863 
// The bit reversal algorithm first reverses the bits of each byte and then
// performs a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table to obtain the reversed bit sequence of a
// 4 bit value; the reversed bit sequence of a byte is then obtained by swapping
// the reversed bit sequences of its upper and lower nibbles.
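// Scalar sketch of the per-byte step (illustrative only; the vector code below uses
// vpshufb as the 16 entry lookup and handles the byte level reversal separately per
// primitive type):
//
//   static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                    0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//   uint8_t reverse_byte(uint8_t b) {
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
//   }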
5870 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5871                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5872   if (VM_Version::supports_avx512vlbw()) {
5873 
5874     // Get the reverse bit sequence of lower nibble of each byte.
5875     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5876     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5877     evpandq(dst, xtmp2, src, vec_enc);
5878     vpshufb(dst, xtmp1, dst, vec_enc);
5879     vpsllq(dst, dst, 4, vec_enc);
5880 
5881     // Get the reverse bit sequence of upper nibble of each byte.
5882     vpandn(xtmp2, xtmp2, src, vec_enc);
5883     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5884     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5885 
5886     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5887     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5888     evporq(xtmp2, dst, xtmp2, vec_enc);
5889     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5890 
  } else if (vec_enc == Assembler::AVX_512bit) {
5892     // Shift based bit reversal.
5893     assert(bt == T_LONG || bt == T_INT, "");
5894 
5895     // Swap lower and upper nibble of each byte.
5896     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5897 
5898     // Swap two least and most significant bits of each nibble.
5899     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5900 
5901     // Swap adjacent pair of bits.
5902     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5903     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5904 
5905     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5906     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5907   } else {
5908     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5909     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5910 
5911     // Get the reverse bit sequence of lower nibble of each byte.
5912     vpand(dst, xtmp2, src, vec_enc);
5913     vpshufb(dst, xtmp1, dst, vec_enc);
5914     vpsllq(dst, dst, 4, vec_enc);
5915 
5916     // Get the reverse bit sequence of upper nibble of each byte.
5917     vpandn(xtmp2, xtmp2, src, vec_enc);
5918     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5919     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5920 
5921     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5922     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5923     vpor(xtmp2, dst, xtmp2, vec_enc);
5924     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5925   }
5926 }
5927 
5928 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5929                                                 XMMRegister xtmp, Register rscratch) {
5930   assert(VM_Version::supports_gfni(), "");
5931   assert(rscratch != noreg || always_reachable(mask), "missing");
5932 
  // Galois field instruction based bit reversal, following the algorithm described at
5934   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5935   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5936   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5937   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5938 }
5939 
5940 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5941                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5942   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5943   evpandq(dst, xtmp1, src, vec_enc);
5944   vpsllq(dst, dst, nbits, vec_enc);
5945   vpandn(xtmp1, xtmp1, src, vec_enc);
5946   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5947   evporq(dst, dst, xtmp1, vec_enc);
5948 }
5949 
5950 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5951                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5952   // Shift based bit reversal.
5953   assert(VM_Version::supports_evex(), "");
5954   switch(bt) {
5955     case T_LONG:
5956       // Swap upper and lower double word of each quad word.
5957       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5958       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5959       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5960       break;
5961     case T_INT:
5962       // Swap upper and lower word of each double word.
5963       evprord(xtmp1, k0, src, 16, true, vec_enc);
5964       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5965       break;
5966     case T_CHAR:
5967     case T_SHORT:
5968       // Swap upper and lower byte of each word.
5969       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5970       break;
5971     case T_BYTE:
5972       evmovdquq(dst, k0, src, true, vec_enc);
5973       break;
5974     default:
5975       fatal("Unsupported type %s", type2name(bt));
5976       break;
5977   }
5978 }
5979 
5980 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5981   if (bt == T_BYTE) {
5982     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5983       evmovdquq(dst, k0, src, true, vec_enc);
5984     } else {
5985       vmovdqu(dst, src);
5986     }
5987     return;
5988   }
5989   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5990   // pre-computed shuffle indices.
5991   switch(bt) {
5992     case T_LONG:
5993       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5994       break;
5995     case T_INT:
5996       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5997       break;
5998     case T_CHAR:
5999     case T_SHORT:
6000       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6001       break;
6002     default:
6003       fatal("Unsupported type %s", type2name(bt));
6004       break;
6005   }
6006   vpshufb(dst, src, dst, vec_enc);
6007 }
6008 
6009 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6010                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6011                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6012   assert(is_integral_type(bt), "");
6013   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6014   assert(VM_Version::supports_avx512cd(), "");
6015   switch(bt) {
6016     case T_LONG:
6017       evplzcntq(dst, ktmp, src, merge, vec_enc);
6018       break;
6019     case T_INT:
6020       evplzcntd(dst, ktmp, src, merge, vec_enc);
6021       break;
6022     case T_SHORT:
6023       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6024       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6025       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6026       vpunpckhwd(dst, xtmp1, src, vec_enc);
6027       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6028       vpackusdw(dst, xtmp2, dst, vec_enc);
6029       break;
6030     case T_BYTE:
6031       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6032       // accessing the lookup table.
6033       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6034       // accessing the lookup table.
6035       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6036       assert(VM_Version::supports_avx512bw(), "");
6037       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6038       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6039       vpand(xtmp2, dst, src, vec_enc);
6040       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6041       vpsrlw(xtmp3, src, 4, vec_enc);
6042       vpand(xtmp3, dst, xtmp3, vec_enc);
6043       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6044       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6045       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6046       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6047       break;
6048     default:
6049       fatal("Unsupported type %s", type2name(bt));
6050       break;
6051   }
6052 }
6053 
6054 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6055                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6056   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6057   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6058   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6059   // accessing the lookup table.
6060   vpand(dst, xtmp2, src, vec_enc);
6061   vpshufb(dst, xtmp1, dst, vec_enc);
6062   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6063   // accessing the lookup table.
6064   vpsrlw(xtmp3, src, 4, vec_enc);
6065   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6066   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6067   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6068   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6069   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6070   vpaddb(dst, dst, xtmp2, vec_enc);
6071   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6072 }
6073 
6074 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6075                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6076   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6077   // Add zero counts of lower byte and upper byte of a word if
6078   // upper byte holds a zero value.
6079   vpsrlw(xtmp3, src, 8, vec_enc);
6080   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6081   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6082   vpsllw(xtmp2, dst, 8, vec_enc);
6083   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6084   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6085   vpsrlw(dst, dst, 8, vec_enc);
6086 }
6087 
6088 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6089                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in a
  // normalized 1.x form, the biased exponent of the converted value can be
  // used to compute the leading zero count:
  //     exponent = (biased_exp - 127) + 1
  //     LZCNT    = 32 - exponent
  // Special handling is needed for zero, max_int and negative source values.
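  // Worked example (illustrative): src = 0x00000500 = 1280. (float)src = 1280.0f,
  // whose biased exponent is 127 + 10 = 137, so
  //     exponent = (137 - 127) + 1 = 11  and  LZCNT = 32 - 11 = 21,
  // which matches the 21 leading zeros of 0x00000500.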
6095 
6096   // Broadcast 0xFF
6097   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6098   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6099 
6100   // Extract biased exponent.
6101   vcvtdq2ps(dst, src, vec_enc);
6102   vpsrld(dst, dst, 23, vec_enc);
6103   vpand(dst, dst, xtmp1, vec_enc);
6104 
6105   // Broadcast 127.
6106   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6107   // Exponent = biased_exp - 127
6108   vpsubd(dst, dst, xtmp1, vec_enc);
6109 
  // Exponent = Exponent + 1
6111   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6112   vpaddd(dst, dst, xtmp3, vec_enc);
6113 
6114   // Replace -ve exponent with zero, exponent is -ve when src
6115   // lane contains a zero value.
6116   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6117   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6118 
6119   // Rematerialize broadcast 32.
6120   vpslld(xtmp1, xtmp3, 5, vec_enc);
6121   // Exponent is 32 if corresponding source lane contains max_int value.
6122   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6123   // LZCNT = 32 - exponent
6124   vpsubd(dst, xtmp1, dst, vec_enc);
6125 
6126   // Replace LZCNT with a value 1 if corresponding source lane
6127   // contains max_int value.
6128   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6129 
6130   // Replace biased_exp with 0 if source lane value is less than zero.
6131   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6132   vblendvps(dst, dst, xtmp2, src, vec_enc);
6133 }
6134 
6135 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6136                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6137   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6138   // Add zero counts of lower word and upper word of a double word if
6139   // upper word holds a zero value.
6140   vpsrld(xtmp3, src, 16, vec_enc);
6141   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6142   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6143   vpslld(xtmp2, dst, 16, vec_enc);
6144   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6145   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6146   vpsrld(dst, dst, 16, vec_enc);
6147   // Add zero counts of lower doubleword and upper doubleword of a
6148   // quadword if upper doubleword holds a zero value.
6149   vpsrlq(xtmp3, src, 32, vec_enc);
6150   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6151   vpsllq(xtmp2, dst, 32, vec_enc);
6152   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6153   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6154   vpsrlq(dst, dst, 32, vec_enc);
6155 }
6156 
6157 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6158                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6159                                                        Register rtmp, int vec_enc) {
6160   assert(is_integral_type(bt), "unexpected type");
6161   assert(vec_enc < Assembler::AVX_512bit, "");
6162   switch(bt) {
6163     case T_LONG:
6164       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6165       break;
6166     case T_INT:
6167       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6168       break;
6169     case T_SHORT:
6170       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6171       break;
6172     case T_BYTE:
6173       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6174       break;
6175     default:
6176       fatal("Unsupported type %s", type2name(bt));
6177       break;
6178   }
6179 }
6180 
6181 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6182   switch(bt) {
6183     case T_BYTE:
6184       vpsubb(dst, src1, src2, vec_enc);
6185       break;
6186     case T_SHORT:
6187       vpsubw(dst, src1, src2, vec_enc);
6188       break;
6189     case T_INT:
6190       vpsubd(dst, src1, src2, vec_enc);
6191       break;
6192     case T_LONG:
6193       vpsubq(dst, src1, src2, vec_enc);
6194       break;
6195     default:
6196       fatal("Unsupported type %s", type2name(bt));
6197       break;
6198   }
6199 }
6200 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
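// Worked example (illustrative, 32 bit): x = 40 = 0b101000. Then
//      (x - 1) & ~x = 0b000111, CLZ = 29, CTZ = 32 - 29 = 3,
// matching the three trailing zeros of 40.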
6205 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6206                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6207                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6208   assert(is_integral_type(bt), "");
6209   // xtmp = -1
6210   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6211   // xtmp = xtmp + src
6212   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6213   // xtmp = xtmp & ~src
6214   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6215   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6216   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6217   vpsub(bt, dst, xtmp4, dst, vec_enc);
6218 }
6219 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
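// Worked example (illustrative, 32 bit): x = 40 = 0b101000. Then
//      x | -x = 0xFFFFFFF8, POPC = 29, CTZ = 32 - 29 = 3.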
6222 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6223                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6224   assert(is_integral_type(bt), "");
6225   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6227   // xtmp = 0 - src
6228   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6229   // xtmp = xtmp | src
6230   vpor(xtmp3, xtmp3, src, vec_enc);
6231   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6232   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6233   vpsub(bt, dst, xtmp1, dst, vec_enc);
6234 }
6235 
6236 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6237   Label done;
6238   Label neg_divisor_fastpath;
6239   cmpl(divisor, 0);
6240   jccb(Assembler::less, neg_divisor_fastpath);
6241   xorl(rdx, rdx);
6242   divl(divisor);
6243   jmpb(done);
6244   bind(neg_divisor_fastpath);
6245   // Fastpath for divisor < 0:
6246   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6247   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
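  // Since the divisor is negative when viewed as a signed int, its unsigned value is
  // >= 2^31, so the unsigned quotient can only be 0 or 1; it is 1 exactly when
  // dividend >= divisor (unsigned), which the expression above extracts from the sign
  // bit. Worked example (illustrative): dividend = 0xF0000000, divisor = 0x90000000:
  // dividend - divisor = 0x60000000, dividend & ~(dividend - divisor) = 0x90000000,
  // and >>> 31 gives quotient 1.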
6248   movl(rdx, rax);
6249   subl(rdx, divisor);
6250   if (VM_Version::supports_bmi1()) {
6251     andnl(rax, rdx, rax);
6252   } else {
6253     notl(rdx);
6254     andl(rax, rdx);
6255   }
6256   shrl(rax, 31);
6257   bind(done);
6258 }
6259 
6260 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6261   Label done;
6262   Label neg_divisor_fastpath;
6263   cmpl(divisor, 0);
6264   jccb(Assembler::less, neg_divisor_fastpath);
6265   xorl(rdx, rdx);
6266   divl(divisor);
6267   jmpb(done);
6268   bind(neg_divisor_fastpath);
6269   // Fastpath when divisor < 0:
6270   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6271   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6272   movl(rdx, rax);
6273   subl(rax, divisor);
6274   if (VM_Version::supports_bmi1()) {
6275     andnl(rax, rax, rdx);
6276   } else {
6277     notl(rax);
6278     andl(rax, rdx);
6279   }
6280   sarl(rax, 31);
6281   andl(rax, divisor);
6282   subl(rdx, rax);
6283   bind(done);
6284 }
6285 
6286 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6287   Label done;
6288   Label neg_divisor_fastpath;
6289 
6290   cmpl(divisor, 0);
6291   jccb(Assembler::less, neg_divisor_fastpath);
6292   xorl(rdx, rdx);
6293   divl(divisor);
6294   jmpb(done);
6295   bind(neg_divisor_fastpath);
6296   // Fastpath for divisor < 0:
6297   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6298   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6299   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6300   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6301   movl(rdx, rax);
6302   subl(rax, divisor);
6303   if (VM_Version::supports_bmi1()) {
6304     andnl(rax, rax, rdx);
6305   } else {
6306     notl(rax);
6307     andl(rax, rdx);
6308   }
6309   movl(tmp, rax);
6310   shrl(rax, 31); // quotient
6311   sarl(tmp, 31);
6312   andl(tmp, divisor);
6313   subl(rdx, tmp); // remainder
6314   bind(done);
6315 }
6316 
6317 #ifdef _LP64
6318 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6319                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
6322     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6323     mov64(rtmp, 0x8040201008040201L);
6324     movq(xtmp1, src);
6325     movq(xtmp2, rtmp);
6326     gf2p8affineqb(xtmp1, xtmp2, 0);
6327     movq(dst, xtmp1);
6328   } else {
6329     // Swap even and odd numbered bits.
6330     movl(rtmp, src);
6331     andl(rtmp, 0x55555555);
6332     shll(rtmp, 1);
6333     movl(dst, src);
6334     andl(dst, 0xAAAAAAAA);
6335     shrl(dst, 1);
6336     orl(dst, rtmp);
6337 
6338     // Swap LSB and MSB 2 bits of each nibble.
6339     movl(rtmp, dst);
6340     andl(rtmp, 0x33333333);
6341     shll(rtmp, 2);
6342     andl(dst, 0xCCCCCCCC);
6343     shrl(dst, 2);
6344     orl(dst, rtmp);
6345 
6346     // Swap LSB and MSB 4 bits of each byte.
6347     movl(rtmp, dst);
6348     andl(rtmp, 0x0F0F0F0F);
6349     shll(rtmp, 4);
6350     andl(dst, 0xF0F0F0F0);
6351     shrl(dst, 4);
6352     orl(dst, rtmp);
6353   }
6354   bswapl(dst);
6355 }
6356 
6357 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6358                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
6361     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6362     mov64(rtmp1, 0x8040201008040201L);
6363     movq(xtmp1, src);
6364     movq(xtmp2, rtmp1);
6365     gf2p8affineqb(xtmp1, xtmp2, 0);
6366     movq(dst, xtmp1);
6367   } else {
6368     // Swap even and odd numbered bits.
6369     movq(rtmp1, src);
6370     mov64(rtmp2, 0x5555555555555555L);
6371     andq(rtmp1, rtmp2);
6372     shlq(rtmp1, 1);
6373     movq(dst, src);
6374     notq(rtmp2);
6375     andq(dst, rtmp2);
6376     shrq(dst, 1);
6377     orq(dst, rtmp1);
6378 
6379     // Swap LSB and MSB 2 bits of each nibble.
6380     movq(rtmp1, dst);
6381     mov64(rtmp2, 0x3333333333333333L);
6382     andq(rtmp1, rtmp2);
6383     shlq(rtmp1, 2);
6384     notq(rtmp2);
6385     andq(dst, rtmp2);
6386     shrq(dst, 2);
6387     orq(dst, rtmp1);
6388 
6389     // Swap LSB and MSB 4 bits of each byte.
6390     movq(rtmp1, dst);
6391     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6392     andq(rtmp1, rtmp2);
6393     shlq(rtmp1, 4);
6394     notq(rtmp2);
6395     andq(dst, rtmp2);
6396     shrq(dst, 4);
6397     orq(dst, rtmp1);
6398   }
6399   bswapq(dst);
6400 }
6401 
6402 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6403   Label done;
6404   Label neg_divisor_fastpath;
6405   cmpq(divisor, 0);
6406   jccb(Assembler::less, neg_divisor_fastpath);
6407   xorl(rdx, rdx);
6408   divq(divisor);
6409   jmpb(done);
6410   bind(neg_divisor_fastpath);
6411   // Fastpath for divisor < 0:
6412   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6413   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6414   movq(rdx, rax);
6415   subq(rdx, divisor);
6416   if (VM_Version::supports_bmi1()) {
6417     andnq(rax, rdx, rax);
6418   } else {
6419     notq(rdx);
6420     andq(rax, rdx);
6421   }
6422   shrq(rax, 63);
6423   bind(done);
6424 }
6425 
6426 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6427   Label done;
6428   Label neg_divisor_fastpath;
6429   cmpq(divisor, 0);
6430   jccb(Assembler::less, neg_divisor_fastpath);
6431   xorq(rdx, rdx);
6432   divq(divisor);
6433   jmp(done);
6434   bind(neg_divisor_fastpath);
6435   // Fastpath when divisor < 0:
6436   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6437   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6438   movq(rdx, rax);
6439   subq(rax, divisor);
6440   if (VM_Version::supports_bmi1()) {
6441     andnq(rax, rax, rdx);
6442   } else {
6443     notq(rax);
6444     andq(rax, rdx);
6445   }
6446   sarq(rax, 63);
6447   andq(rax, divisor);
6448   subq(rdx, rax);
6449   bind(done);
6450 }
6451 
6452 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6453   Label done;
6454   Label neg_divisor_fastpath;
6455   cmpq(divisor, 0);
6456   jccb(Assembler::less, neg_divisor_fastpath);
6457   xorq(rdx, rdx);
6458   divq(divisor);
6459   jmp(done);
6460   bind(neg_divisor_fastpath);
6461   // Fastpath for divisor < 0:
6462   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6463   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6464   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6465   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6466   movq(rdx, rax);
6467   subq(rax, divisor);
6468   if (VM_Version::supports_bmi1()) {
6469     andnq(rax, rax, rdx);
6470   } else {
6471     notq(rax);
6472     andq(rax, rdx);
6473   }
6474   movq(tmp, rax);
6475   shrq(rax, 63); // quotient
6476   sarq(tmp, 63);
6477   andq(tmp, divisor);
6478   subq(rdx, tmp); // remainder
6479   bind(done);
6480 }
6481 #endif
6482 
6483 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6484                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6485                                         int vlen_enc) {
6486   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the index within a lane is determined
  // by the lower 4 bits of each shuffle byte, i.e. all shuffle indices are effectively
  // taken modulo 16 and normalized to the range 0-15. Source elements whose indices
  // differ by a multiple of 16 therefore occupy the same relative position within
  // their 128 bit lanes, e.g. elements with shuffle indices 16, 32 and 48 are each
  // the first element of their respective 128 bit lanes.
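  // Worked example (illustrative): shuffle index 37 = 0b100101. Its low 4 bits select
  // in-lane byte 5, and the masks computed below recognise 32 <= 37 < 48, so the byte
  // is taken from the broadcast of the third 128 bit lane; destination byte j thus
  // receives byte 37 of src whenever shuffle byte j equals 37.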
6493   movl(rtmp, 16);
6494   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6495 
6496   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6497   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6498   // original shuffle indices and move the shuffled lanes corresponding to true
6499   // mask to destination vector.
6500   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6501   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6502   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6503 
6504   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6505   // and broadcasting second 128 bit lane.
6506   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6507   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6508   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6509   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6510   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6511 
6512   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6513   // and broadcasting third 128 bit lane.
6514   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6515   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6516   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6517   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6518   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6519 
6520   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6522   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6523   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6524   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6525   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6526   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6527 }
6528 
6529 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6530                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6531   if (vlen_enc == AVX_128bit) {
6532     vpermilps(dst, src, shuffle, vlen_enc);
6533   } else if (bt == T_INT) {
6534     vpermd(dst, shuffle, src, vlen_enc);
6535   } else {
6536     assert(bt == T_FLOAT, "");
6537     vpermps(dst, shuffle, src, vlen_enc);
6538   }
6539 }