1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  53   if (C->clinit_barrier_on_entry()) {
  54     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  55     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  56 
  57     Label L_skip_barrier;
  58     Register klass = rscratch1;
  59 
  60     mov_metadata(klass, C->method()->holder()->constant_encoding());
  61     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  62 
  63     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  64 
  65     bind(L_skip_barrier);
  66   }
  67 
  68   int framesize = C->output()->frame_size_in_bytes();
  69   int bangsize = C->output()->bang_size_in_bytes();
  70   bool fp_mode_24b = false;
  71   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  72 
  73   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  74   // NativeJump::patch_verified_entry will be able to patch out the entry
  75   // code safely. The push to verify stack depth is ok at 5 bytes,
  76   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  77   // a stack bang then we must use the 6-byte frame allocation even if
  78   // we have no frame. :-(
  79   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  80 
  81   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  82   // Remove word for return addr
  83   framesize -= wordSize;
  84   stack_bang_size -= wordSize;
  85 
  86   // Calls to C2R adapters often do not accept exceptional returns.
  87   // We require that their callers bang for them.  But be careful, because
  88   // some VM calls (such as call site linkage) can use several kilobytes of
  89   // stack.  The stack safety zone should account for that.
  90   // See bugs 4446381, 4468289, 4497237.
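       //
       // Rough shape of the two frame setups emitted below (an illustrative sketch only,
       // not exact encodings or offsets):
       //
       //   with stack bang:                    without stack bang:
       //     <bang pages>                        sub  rsp, #framesize        ; forced 32-bit imm, >= 5 bytes
       //     push rbp                            mov  [rsp + #offset], rbp   ; save rbp in the new frame
       //     mov  rbp, rsp                       lea  rbp, [rsp + #offset]   ; both only if PreserveFramePointer
       //     sub  rsp, #framesize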
  91   if (stack_bang_size > 0) {
  92     generate_stack_overflow_check(stack_bang_size);
  93 
  94     // We always push rbp so that on return to the interpreter rbp will be
  95     // restored correctly and we can correct the stack.
  96     push(rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       mov(rbp, rsp);
 100     }
 101     // Remove word for rbp
 102     framesize -= wordSize;
 103 
 104     // Create frame
 105     if (framesize) {
 106       subptr(rsp, framesize);
 107     }
 108   } else {
 109     // Create frame (force generation of a 4 byte immediate value)
 110     subptr_imm32(rsp, framesize);
 111 
 112     // Save RBP register now.
 113     framesize -= wordSize;
 114     movptr(Address(rsp, framesize), rbp);
 115     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 116     if (PreserveFramePointer) {
 117       movptr(rbp, rsp);
 118       if (framesize > 0) {
 119         addptr(rbp, framesize);
 120       }
 121     }
 122   }
 123 
 124   if (C->needs_stack_repair()) {
 125     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 126     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 127     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 128   }
 129 
 130   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 131     framesize -= wordSize;
 132     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 133   }
 134 
 135 #ifndef _LP64
 136   // If method sets FPU control word do it now
 137   if (fp_mode_24b) {
 138     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 139   }
 140   if (UseSSE >= 2 && VerifyFPU) {
 141     verify_FPU(0, "FPU stack must be clean on entry");
 142   }
 143 #endif
 144 
 145 #ifdef ASSERT
 146   if (VerifyStackAtCalls) {
 147     Label L;
 148     push(rax);
 149     mov(rax, rsp);
 150     andptr(rax, StackAlignmentInBytes-1);
 151     cmpptr(rax, StackAlignmentInBytes-wordSize);
 152     pop(rax);
 153     jcc(Assembler::equal, L);
 154     STOP("Stack is not properly aligned!");
 155     bind(L);
 156   }
 157 #endif
 158 }
 159 
 160 void C2_MacroAssembler::entry_barrier() {
 161   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 162 #ifdef _LP64
 163   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 164   Label dummy_slow_path;
 165   Label dummy_continuation;
 166   Label* slow_path = &dummy_slow_path;
 167   Label* continuation = &dummy_continuation;
 168   if (!Compile::current()->output()->in_scratch_emit_size()) {
 169     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 170     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 171     Compile::current()->output()->add_stub(stub);
 172     slow_path = &stub->entry();
 173     continuation = &stub->continuation();
 174   }
 175   bs->nmethod_entry_barrier(this, slow_path, continuation);
 176 #else
 177   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 178   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 179 #endif
 180 }
 181 
 182 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 183   switch (vlen_in_bytes) {
 184     case  4: // fall-through
 185     case  8: // fall-through
 186     case 16: return Assembler::AVX_128bit;
 187     case 32: return Assembler::AVX_256bit;
 188     case 64: return Assembler::AVX_512bit;
 189 
 190     default: {
 191       ShouldNotReachHere();
 192       return Assembler::AVX_NoVec;
 193     }
 194   }
 195 }
 196 
 197 // fast_lock and fast_unlock used by C2
 198 
 199 // Because the transitions from emitted code to the runtime
 200 // monitorenter/exit helper stubs are so slow it's critical that
 201 // we inline both the stack-locking fast path and the inflated fast path.
 202 //
 203 // See also: cmpFastLock and cmpFastUnlock.
 204 //
 205 // What follows is a specialized inline transliteration of the code
 206 // in enter() and exit(). If we're concerned about I$ bloat another
 207 // option would be to emit TrySlowEnter and TrySlowExit methods
 208 // at startup-time.  These methods would accept arguments as
 209 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 210 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 211 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 212 // In practice, however, the # of lock sites is bounded and is usually small.
 213 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 214 // if the processor uses simple bimodal branch predictors keyed by EIP,
 215 // since the helper routines would be called from multiple synchronization
 216 // sites.
 217 //
 218 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 219 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 220 // to those specialized methods.  That'd give us a mostly platform-independent
 221 // implementation that the JITs could optimize and inline at their pleasure.
 222 // Done correctly, the only time we'd need to cross into native code would be
 223 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 224 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 225 // (b) issue explicit barriers or fence operations.
 226 //
 227 // TODO:
 228 //
 229 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 230 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 231 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 232 //    the lock operators would typically be faster than reifying Self.
 233 //
 234 // *  Ideally I'd define the primitives as:
 235 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 236 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 237 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 238 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 239 //    Furthermore the register assignments are overconstrained, possibly resulting in
 240 //    sub-optimal code near the synchronization site.
 241 //
 242 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 243 //    Alternately, use a better sp-proximity test.
 244 //
 245 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 246 //    Either one is sufficient to uniquely identify a thread.
 247 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 248 //
 249 // *  Intrinsify notify() and notifyAll() for the common cases where the
 250 //    object is locked by the calling thread but the waitlist is empty.
 251 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 252 //
 253 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 254 //    But beware of excessive branch density on AMD Opterons.
 255 //
 256 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 257 //    or failure of the fast path.  If the fast path fails then we pass
 258 //    control to the slow path, typically in C.  In fast_lock and
 259 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 260 //    will emit a conditional branch immediately after the node.
 261 //    So we have branches to branches and lots of ICC.ZF games.
 262 //    Instead, it might be better to have C2 pass a "FailureLabel"
 263 //    into fast_lock and fast_unlock.  In the case of success, control
 264 //    will drop through the node.  ICC.ZF is undefined at exit.
 265 //    In the case of failure, the node will branch directly to the
 266 //    FailureLabel.
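     //
     // Illustrative sketch of how C2 consumes the ZF protocol today (pseudo-assembly,
     // not the literal emitted code):
     //
     //   fast_lock(obj, box, ...)      // leaves ZF = 1 on success, ZF = 0 on failure
     //   jne   <runtime monitorenter slow path>
     //   ...                           // synchronized region
     //   fast_unlock(obj, box, ...)    // same ZF protocol
     //   jne   <runtime monitorexit slow path>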
 267 
 268 
 269 // obj: object to lock
 270 // box: on-stack box address (displaced header location) - KILLED
 271 // rax: tmp -- KILLED
 272 // scr: tmp -- KILLED
 273 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 274                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 275                                  Metadata* method_data) {
 276   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 277   // Ensure the register assignments are disjoint
 278   assert(tmpReg == rax, "");
 279   assert(cx1Reg == noreg, "");
 280   assert(cx2Reg == noreg, "");
 281   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 282 
 283   // Possible cases that we'll encounter in fast_lock
 284   // ------------------------------------------------
 285   // * Inflated
 286   //    -- unlocked
 287   //    -- Locked
 288   //       = by self
 289   //       = by other
 290   // * neutral
 291   // * stack-locked
 292   //    -- by self
 293   //       = sp-proximity test hits
 294   //       = sp-proximity test generates false-negative
 295   //    -- by other
 296   //
 297 
 298   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 299 
 300   if (DiagnoseSyncOnValueBasedClasses != 0) {
 301     load_klass(tmpReg, objReg, scrReg);
 302     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 303     jcc(Assembler::notZero, DONE_LABEL);
 304   }
 305 
 306   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 307   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 308   jcc(Assembler::notZero, IsInflated);
 309 
 310   if (LockingMode == LM_MONITOR) {
 311     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 312     testptr(objReg, objReg);
 313   } else {
 314     assert(LockingMode == LM_LEGACY, "must be");
 315     // Attempt stack-locking ...
 316     orptr (tmpReg, markWord::unlocked_value);
 317     if (EnableValhalla) {
 318       // Mask inline_type bit such that we go to the slow path if object is an inline type
 319       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 320     }
 321     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 322     lock();
 323     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 324     jcc(Assembler::equal, COUNT);           // Success
 325 
 326     // Recursive locking.
 327     // The object is stack-locked: markword contains stack pointer to BasicLock.
 328     // Locked by current thread if difference with current SP is less than one page.
 329     subptr(tmpReg, rsp);
 330     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 331     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 332     movptr(Address(boxReg, 0), tmpReg);
 333   }
 334   jmp(DONE_LABEL);
 335 
 336   bind(IsInflated);
 337   // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 338 
 339 #ifndef _LP64
 340   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 341   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 342 #else
 343   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 344   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 345   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 346 
 347   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 348   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 349   movq(scrReg, tmpReg);
 350   xorq(tmpReg, tmpReg);
 351   lock();
 352   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 353 
 354   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 355   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 356 
 357   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 358   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 359   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 360   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 361 #endif // _LP64
 362   bind(DONE_LABEL);
 363 
 364   // ZFlag == 1 count in fast path
 365   // ZFlag == 0 count in slow path
 366   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 367 
 368   bind(COUNT);
 369   if (LockingMode == LM_LEGACY) {
 370 #ifdef _LP64
 371     // Count monitors in fast path
 372     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 373 #endif
 374   }
 375   xorl(tmpReg, tmpReg); // Set ZF == 1
 376 
 377   bind(NO_COUNT);
 378 
 379   // At NO_COUNT the icc ZFlag is set as follows ...
 380   // fast_unlock uses the same protocol.
 381   // ZFlag == 1 -> Success
 382   // ZFlag == 0 -> Failure - force control through the slow path
 383 }
 384 
 385 // obj: object to unlock
 386 // box: box address (displaced header location), killed.  Must be EAX.
 387 // tmp: killed, cannot be obj nor box.
 388 //
 389 // Some commentary on balanced locking:
 390 //
 391 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 392 // Methods that don't have provably balanced locking are forced to run in the
 393 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 394 // The interpreter provides two properties:
 395 // I1:  At return-time the interpreter automatically and quietly unlocks any
 396 //      objects acquired by the current activation (frame).  Recall that the
 397 //      interpreter maintains an on-stack list of locks currently held by
 398 //      a frame.
 399 // I2:  If a method attempts to unlock an object that is not held by
 400 //      the frame, the interpreter throws IMSX.
 401 //
 402 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 403 // B() doesn't have provably balanced locking so it runs in the interpreter.
 404 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 405 // is still locked by A().
 406 //
 407 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 408 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 409 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 410 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 411 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 412 // could reasonably *avoid* checking owner in fast_unlock().
 413 // In the interest of performance we elide the m->_owner == Self check in unlock.
 414 // A perfectly viable alternative is to elide the owner check except when
 415 // Xcheck:jni is enabled.
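     //
     // Illustrative example of the balance property (hypothetical Java, not from this file):
     //
     //   void A() {                    // provably balanced: compiled, emits fast_lock/fast_unlock for o
     //     synchronized (o) { B(); }
     //   }
     //   void B() { ... }              // not provably balanced: runs in the interpreter, where I1 makes it
     //                                 // release its own locks on return and I2 keeps it from unlocking o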
 416 
 417 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 418   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 419   assert(boxReg == rax, "");
 420   assert_different_registers(objReg, boxReg, tmpReg);
 421 
 422   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 423 
 424   if (LockingMode == LM_LEGACY) {
 425     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 426     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 427   }
 428   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 429   if (LockingMode != LM_MONITOR) {
 430     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 431     jcc(Assembler::zero, Stacked);
 432   }
 433 
 434   // It's inflated.
 435 
 436 #ifndef _LP64
 437   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 438   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 439   jmpb(DONE_LABEL);
 440 #else
 441   // Despite our balanced locking property we still check that m->_owner == Self
 442   // as java routines or native JNI code called by this thread might
 443   // have released the lock.
 444   //
 445   // If there's no contention try a 1-0 exit.  That is, exit without
 446   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 447   // we detect and recover from the race that the 1-0 exit admits.
 448   //
 449   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 450   // before it STs null into _owner, releasing the lock.  Updates
 451   // to data protected by the critical section must be visible before
 452   // we drop the lock (and thus before any other thread could acquire
 453   // the lock and observe the fields protected by the lock).
 454   // IA32's memory model is TSO (stores become visible in program order), so STs are ordered with
 455   // each other and there's no need for an explicit barrier (fence).
 456   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
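       //
       // Sketch of the 1-0 exit implemented below (pseudocode, tags and offsets elided):
       //
       //   if (m->_recursions != 0) { m->_recursions--; goto success; }
       //   m->_owner = nullptr;                          // release the lock
       //   StoreLoad fence;                              // avoid stranding a successor
       //   if (m->_entry_list == nullptr) goto success;  // no waiters
       //   if (m->_succ != nullptr)       goto success;  // a successor will retry
       //   thread->_unlocked_inflated_monitor = m;       // let the slow path try to reacquire
       //   goto slow_path;                               // ZF == 0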
 457   Label LSuccess, LNotRecursive;
 458 
 459   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 460   jccb(Assembler::equal, LNotRecursive);
 461 
 462   // Recursive inflated unlock
 463   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 464   jmpb(LSuccess);
 465 
 466   bind(LNotRecursive);
 467 
 468   // Set owner to null.
 469   // Release to satisfy the JMM
 470   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 471   // We need a full fence after clearing owner to avoid stranding.
 472   // StoreLoad achieves this.
 473   membar(StoreLoad);
 474 
 475   // Check if the entry_list is empty.
 476   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 477   jccb(Assembler::zero, LSuccess);    // If so we are done.
 478 
 479   // Check if there is a successor.
 480   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 481   jccb(Assembler::notZero, LSuccess); // If so we are done.
 482 
 483   // Save the monitor pointer in the current thread, so we can try to
 484   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 485   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 486   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 487 
 488   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 489   jmpb  (DONE_LABEL);
 490 
 491   bind  (LSuccess);
 492   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 493   jmpb  (DONE_LABEL);
 494 #endif  // _LP64
 495 
 496   if (LockingMode == LM_LEGACY) {
 497     bind  (Stacked);
 498     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 499     lock();
 500     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 501     // Intentional fall-thru into DONE_LABEL
 502   }
 503 
 504   bind(DONE_LABEL);
 505 
 506   // ZFlag == 1 count in fast path
 507   // ZFlag == 0 count in slow path
 508   jccb(Assembler::notZero, NO_COUNT);
 509 
 510   bind(COUNT);
 511 
 512   if (LockingMode == LM_LEGACY) {
 513     // Count monitors in fast path
 514 #ifdef _LP64
 515     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 516 #endif
 517   }
 518 
 519   xorl(tmpReg, tmpReg); // Set ZF == 1
 520 
 521   bind(NO_COUNT);
 522 }
 523 
 524 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 525                                               Register t, Register thread) {
 526   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 527   assert(rax_reg == rax, "Used for CAS");
 528   assert_different_registers(obj, box, rax_reg, t, thread);
 529 
 530   // Handle inflated monitor.
 531   Label inflated;
 532   // Finish fast lock successfully. ZF value is irrelevant.
 533   Label locked;
 534   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 535   Label slow_path;
 536 
 537   if (UseObjectMonitorTable) {
 538     // Clear cache in case fast locking succeeds.
 539     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 540   }
 541 
 542   if (DiagnoseSyncOnValueBasedClasses != 0) {
 543     load_klass(rax_reg, obj, t);
 544     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 545     jcc(Assembler::notZero, slow_path);
 546   }
 547 
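       // Sketch of the lightweight-locking fast path below (pseudocode):
       //
       //   mark = obj->mark();
       //   if (mark has the monitor bit set)        goto inflated;
       //   if (the per-thread lock-stack is full)   goto slow_path;
       //   if (lock-stack top element == obj)       goto push;       // recursive case
       //   if (!CAS(obj->mark(): mark|unlocked_value -> mark&~unlocked_value)) goto slow_path;
       //  push:
       //   lock-stack[top++] = obj;
       //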
 548   const Register mark = t;
 549 
 550   { // Lightweight Lock
 551 
 552     Label push;
 553 
 554     const Register top = UseObjectMonitorTable ? rax_reg : box;
 555 
 556     // Load the mark.
 557     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 558 
 559     // Prefetch top.
 560     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 561 
 562     // Check for monitor (0b10).
 563     testptr(mark, markWord::monitor_value);
 564     jcc(Assembler::notZero, inflated);
 565 
 566     // Check if lock-stack is full.
 567     cmpl(top, LockStack::end_offset() - 1);
 568     jcc(Assembler::greater, slow_path);
 569 
 570     // Check if recursive.
 571     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 572     jccb(Assembler::equal, push);
 573 
 574     // Try to lock. Transition lock bits 0b01 => 0b00
 575     movptr(rax_reg, mark);
 576     orptr(rax_reg, markWord::unlocked_value);
 577     andptr(mark, ~(int32_t)markWord::unlocked_value);
 578     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 579     jcc(Assembler::notEqual, slow_path);
 580 
 581     if (UseObjectMonitorTable) {
 582       // Need to reload top, clobbered by CAS.
 583       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 584     }
 585     bind(push);
 586     // After successful lock, push object on lock-stack.
 587     movptr(Address(thread, top), obj);
 588     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 589     jmpb(locked);
 590   }
 591 
 592   { // Handle inflated monitor.
 593     bind(inflated);
 594 
 595 #ifndef _LP64
 596     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 597     orl(box, 1);  // set ICC.ZF=0 to indicate failure
 598     jmpb(slow_path);
 599 #else
 600     const Register monitor = t;
 601 
 602     if (!UseObjectMonitorTable) {
 603       assert(mark == monitor, "should be the same here");
 604     } else {
 605       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 606       // Fetch ObjectMonitor* from the cache or take the slow-path.
 607       Label monitor_found;
 608 
 609       // Load cache address
 610       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 611 
 612       const int num_unrolled = 2;
 613       for (int i = 0; i < num_unrolled; i++) {
 614         cmpptr(obj, Address(t));
 615         jccb(Assembler::equal, monitor_found);
 616         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 617       }
 618 
 619       Label loop;
 620 
 621       // Search for obj in cache.
 622       bind(loop);
 623 
 624       // Check for match.
 625       cmpptr(obj, Address(t));
 626       jccb(Assembler::equal, monitor_found);
 627 
 628       // Search until null encountered, guaranteed _null_sentinel at end.
 629       cmpptr(Address(t), 1);
 630       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 631       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 632       jmpb(loop);
 633 
 634       // Cache hit.
 635       bind(monitor_found);
 636       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 637     }
 638     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 639     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 640     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 641 
 642     Label monitor_locked;
 643     // Lock the monitor.
 644 
 645     if (UseObjectMonitorTable) {
 646       // Cache the monitor for unlock before trashing box. On failure to acquire
 647       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 648       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 649     }
 650 
 651     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 652     xorptr(rax_reg, rax_reg);
 653     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 654     lock(); cmpxchgptr(box, owner_address);
 655     jccb(Assembler::equal, monitor_locked);
 656 
 657     // Check if recursive.
 658     cmpptr(box, rax_reg);
 659     jccb(Assembler::notEqual, slow_path);
 660 
 661     // Recursive.
 662     increment(recursions_address);
 663 
 664     bind(monitor_locked);
 665 #endif  // _LP64
 666   }
 667 
 668   bind(locked);
 669   // Set ZF = 1
 670   xorl(rax_reg, rax_reg);
 671 
 672 #ifdef ASSERT
 673   // Check that locked label is reached with ZF set.
 674   Label zf_correct;
 675   Label zf_bad_zero;
 676   jcc(Assembler::zero, zf_correct);
 677   jmp(zf_bad_zero);
 678 #endif
 679 
 680   bind(slow_path);
 681 #ifdef ASSERT
 682   // Check that slow_path label is reached with ZF not set.
 683   jcc(Assembler::notZero, zf_correct);
 684   stop("Fast Lock ZF != 0");
 685   bind(zf_bad_zero);
 686   stop("Fast Lock ZF != 1");
 687   bind(zf_correct);
 688 #endif
 689   // C2 uses the value of ZF to determine the continuation.
 690 }
 691 
 692 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 693   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 694   assert(reg_rax == rax, "Used for CAS");
 695   assert_different_registers(obj, reg_rax, t);
 696 
 697   // Handle inflated monitor.
 698   Label inflated, inflated_check_lock_stack;
 699   // Finish fast unlock successfully.  MUST jump with ZF == 1
 700   Label unlocked, slow_path;
 701 
 702   const Register mark = t;
 703   const Register monitor = t;
 704   const Register top = UseObjectMonitorTable ? t : reg_rax;
 705   const Register box = reg_rax;
 706 
 707   Label dummy;
 708   C2FastUnlockLightweightStub* stub = nullptr;
 709 
 710   if (!Compile::current()->output()->in_scratch_emit_size()) {
 711     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 712     Compile::current()->output()->add_stub(stub);
 713   }
 714 
 715   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 716 
 717   { // Lightweight Unlock
 718 
 719     // Load top.
 720     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 721 
 722     if (!UseObjectMonitorTable) {
 723       // Prefetch mark.
 724       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 725     }
 726 
 727     // Check if obj is top of lock-stack.
 728     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 729     // Top of lock stack was not obj. Must be monitor.
 730     jcc(Assembler::notEqual, inflated_check_lock_stack);
 731 
 732     // Pop lock-stack.
 733     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 734     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 735 
 736     // Check if recursive.
 737     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 738     jcc(Assembler::equal, unlocked);
 739 
 740     // We elide the monitor check, let the CAS fail instead.
 741 
 742     if (UseObjectMonitorTable) {
 743       // Load mark.
 744       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 745     }
 746 
 747     // Try to unlock. Transition lock bits 0b00 => 0b01
 748     movptr(reg_rax, mark);
 749     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 750     orptr(mark, markWord::unlocked_value);
 751     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 752     jcc(Assembler::notEqual, push_and_slow_path);
 753     jmp(unlocked);
 754   }
 755 
 756 
 757   { // Handle inflated monitor.
 758     bind(inflated_check_lock_stack);
 759 #ifdef ASSERT
 760     Label check_done;
 761     subl(top, oopSize);
 762     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 763     jcc(Assembler::below, check_done);
 764     cmpptr(obj, Address(thread, top));
 765     jccb(Assembler::notEqual, inflated_check_lock_stack);
 766     stop("Fast Unlock lock on stack");
 767     bind(check_done);
 768     if (UseObjectMonitorTable) {
 769       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 770     }
 771     testptr(mark, markWord::monitor_value);
 772     jccb(Assembler::notZero, inflated);
 773     stop("Fast Unlock not monitor");
 774 #endif
 775 
 776     bind(inflated);
 777 
 778 #ifndef _LP64
 779     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 780     orl(t, 1);  // set ICC.ZF=0 to indicate failure
 781     jmpb(slow_path);
 782 #else
 783     if (!UseObjectMonitorTable) {
 784       assert(mark == monitor, "should be the same here");
 785     } else {
 786       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 787       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 788       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 789       cmpptr(monitor, alignof(ObjectMonitor*));
 790       jcc(Assembler::below, slow_path);
 791     }
 792     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 793     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 794     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 795     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 796     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 797 
 798     Label recursive;
 799 
 800     // Check if recursive.
 801     cmpptr(recursions_address, 0);
 802     jccb(Assembler::notZero, recursive);
 803 
 804     // Set owner to null.
 805     // Release to satisfy the JMM
 806     movptr(owner_address, NULL_WORD);
 807     // We need a full fence after clearing owner to avoid stranding.
 808     // StoreLoad achieves this.
 809     membar(StoreLoad);
 810 
 811     // Check if the entry_list is empty.
 812     cmpptr(entry_list_address, NULL_WORD);
 813     jccb(Assembler::zero, unlocked);    // If so we are done.
 814 
 815     // Check if there is a successor.
 816     cmpptr(succ_address, NULL_WORD);
 817     jccb(Assembler::notZero, unlocked); // If so we are done.
 818 
 819     // Save the monitor pointer in the current thread, so we can try to
 820     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 821     if (!UseObjectMonitorTable) {
 822       andptr(monitor, ~(int32_t)markWord::monitor_value);
 823     }
 824     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 825 
 826     orl(t, 1); // Fast Unlock ZF = 0
 827     jmpb(slow_path);
 828 
 829     // Recursive unlock.
 830     bind(recursive);
 831     decrement(recursions_address);
 832 #endif  // _LP64
 833   }
 834 
 835   bind(unlocked);
 836   xorl(t, t); // Fast Unlock ZF = 1
 837 
 838 #ifdef ASSERT
 839   // Check that unlocked label is reached with ZF set.
 840   Label zf_correct;
 841   Label zf_bad_zero;
 842   jcc(Assembler::zero, zf_correct);
 843   jmp(zf_bad_zero);
 844 #endif
 845 
 846   bind(slow_path);
 847   if (stub != nullptr) {
 848     bind(stub->slow_path_continuation());
 849   }
 850 #ifdef ASSERT
 851   // Check that stub->continuation() label is reached with ZF not set.
 852   jcc(Assembler::notZero, zf_correct);
 853   stop("Fast Unlock ZF != 0");
 854   bind(zf_bad_zero);
 855   stop("Fast Unlock ZF != 1");
 856   bind(zf_correct);
 857 #endif
 858   // C2 uses the value of ZF to determine the continuation.
 859 }
 860 
 861 //-------------------------------------------------------------------------------------------
 862 // Generic instructions support for use in .ad files C2 code generation
 863 
 864 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 865   if (dst != src) {
 866     movdqu(dst, src);
 867   }
 868   if (opcode == Op_AbsVD) {
 869     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 870   } else {
 871     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 872     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 873   }
 874 }
 875 
 876 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 877   if (opcode == Op_AbsVD) {
 878     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 879   } else {
 880     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 881     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 882   }
 883 }
 884 
 885 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 886   if (dst != src) {
 887     movdqu(dst, src);
 888   }
 889   if (opcode == Op_AbsVF) {
 890     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 891   } else {
 892     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 893     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 894   }
 895 }
 896 
 897 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 898   if (opcode == Op_AbsVF) {
 899     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 900   } else {
 901     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 902     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 903   }
 904 }
 905 
 906 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 907   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 908   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 909 
 910   if (opcode == Op_MinV) {
 911     if (elem_bt == T_BYTE) {
 912       pminsb(dst, src);
 913     } else if (elem_bt == T_SHORT) {
 914       pminsw(dst, src);
 915     } else if (elem_bt == T_INT) {
 916       pminsd(dst, src);
 917     } else {
 918       assert(elem_bt == T_LONG, "required");
 919       assert(tmp == xmm0, "required");
 920       assert_different_registers(dst, src, tmp);
 921       movdqu(xmm0, dst);
 922       pcmpgtq(xmm0, src);
 923       blendvpd(dst, src);  // xmm0 as mask
 924     }
 925   } else { // opcode == Op_MaxV
 926     if (elem_bt == T_BYTE) {
 927       pmaxsb(dst, src);
 928     } else if (elem_bt == T_SHORT) {
 929       pmaxsw(dst, src);
 930     } else if (elem_bt == T_INT) {
 931       pmaxsd(dst, src);
 932     } else {
 933       assert(elem_bt == T_LONG, "required");
 934       assert(tmp == xmm0, "required");
 935       assert_different_registers(dst, src, tmp);
 936       movdqu(xmm0, src);
 937       pcmpgtq(xmm0, dst);
 938       blendvpd(dst, src);  // xmm0 as mask
 939     }
 940   }
 941 }
 942 
 943 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 944                                   XMMRegister src1, Address src2, int vlen_enc) {
 945   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 946   if (opcode == Op_UMinV) {
 947     switch(elem_bt) {
 948       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 949       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 950       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 951       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 952       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 953     }
 954   } else {
 955     assert(opcode == Op_UMaxV, "required");
 956     switch(elem_bt) {
 957       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 958       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 959       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 960       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 961       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 962     }
 963   }
 964 }
 965 
 966 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 967   // For optimality, leverage a full vector width of 512 bits
 968   // for operations over smaller vector sizes on AVX512 targets.
 969   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 970     if (opcode == Op_UMaxV) {
 971       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 972     } else {
 973       assert(opcode == Op_UMinV, "required");
 974       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 975     }
 976   } else {
 977     // T1 = -1
 978     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 979     // T1 = -1 << 63
 980     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 981     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 982     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 983     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 984     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 985     // Mask = T2 > T1
 986     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 987     if (opcode == Op_UMaxV) {
 988       // Res = Mask ? Src2 : Src1
 989       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 990     } else {
 991       // Res = Mask ? Src1 : Src2
 992       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 993     }
 994   }
 995 }
 996 
 997 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 998                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 999   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
1000   if (opcode == Op_UMinV) {
1001     switch(elem_bt) {
1002       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1003       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1004       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1005       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1006       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1007     }
1008   } else {
1009     assert(opcode == Op_UMaxV, "required");
1010     switch(elem_bt) {
1011       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1012       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1013       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1014       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1015       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1016     }
1017   }
1018 }
1019 
1020 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1021                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1022                                  int vlen_enc) {
1023   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1024 
1025   if (opcode == Op_MinV) {
1026     if (elem_bt == T_BYTE) {
1027       vpminsb(dst, src1, src2, vlen_enc);
1028     } else if (elem_bt == T_SHORT) {
1029       vpminsw(dst, src1, src2, vlen_enc);
1030     } else if (elem_bt == T_INT) {
1031       vpminsd(dst, src1, src2, vlen_enc);
1032     } else {
1033       assert(elem_bt == T_LONG, "required");
1034       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1035         vpminsq(dst, src1, src2, vlen_enc);
1036       } else {
1037         assert_different_registers(dst, src1, src2);
1038         vpcmpgtq(dst, src1, src2, vlen_enc);
1039         vblendvpd(dst, src1, src2, dst, vlen_enc);
1040       }
1041     }
1042   } else { // opcode == Op_MaxV
1043     if (elem_bt == T_BYTE) {
1044       vpmaxsb(dst, src1, src2, vlen_enc);
1045     } else if (elem_bt == T_SHORT) {
1046       vpmaxsw(dst, src1, src2, vlen_enc);
1047     } else if (elem_bt == T_INT) {
1048       vpmaxsd(dst, src1, src2, vlen_enc);
1049     } else {
1050       assert(elem_bt == T_LONG, "required");
1051       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1052         vpmaxsq(dst, src1, src2, vlen_enc);
1053       } else {
1054         assert_different_registers(dst, src1, src2);
1055         vpcmpgtq(dst, src1, src2, vlen_enc);
1056         vblendvpd(dst, src2, src1, dst, vlen_enc);
1057       }
1058     }
1059   }
1060 }
1061 
1062 // Float/Double min max
1063 
1064 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1065                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1066                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1067                                    int vlen_enc) {
1068   assert(UseAVX > 0, "required");
1069   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1070          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1071   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1072   assert_different_registers(a, tmp, atmp, btmp);
1073   assert_different_registers(b, tmp, atmp, btmp);
1074 
1075   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1076   bool is_double_word = is_double_word_type(elem_bt);
1077 
1078   /* Note on 'non-obvious' assembly sequence:
1079    *
1080    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1081    * and Java on how they handle floats:
1082    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1083    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1084    *
1085    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1086    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1087    *                (only useful when signs differ, noop otherwise)
1088    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1089    *
1090    *  The following pseudocode describes the algorithm for max[FD] (the min algorithm is along similar lines):
1091    *   btmp = (b < +0.0) ? a : b
1092    *   atmp = (b < +0.0) ? b : a
1093    *   Tmp  = Max_Float(atmp , btmp)
1094    *   Res  = (atmp == NaN) ? atmp : Tmp
1095    */
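       //
       // Concrete examples of the Java-vs-hardware difference handled here (illustrative):
       //   Math.max(-0.0f, +0.0f) is +0.0f, but vmaxps returns its second operand when the inputs
       //     compare equal, so without the pre-blend the result would depend on operand order.
       //   Math.max(NaN, 1.0f) is NaN, but vmaxps(NaN, 1.0f) returns 1.0f (the second operand),
       //     hence the trailing UNORD_Q compare and blend.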
1096 
1097   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1098   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1099   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1100   XMMRegister mask;
1101 
1102   if (!is_double_word && is_min) {
1103     mask = a;
1104     vblend = &MacroAssembler::vblendvps;
1105     vmaxmin = &MacroAssembler::vminps;
1106     vcmp = &MacroAssembler::vcmpps;
1107   } else if (!is_double_word && !is_min) {
1108     mask = b;
1109     vblend = &MacroAssembler::vblendvps;
1110     vmaxmin = &MacroAssembler::vmaxps;
1111     vcmp = &MacroAssembler::vcmpps;
1112   } else if (is_double_word && is_min) {
1113     mask = a;
1114     vblend = &MacroAssembler::vblendvpd;
1115     vmaxmin = &MacroAssembler::vminpd;
1116     vcmp = &MacroAssembler::vcmppd;
1117   } else {
1118     assert(is_double_word && !is_min, "sanity");
1119     mask = b;
1120     vblend = &MacroAssembler::vblendvpd;
1121     vmaxmin = &MacroAssembler::vmaxpd;
1122     vcmp = &MacroAssembler::vcmppd;
1123   }
1124 
1125   // Pick temporaries so that the EnableX86ECoreOpts optimization isn't defeated by register overlaps.
1126   XMMRegister maxmin, scratch;
1127   if (dst == btmp) {
1128     maxmin = btmp;
1129     scratch = tmp;
1130   } else {
1131     maxmin = tmp;
1132     scratch = btmp;
1133   }
1134 
1135   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1136   if (precompute_mask && !is_double_word) {
1137     vpsrad(tmp, mask, 32, vlen_enc);
1138     mask = tmp;
1139   } else if (precompute_mask && is_double_word) {
1140     vpxor(tmp, tmp, tmp, vlen_enc);
1141     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1142     mask = tmp;
1143   }
1144 
1145   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1146   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1147   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1148   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1149   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1150 }
1151 
1152 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1153                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1154                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1155                                     int vlen_enc) {
1156   assert(UseAVX > 2, "required");
1157   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1158          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1159   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1160   assert_different_registers(dst, a, atmp, btmp);
1161   assert_different_registers(dst, b, atmp, btmp);
1162 
1163   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1164   bool is_double_word = is_double_word_type(elem_bt);
1165   bool merge = true;
1166 
1167   if (!is_double_word && is_min) {
1168     evpmovd2m(ktmp, a, vlen_enc);
1169     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1170     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1171     vminps(dst, atmp, btmp, vlen_enc);
1172     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1173     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1174   } else if (!is_double_word && !is_min) {
1175     evpmovd2m(ktmp, b, vlen_enc);
1176     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1177     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1178     vmaxps(dst, atmp, btmp, vlen_enc);
1179     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1180     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1181   } else if (is_double_word && is_min) {
1182     evpmovq2m(ktmp, a, vlen_enc);
1183     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1184     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1185     vminpd(dst, atmp, btmp, vlen_enc);
1186     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1187     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1188   } else {
1189     assert(is_double_word && !is_min, "sanity");
1190     evpmovq2m(ktmp, b, vlen_enc);
1191     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1192     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1193     vmaxpd(dst, atmp, btmp, vlen_enc);
1194     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1195     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1196   }
1197 }
1198 
1199 // Float/Double signum
1200 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1201   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1202 
1203   Label DONE_LABEL;
1204 
1205   if (opcode == Op_SignumF) {
1206     assert(UseSSE > 0, "required");
1207     ucomiss(dst, zero);
1208     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1209     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1210     movflt(dst, one);
1211     jcc(Assembler::above, DONE_LABEL);
1212     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1213   } else if (opcode == Op_SignumD) {
1214     assert(UseSSE > 1, "required");
1215     ucomisd(dst, zero);
1216     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1217     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1218     movdbl(dst, one);
1219     jcc(Assembler::above, DONE_LABEL);
1220     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1221   }
1222 
1223   bind(DONE_LABEL);
1224 }
1225 
1226 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1227   if (sign) {
1228     pmovsxbw(dst, src);
1229   } else {
1230     pmovzxbw(dst, src);
1231   }
1232 }
1233 
1234 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1235   if (sign) {
1236     vpmovsxbw(dst, src, vector_len);
1237   } else {
1238     vpmovzxbw(dst, src, vector_len);
1239   }
1240 }
1241 
1242 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1243   if (sign) {
1244     vpmovsxbd(dst, src, vector_len);
1245   } else {
1246     vpmovzxbd(dst, src, vector_len);
1247   }
1248 }
1249 
1250 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1251   if (sign) {
1252     vpmovsxwd(dst, src, vector_len);
1253   } else {
1254     vpmovzxwd(dst, src, vector_len);
1255   }
1256 }
1257 
1258 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1259                                      int shift, int vector_len) {
1260   if (opcode == Op_RotateLeftV) {
1261     if (etype == T_INT) {
1262       evprold(dst, src, shift, vector_len);
1263     } else {
1264       assert(etype == T_LONG, "expected type T_LONG");
1265       evprolq(dst, src, shift, vector_len);
1266     }
1267   } else {
1268     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1269     if (etype == T_INT) {
1270       evprord(dst, src, shift, vector_len);
1271     } else {
1272       assert(etype == T_LONG, "expected type T_LONG");
1273       evprorq(dst, src, shift, vector_len);
1274     }
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1279                                      XMMRegister shift, int vector_len) {
1280   if (opcode == Op_RotateLeftV) {
1281     if (etype == T_INT) {
1282       evprolvd(dst, src, shift, vector_len);
1283     } else {
1284       assert(etype == T_LONG, "expected type T_LONG");
1285       evprolvq(dst, src, shift, vector_len);
1286     }
1287   } else {
1288     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1289     if (etype == T_INT) {
1290       evprorvd(dst, src, shift, vector_len);
1291     } else {
1292       assert(etype == T_LONG, "expected type T_LONG");
1293       evprorvq(dst, src, shift, vector_len);
1294     }
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1299   if (opcode == Op_RShiftVI) {
1300     psrad(dst, shift);
1301   } else if (opcode == Op_LShiftVI) {
1302     pslld(dst, shift);
1303   } else {
1304     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1305     psrld(dst, shift);
1306   }
1307 }
1308 
1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1310   switch (opcode) {
1311     case Op_RShiftVI:  psrad(dst, shift); break;
1312     case Op_LShiftVI:  pslld(dst, shift); break;
1313     case Op_URShiftVI: psrld(dst, shift); break;
1314 
1315     default: assert(false, "%s", NodeClassNames[opcode]);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1320   if (opcode == Op_RShiftVI) {
1321     vpsrad(dst, nds, shift, vector_len);
1322   } else if (opcode == Op_LShiftVI) {
1323     vpslld(dst, nds, shift, vector_len);
1324   } else {
1325     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1326     vpsrld(dst, nds, shift, vector_len);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1331   switch (opcode) {
1332     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1333     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1334     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1335 
1336     default: assert(false, "%s", NodeClassNames[opcode]);
1337   }
1338 }
1339 
1340 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1341   switch (opcode) {
1342     case Op_RShiftVB:  // fall-through
1343     case Op_RShiftVS:  psraw(dst, shift); break;
1344 
1345     case Op_LShiftVB:  // fall-through
1346     case Op_LShiftVS:  psllw(dst, shift);   break;
1347 
1348     case Op_URShiftVS: // fall-through
1349     case Op_URShiftVB: psrlw(dst, shift);  break;
1350 
1351     default: assert(false, "%s", NodeClassNames[opcode]);
1352   }
1353 }
1354 
1355 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1356   switch (opcode) {
1357     case Op_RShiftVB:  // fall-through
1358     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1359 
1360     case Op_LShiftVB:  // fall-through
1361     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1362 
1363     case Op_URShiftVS: // fall-through
1364     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1365 
1366     default: assert(false, "%s", NodeClassNames[opcode]);
1367   }
1368 }
1369 
1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1371   switch (opcode) {
1372     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1373     case Op_LShiftVL:  psllq(dst, shift); break;
1374     case Op_URShiftVL: psrlq(dst, shift); break;
1375 
1376     default: assert(false, "%s", NodeClassNames[opcode]);
1377   }
1378 }
1379 
1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1381   if (opcode == Op_RShiftVL) {
1382     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1383   } else if (opcode == Op_LShiftVL) {
1384     psllq(dst, shift);
1385   } else {
1386     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1387     psrlq(dst, shift);
1388   }
1389 }
1390 
1391 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1392   switch (opcode) {
1393     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1394     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1395     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1402   if (opcode == Op_RShiftVL) {
1403     evpsraq(dst, nds, shift, vector_len);
1404   } else if (opcode == Op_LShiftVL) {
1405     vpsllq(dst, nds, shift, vector_len);
1406   } else {
1407     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1408     vpsrlq(dst, nds, shift, vector_len);
1409   }
1410 }
1411 
1412 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1413   switch (opcode) {
1414     case Op_RShiftVB:  // fall-through
1415     case Op_RShiftVS:  // fall-through
1416     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1417 
1418     case Op_LShiftVB:  // fall-through
1419     case Op_LShiftVS:  // fall-through
1420     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1421 
1422     case Op_URShiftVB: // fall-through
1423     case Op_URShiftVS: // fall-through
1424     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1425 
1426     default: assert(false, "%s", NodeClassNames[opcode]);
1427   }
1428 }
1429 
1430 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1431   switch (opcode) {
1432     case Op_RShiftVB:  // fall-through
1433     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1434 
1435     case Op_LShiftVB:  // fall-through
1436     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1437 
1438     case Op_URShiftVB: // fall-through
1439     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1440 
1441     default: assert(false, "%s", NodeClassNames[opcode]);
1442   }
1443 }
1444 
1445 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1446   assert(UseAVX >= 2, "required");
1447   switch (opcode) {
1448     case Op_RShiftVL: {
1449       if (UseAVX > 2) {
1450         assert(tmp == xnoreg, "not used");
1451         if (!VM_Version::supports_avx512vl()) {
1452           vlen_enc = Assembler::AVX_512bit;
1453         }
1454         evpsravq(dst, src, shift, vlen_enc);
1455       } else {
1456         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1457         vpsrlvq(dst, src, shift, vlen_enc);
1458         vpsrlvq(tmp, tmp, shift, vlen_enc);
1459         vpxor(dst, dst, tmp, vlen_enc);
1460         vpsubq(dst, dst, tmp, vlen_enc);
1461       }
1462       break;
1463     }
1464     case Op_LShiftVL: {
1465       assert(tmp == xnoreg, "not used");
1466       vpsllvq(dst, src, shift, vlen_enc);
1467       break;
1468     }
1469     case Op_URShiftVL: {
1470       assert(tmp == xnoreg, "not used");
1471       vpsrlvq(dst, src, shift, vlen_enc);
1472       break;
1473     }
1474     default: assert(false, "%s", NodeClassNames[opcode]);
1475   }
1476 }
1477 
1478 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1479 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1480   assert(opcode == Op_LShiftVB ||
1481          opcode == Op_RShiftVB ||
1482          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1483   bool sign = (opcode != Op_URShiftVB);
1484   assert(vector_len == 0, "required");
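       // Widen the byte elements to dwords (sign- or zero-extended to match the shift
       // kind), do a variable dword shift, mask the results back into byte range, and
       // pack the dwords down to words for the caller.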
1485   vextendbd(sign, dst, src, 1);
1486   vpmovzxbd(vtmp, shift, 1);
1487   varshiftd(opcode, dst, dst, vtmp, 1);
1488   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1489   vextracti128_high(vtmp, dst);
1490   vpackusdw(dst, dst, vtmp, 0);
1491 }
1492 
1493 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1494 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1495   assert(opcode == Op_LShiftVB ||
1496          opcode == Op_RShiftVB ||
1497          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1498   bool sign = (opcode != Op_URShiftVB);
1499   int ext_vector_len = vector_len + 1;
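       // Widen the byte elements to words one vector size up, do a variable word shift,
       // mask back into byte range, then pack the words back to bytes, using vpermq to
       // restore lane order when the widened vector is 512 bits.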
1500   vextendbw(sign, dst, src, ext_vector_len);
1501   vpmovzxbw(vtmp, shift, ext_vector_len);
1502   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1503   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1504   if (vector_len == 0) {
1505     vextracti128_high(vtmp, dst);
1506     vpackuswb(dst, dst, vtmp, vector_len);
1507   } else {
1508     vextracti64x4_high(vtmp, dst);
1509     vpackuswb(dst, dst, vtmp, vector_len);
1510     vpermq(dst, dst, 0xD8, vector_len);
1511   }
1512 }
1513 
1514 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1515   switch(typ) {
1516     case T_BYTE:
1517       pinsrb(dst, val, idx);
1518       break;
1519     case T_SHORT:
1520       pinsrw(dst, val, idx);
1521       break;
1522     case T_INT:
1523       pinsrd(dst, val, idx);
1524       break;
1525     case T_LONG:
1526       pinsrq(dst, val, idx);
1527       break;
1528     default:
1529       assert(false,"Should not reach here.");
1530       break;
1531   }
1532 }
1533 
1534 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1535   switch(typ) {
1536     case T_BYTE:
1537       vpinsrb(dst, src, val, idx);
1538       break;
1539     case T_SHORT:
1540       vpinsrw(dst, src, val, idx);
1541       break;
1542     case T_INT:
1543       vpinsrd(dst, src, val, idx);
1544       break;
1545     case T_LONG:
1546       vpinsrq(dst, src, val, idx);
1547       break;
1548     default:
1549       assert(false,"Should not reach here.");
1550       break;
1551   }
1552 }
1553 
1554 #ifdef _LP64
1555 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1556                                                 XMMRegister dst, Register base,
1557                                                 Register idx_base,
1558                                                 Register offset, Register mask,
1559                                                 Register mask_idx, Register rtmp,
1560                                                 int vlen_enc) {
1561   vpxor(dst, dst, dst, vlen_enc);
1562   if (elem_bt == T_SHORT) {
1563     for (int i = 0; i < 4; i++) {
1564       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1565       Label skip_load;
1566       btq(mask, mask_idx);
1567       jccb(Assembler::carryClear, skip_load);
1568       movl(rtmp, Address(idx_base, i * 4));
1569       if (offset != noreg) {
1570         addl(rtmp, offset);
1571       }
1572       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1573       bind(skip_load);
1574       incq(mask_idx);
1575     }
1576   } else {
1577     assert(elem_bt == T_BYTE, "");
1578     for (int i = 0; i < 8; i++) {
1579       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1580       Label skip_load;
1581       btq(mask, mask_idx);
1582       jccb(Assembler::carryClear, skip_load);
1583       movl(rtmp, Address(idx_base, i * 4));
1584       if (offset != noreg) {
1585         addl(rtmp, offset);
1586       }
1587       pinsrb(dst, Address(base, rtmp), i);
1588       bind(skip_load);
1589       incq(mask_idx);
1590     }
1591   }
1592 }
1593 #endif // _LP64
1594 
1595 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1596                                          Register base, Register idx_base,
1597                                          Register offset, Register rtmp,
1598                                          int vlen_enc) {
1599   vpxor(dst, dst, dst, vlen_enc);
1600   if (elem_bt == T_SHORT) {
1601     for (int i = 0; i < 4; i++) {
1602       // dst[i] = src[offset + idx_base[i]]
1603       movl(rtmp, Address(idx_base, i * 4));
1604       if (offset != noreg) {
1605         addl(rtmp, offset);
1606       }
1607       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1608     }
1609   } else {
1610     assert(elem_bt == T_BYTE, "");
1611     for (int i = 0; i < 8; i++) {
1612       // dst[i] = src[offset + idx_base[i]]
1613       movl(rtmp, Address(idx_base, i * 4));
1614       if (offset != noreg) {
1615         addl(rtmp, offset);
1616       }
1617       pinsrb(dst, Address(base, rtmp), i);
1618     }
1619   }
1620 }
1621 
1622 /*
1623  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1624  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1625  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1626  * permutation to place the slice into the appropriate vector lane
1627  * locations in the destination vector. The following pseudo code describes the
1628  * algorithm in detail:
1629  *
1630  * DST_VEC = ZERO_VEC
1631  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1632  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1633  * FOREACH_ITER:
1634  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1635  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1636  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1637  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1638  *
1639  * With each iteration, the doubleword permute indices (0, 1) corresponding
1640  * to the gathered quadword get right shifted by two lane positions.
1641  *
1642  */
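     /*
      * Illustrative only: a scalar sketch of the loop above for the byte case, not
      * emitted code. Here 'idx' stands in for the gather indices array and 'test_bit'
      * for the btq-based mask probe; in the vector code each 8-byte slice is instead
      * placed into its destination lanes via vpermd/vpor.
      *
      *   for (int slice = 0; slice < vector_len / 8; slice++) {
      *     uint64_t bits = 0;
      *     for (int i = 0; i < 8; i++) {
      *       int e = slice * 8 + i;
      *       uint8_t v = (mask == nullptr || test_bit(mask, e)) ? base[offset + idx[e]] : 0;
      *       bits |= (uint64_t)v << (8 * i);
      *     }
      *     ((uint64_t*)dst)[slice] = bits;
      *   }
      */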
1643 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1644                                         Register base, Register idx_base,
1645                                         Register offset, Register mask,
1646                                         XMMRegister xtmp1, XMMRegister xtmp2,
1647                                         XMMRegister temp_dst, Register rtmp,
1648                                         Register mask_idx, Register length,
1649                                         int vector_len, int vlen_enc) {
1650   Label GATHER8_LOOP;
1651   assert(is_subword_type(elem_ty), "");
1652   movl(length, vector_len);
1653   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1654   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1655   vallones(xtmp2, vlen_enc);
1656   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1657   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1658   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1659 
1660   bind(GATHER8_LOOP);
1661     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1662     if (mask == noreg) {
1663       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1664     } else {
1665       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1666     }
1667     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1668     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1669     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1670     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1671     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1672     vpor(dst, dst, temp_dst, vlen_enc);
1673     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1674     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1675     jcc(Assembler::notEqual, GATHER8_LOOP);
1676 }
1677 
1678 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1679   switch(typ) {
1680     case T_INT:
1681       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1682       break;
1683     case T_FLOAT:
1684       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1685       break;
1686     case T_LONG:
1687       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1688       break;
1689     case T_DOUBLE:
1690       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1691       break;
1692     default:
1693       assert(false,"Should not reach here.");
1694       break;
1695   }
1696 }
1697 
1698 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1699   switch(typ) {
1700     case T_INT:
1701       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1702       break;
1703     case T_FLOAT:
1704       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1705       break;
1706     case T_LONG:
1707       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1708       break;
1709     case T_DOUBLE:
1710       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1711       break;
1712     default:
1713       assert(false,"Should not reach here.");
1714       break;
1715   }
1716 }
1717 
1718 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1719   switch(typ) {
1720     case T_INT:
1721       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1722       break;
1723     case T_FLOAT:
1724       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1725       break;
1726     case T_LONG:
1727       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1728       break;
1729     case T_DOUBLE:
1730       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1731       break;
1732     default:
1733       assert(false,"Should not reach here.");
1734       break;
1735   }
1736 }
1737 
1738 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
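       // The source mask holds one byte per element, either 0 or 1. Subtracting it from
       // zero turns 1 into -1 (all bits set); for elements wider than a byte, the sign
       // extension below widens each 0/-1 byte to the full element size.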
1739   if (vlen_in_bytes <= 16) {
1740     pxor (dst, dst);
1741     psubb(dst, src);
1742     switch (elem_bt) {
1743       case T_BYTE:   /* nothing to do */ break;
1744       case T_SHORT:  pmovsxbw(dst, dst); break;
1745       case T_INT:    pmovsxbd(dst, dst); break;
1746       case T_FLOAT:  pmovsxbd(dst, dst); break;
1747       case T_LONG:   pmovsxbq(dst, dst); break;
1748       case T_DOUBLE: pmovsxbq(dst, dst); break;
1749 
1750       default: assert(false, "%s", type2name(elem_bt));
1751     }
1752   } else {
1753     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1754     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1755 
1756     vpxor (dst, dst, dst, vlen_enc);
1757     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1758 
1759     switch (elem_bt) {
1760       case T_BYTE:   /* nothing to do */            break;
1761       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1762       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1763       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1764       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1765       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1766 
1767       default: assert(false, "%s", type2name(elem_bt));
1768     }
1769   }
1770 }
1771 
1772 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1773   if (novlbwdq) {
1774     vpmovsxbd(xtmp, src, vlen_enc);
1775     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1776             Assembler::eq, true, vlen_enc, noreg);
1777   } else {
1778     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1779     vpsubb(xtmp, xtmp, src, vlen_enc);
1780     evpmovb2m(dst, xtmp, vlen_enc);
1781   }
1782 }
1783 
1784 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1785   if (is_integral_type(bt)) {
1786     switch (vlen_in_bytes) {
1787       case 4:  movdl(dst, src);   break;
1788       case 8:  movq(dst, src);    break;
1789       case 16: movdqu(dst, src);  break;
1790       case 32: vmovdqu(dst, src); break;
1791       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1792       default: ShouldNotReachHere();
1793     }
1794   } else {
1795     switch (vlen_in_bytes) {
1796       case 4:  movflt(dst, src); break;
1797       case 8:  movdbl(dst, src); break;
1798       case 16: movups(dst, src); break;
1799       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1800       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1801       default: ShouldNotReachHere();
1802     }
1803   }
1804 }
1805 
1806 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1807   assert(rscratch != noreg || always_reachable(src), "missing");
1808 
1809   if (reachable(src)) {
1810     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1811   } else {
1812     lea(rscratch, src);
1813     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1818   int vlen_enc = vector_length_encoding(vlen);
1819   if (VM_Version::supports_avx()) {
1820     if (bt == T_LONG) {
1821       if (VM_Version::supports_avx2()) {
1822         vpbroadcastq(dst, src, vlen_enc);
1823       } else {
1824         vmovddup(dst, src, vlen_enc);
1825       }
1826     } else if (bt == T_DOUBLE) {
1827       if (vlen_enc != Assembler::AVX_128bit) {
1828         vbroadcastsd(dst, src, vlen_enc, noreg);
1829       } else {
1830         vmovddup(dst, src, vlen_enc);
1831       }
1832     } else {
1833       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1834         vpbroadcastd(dst, src, vlen_enc);
1835       } else {
1836         vbroadcastss(dst, src, vlen_enc);
1837       }
1838     }
1839   } else if (VM_Version::supports_sse3()) {
1840     movddup(dst, src);
1841   } else {
1842     load_vector(bt, dst, src, vlen);
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1847   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1848   int offset = exact_log2(type2aelembytes(bt)) << 6;
1849   if (is_floating_point_type(bt)) {
1850     offset += 128;
1851   }
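       // Resulting offsets: T_BYTE = 0, T_SHORT = 64, T_INT = 128, T_LONG = 192,
       // T_FLOAT = 128 + 128 = 256, T_DOUBLE = 192 + 128 = 320.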
1852   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1853   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1854 }
1855 
1856 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1857 
1858 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1859   int vector_len = Assembler::AVX_128bit;
1860 
1861   switch (opcode) {
1862     case Op_AndReductionV:  pand(dst, src); break;
1863     case Op_OrReductionV:   por (dst, src); break;
1864     case Op_XorReductionV:  pxor(dst, src); break;
1865     case Op_MinReductionV:
1866       switch (typ) {
1867         case T_BYTE:        pminsb(dst, src); break;
1868         case T_SHORT:       pminsw(dst, src); break;
1869         case T_INT:         pminsd(dst, src); break;
1870         case T_LONG:        assert(UseAVX > 2, "required");
1871                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1872         default:            assert(false, "wrong type");
1873       }
1874       break;
1875     case Op_MaxReductionV:
1876       switch (typ) {
1877         case T_BYTE:        pmaxsb(dst, src); break;
1878         case T_SHORT:       pmaxsw(dst, src); break;
1879         case T_INT:         pmaxsd(dst, src); break;
1880         case T_LONG:        assert(UseAVX > 2, "required");
1881                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1882         default:            assert(false, "wrong type");
1883       }
1884       break;
1885     case Op_AddReductionVF: addss(dst, src); break;
1886     case Op_AddReductionVD: addsd(dst, src); break;
1887     case Op_AddReductionVI:
1888       switch (typ) {
1889         case T_BYTE:        paddb(dst, src); break;
1890         case T_SHORT:       paddw(dst, src); break;
1891         case T_INT:         paddd(dst, src); break;
1892         default:            assert(false, "wrong type");
1893       }
1894       break;
1895     case Op_AddReductionVL: paddq(dst, src); break;
1896     case Op_MulReductionVF: mulss(dst, src); break;
1897     case Op_MulReductionVD: mulsd(dst, src); break;
1898     case Op_MulReductionVI:
1899       switch (typ) {
1900         case T_SHORT:       pmullw(dst, src); break;
1901         case T_INT:         pmulld(dst, src); break;
1902         default:            assert(false, "wrong type");
1903       }
1904       break;
1905     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1906                             evpmullq(dst, dst, src, vector_len); break;
1907     default:                assert(false, "wrong opcode");
1908   }
1909 }
1910 
1911 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1912   switch (opcode) {
1913     case Op_AddReductionVF: addps(dst, src); break;
1914     case Op_AddReductionVD: addpd(dst, src); break;
1915     case Op_MulReductionVF: mulps(dst, src); break;
1916     case Op_MulReductionVD: mulpd(dst, src); break;
1917     default:                assert(false, "%s", NodeClassNames[opcode]);
1918   }
1919 }
1920 
1921 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1922   int vector_len = Assembler::AVX_256bit;
1923 
1924   switch (opcode) {
1925     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1926     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1927     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1928     case Op_MinReductionV:
1929       switch (typ) {
1930         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1931         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1932         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1933         case T_LONG:        assert(UseAVX > 2, "required");
1934                             vpminsq(dst, src1, src2, vector_len); break;
1935         default:            assert(false, "wrong type");
1936       }
1937       break;
1938     case Op_MaxReductionV:
1939       switch (typ) {
1940         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1941         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1942         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1943         case T_LONG:        assert(UseAVX > 2, "required");
1944                             vpmaxsq(dst, src1, src2, vector_len); break;
1945         default:            assert(false, "wrong type");
1946       }
1947       break;
1948     case Op_AddReductionVI:
1949       switch (typ) {
1950         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1951         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1952         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1953         default:            assert(false, "wrong type");
1954       }
1955       break;
1956     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1957     case Op_MulReductionVI:
1958       switch (typ) {
1959         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1960         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1961         default:            assert(false, "wrong type");
1962       }
1963       break;
1964     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1965     default:                assert(false, "wrong opcode");
1966   }
1967 }
1968 
1969 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1970   int vector_len = Assembler::AVX_256bit;
1971 
1972   switch (opcode) {
1973     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1974     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1975     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1976     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1977     default:                assert(false, "%s", NodeClassNames[opcode]);
1978   }
1979 }
1980 
1981 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1982                                   XMMRegister dst, XMMRegister src,
1983                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1984   switch (opcode) {
1985     case Op_AddReductionVF:
1986     case Op_MulReductionVF:
1987       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1988       break;
1989 
1990     case Op_AddReductionVD:
1991     case Op_MulReductionVD:
1992       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1993       break;
1994 
1995     default: assert(false, "wrong opcode");
1996   }
1997 }
1998 
1999 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
2000                                             XMMRegister dst, XMMRegister src,
2001                                             XMMRegister vtmp1, XMMRegister vtmp2) {
2002   switch (opcode) {
2003     case Op_AddReductionVF:
2004     case Op_MulReductionVF:
2005       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2006       break;
2007 
2008     case Op_AddReductionVD:
2009     case Op_MulReductionVD:
2010       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2011       break;
2012 
2013     default: assert(false, "%s", NodeClassNames[opcode]);
2014   }
2015 }
2016 
2017 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2018                              Register dst, Register src1, XMMRegister src2,
2019                              XMMRegister vtmp1, XMMRegister vtmp2) {
2020   switch (vlen) {
2021     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2022     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2023     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2024     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2025 
2026     default: assert(false, "wrong vector length");
2027   }
2028 }
2029 
2030 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2031                              Register dst, Register src1, XMMRegister src2,
2032                              XMMRegister vtmp1, XMMRegister vtmp2) {
2033   switch (vlen) {
2034     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2035     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2036     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2037     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2038 
2039     default: assert(false, "wrong vector length");
2040   }
2041 }
2042 
2043 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2044                              Register dst, Register src1, XMMRegister src2,
2045                              XMMRegister vtmp1, XMMRegister vtmp2) {
2046   switch (vlen) {
2047     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2048     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2049     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2050     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2051 
2052     default: assert(false, "wrong vector length");
2053   }
2054 }
2055 
2056 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2057                              Register dst, Register src1, XMMRegister src2,
2058                              XMMRegister vtmp1, XMMRegister vtmp2) {
2059   switch (vlen) {
2060     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2061     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2062     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2063     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2064 
2065     default: assert(false, "wrong vector length");
2066   }
2067 }
2068 
2069 #ifdef _LP64
2070 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2071                              Register dst, Register src1, XMMRegister src2,
2072                              XMMRegister vtmp1, XMMRegister vtmp2) {
2073   switch (vlen) {
2074     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2075     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2076     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2077 
2078     default: assert(false, "wrong vector length");
2079   }
2080 }
2081 #endif // _LP64
2082 
2083 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2084   switch (vlen) {
2085     case 2:
2086       assert(vtmp2 == xnoreg, "");
2087       reduce2F(opcode, dst, src, vtmp1);
2088       break;
2089     case 4:
2090       assert(vtmp2 == xnoreg, "");
2091       reduce4F(opcode, dst, src, vtmp1);
2092       break;
2093     case 8:
2094       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2095       break;
2096     case 16:
2097       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2098       break;
2099     default: assert(false, "wrong vector length");
2100   }
2101 }
2102 
2103 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2104   switch (vlen) {
2105     case 2:
2106       assert(vtmp2 == xnoreg, "");
2107       reduce2D(opcode, dst, src, vtmp1);
2108       break;
2109     case 4:
2110       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2111       break;
2112     case 8:
2113       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2114       break;
2115     default: assert(false, "wrong vector length");
2116   }
2117 }
2118 
2119 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2120   switch (vlen) {
2121     case 2:
2122       assert(vtmp1 == xnoreg, "");
2123       assert(vtmp2 == xnoreg, "");
2124       unorderedReduce2F(opcode, dst, src);
2125       break;
2126     case 4:
2127       assert(vtmp2 == xnoreg, "");
2128       unorderedReduce4F(opcode, dst, src, vtmp1);
2129       break;
2130     case 8:
2131       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2132       break;
2133     case 16:
2134       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2135       break;
2136     default: assert(false, "wrong vector length");
2137   }
2138 }
2139 
2140 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2141   switch (vlen) {
2142     case 2:
2143       assert(vtmp1 == xnoreg, "");
2144       assert(vtmp2 == xnoreg, "");
2145       unorderedReduce2D(opcode, dst, src);
2146       break;
2147     case 4:
2148       assert(vtmp2 == xnoreg, "");
2149       unorderedReduce4D(opcode, dst, src, vtmp1);
2150       break;
2151     case 8:
2152       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2153       break;
2154     default: assert(false, "wrong vector length");
2155   }
2156 }
2157 
2158 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
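       // Fold the two int lanes of src2 into one (a horizontal add for Op_AddReductionVI,
       // otherwise shuffle lane 1 down and apply the reduction op), then combine the
       // result with the scalar accumulator src1.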
2159   if (opcode == Op_AddReductionVI) {
2160     if (vtmp1 != src2) {
2161       movdqu(vtmp1, src2);
2162     }
2163     phaddd(vtmp1, vtmp1);
2164   } else {
2165     pshufd(vtmp1, src2, 0x1);
2166     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2167   }
2168   movdl(vtmp2, src1);
2169   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2170   movdl(dst, vtmp1);
2171 }
2172 
2173 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2174   if (opcode == Op_AddReductionVI) {
2175     if (vtmp1 != src2) {
2176       movdqu(vtmp1, src2);
2177     }
2178     phaddd(vtmp1, src2);
2179     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2180   } else {
2181     pshufd(vtmp2, src2, 0xE);
2182     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2183     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2184   }
2185 }
2186 
2187 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2188   if (opcode == Op_AddReductionVI) {
2189     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2190     vextracti128_high(vtmp2, vtmp1);
2191     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2192     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2193   } else {
2194     vextracti128_high(vtmp1, src2);
2195     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2196     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2197   }
2198 }
2199 
2200 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2201   vextracti64x4_high(vtmp2, src2);
2202   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2203   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2204 }
2205 
2206 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2207   pshufd(vtmp2, src2, 0x1);
2208   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2209   movdqu(vtmp1, vtmp2);
2210   psrldq(vtmp1, 2);
2211   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2212   movdqu(vtmp2, vtmp1);
2213   psrldq(vtmp2, 1);
2214   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2215   movdl(vtmp2, src1);
2216   pmovsxbd(vtmp1, vtmp1);
2217   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2218   pextrb(dst, vtmp1, 0x0);
2219   movsbl(dst, dst);
2220 }
2221 
2222 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2223   pshufd(vtmp1, src2, 0xE);
2224   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2225   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2226 }
2227 
2228 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2229   vextracti128_high(vtmp2, src2);
2230   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2231   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2232 }
2233 
2234 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2235   vextracti64x4_high(vtmp1, src2);
2236   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2237   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2238 }
2239 
2240 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2241   pmovsxbw(vtmp2, src2);
2242   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2243 }
2244 
2245 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2246   if (UseAVX > 1) {
2247     int vector_len = Assembler::AVX_256bit;
2248     vpmovsxbw(vtmp1, src2, vector_len);
2249     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2250   } else {
2251     pmovsxbw(vtmp2, src2);
2252     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2253     pshufd(vtmp2, src2, 0x1);
2254     pmovsxbw(vtmp2, src2);
2255     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2256   }
2257 }
2258 
2259 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2260   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2261     int vector_len = Assembler::AVX_512bit;
2262     vpmovsxbw(vtmp1, src2, vector_len);
2263     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2264   } else {
2265     assert(UseAVX >= 2,"Should not reach here.");
2266     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2267     vextracti128_high(vtmp2, src2);
2268     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2269   }
2270 }
2271 
2272 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2273   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2274   vextracti64x4_high(vtmp2, src2);
2275   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2276 }
2277 
2278 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2279   if (opcode == Op_AddReductionVI) {
2280     if (vtmp1 != src2) {
2281       movdqu(vtmp1, src2);
2282     }
2283     phaddw(vtmp1, vtmp1);
2284     phaddw(vtmp1, vtmp1);
2285   } else {
2286     pshufd(vtmp2, src2, 0x1);
2287     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2288     movdqu(vtmp1, vtmp2);
2289     psrldq(vtmp1, 2);
2290     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2291   }
2292   movdl(vtmp2, src1);
2293   pmovsxwd(vtmp1, vtmp1);
2294   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2295   pextrw(dst, vtmp1, 0x0);
2296   movswl(dst, dst);
2297 }
2298 
2299 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2300   if (opcode == Op_AddReductionVI) {
2301     if (vtmp1 != src2) {
2302       movdqu(vtmp1, src2);
2303     }
2304     phaddw(vtmp1, src2);
2305   } else {
2306     pshufd(vtmp1, src2, 0xE);
2307     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2308   }
2309   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2310 }
2311 
2312 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2313   if (opcode == Op_AddReductionVI) {
2314     int vector_len = Assembler::AVX_256bit;
2315     vphaddw(vtmp2, src2, src2, vector_len);
2316     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2317   } else {
2318     vextracti128_high(vtmp2, src2);
2319     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2320   }
2321   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2322 }
2323 
2324 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2325   int vector_len = Assembler::AVX_256bit;
2326   vextracti64x4_high(vtmp1, src2);
2327   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2328   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2329 }
2330 
2331 #ifdef _LP64
2332 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2333   pshufd(vtmp2, src2, 0xE);
2334   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2335   movdq(vtmp1, src1);
2336   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2337   movdq(dst, vtmp1);
2338 }
2339 
2340 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2341   vextracti128_high(vtmp1, src2);
2342   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2343   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2344 }
2345 
2346 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2347   vextracti64x4_high(vtmp2, src2);
2348   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2349   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2350 }
2351 
2352 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
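       // Build a mask with the low 'len' bits set: start from all ones, clear bit 'len'
       // and above with bzhi, then move the result into the mask register.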
2353   mov64(temp, -1L);
2354   bzhiq(temp, temp, len);
2355   kmovql(dst, temp);
2356 }
2357 #endif // _LP64
2358 
2359 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2360   reduce_operation_128(T_FLOAT, opcode, dst, src);
2361   pshufd(vtmp, src, 0x1);
2362   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2363 }
2364 
2365 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2366   reduce2F(opcode, dst, src, vtmp);
2367   pshufd(vtmp, src, 0x2);
2368   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2369   pshufd(vtmp, src, 0x3);
2370   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2371 }
2372 
2373 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2374   reduce4F(opcode, dst, src, vtmp2);
2375   vextractf128_high(vtmp2, src);
2376   reduce4F(opcode, dst, vtmp2, vtmp1);
2377 }
2378 
2379 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2380   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2381   vextracti64x4_high(vtmp1, src);
2382   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2383 }
2384 
2385 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2386   pshufd(dst, src, 0x1);
2387   reduce_operation_128(T_FLOAT, opcode, dst, src);
2388 }
2389 
2390 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2391   pshufd(vtmp, src, 0xE);
2392   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2393   unorderedReduce2F(opcode, dst, vtmp);
2394 }
2395 
2396 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2397   vextractf128_high(vtmp1, src);
2398   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2399   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2400 }
2401 
2402 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2403   vextractf64x4_high(vtmp2, src);
2404   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2405   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2406 }
2407 
2408 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2409   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2410   pshufd(vtmp, src, 0xE);
2411   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2412 }
2413 
2414 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2415   reduce2D(opcode, dst, src, vtmp2);
2416   vextractf128_high(vtmp2, src);
2417   reduce2D(opcode, dst, vtmp2, vtmp1);
2418 }
2419 
2420 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2421   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2422   vextracti64x4_high(vtmp1, src);
2423   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2424 }
2425 
2426 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2427   pshufd(dst, src, 0xE);
2428   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2429 }
2430 
2431 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2432   vextractf128_high(vtmp, src);
2433   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2434   unorderedReduce2D(opcode, dst, vtmp);
2435 }
2436 
2437 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2438   vextractf64x4_high(vtmp2, src);
2439   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2440   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2441 }
2442 
2443 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2444   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2445 }
2446 
2447 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2448   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2449 }
2450 
2451 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2452   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2453 }
2454 
2455 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2456                                  int vec_enc) {
2457   switch(elem_bt) {
2458     case T_INT:
2459     case T_FLOAT:
2460       vmaskmovps(dst, src, mask, vec_enc);
2461       break;
2462     case T_LONG:
2463     case T_DOUBLE:
2464       vmaskmovpd(dst, src, mask, vec_enc);
2465       break;
2466     default:
2467       fatal("Unsupported type %s", type2name(elem_bt));
2468       break;
2469   }
2470 }
2471 
2472 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2473                                  int vec_enc) {
2474   switch(elem_bt) {
2475     case T_INT:
2476     case T_FLOAT:
2477       vmaskmovps(dst, src, mask, vec_enc);
2478       break;
2479     case T_LONG:
2480     case T_DOUBLE:
2481       vmaskmovpd(dst, src, mask, vec_enc);
2482       break;
2483     default:
2484       fatal("Unsupported type %s", type2name(elem_bt));
2485       break;
2486   }
2487 }
2488 
2489 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2490                                           XMMRegister dst, XMMRegister src,
2491                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2492                                           XMMRegister xmm_0, XMMRegister xmm_1) {
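       // Reduce by repeatedly halving the live width: extract the upper half for the
       // 512-bit (i == 3) and 256-bit (i == 2) steps, or permute within the 128-bit lane
       // (permconst[1] == 14 moves elements {2,3} down, permconst[0] == 1 brings element 1
       // into slot 0), combining with vminmax_fp at every step. If dst carries a valid
       // starting value, it is folded in at the end.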
2493   const int permconst[] = {1, 14};
2494   XMMRegister wsrc = src;
2495   XMMRegister wdst = xmm_0;
2496   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2497 
2498   int vlen_enc = Assembler::AVX_128bit;
2499   if (vlen == 16) {
2500     vlen_enc = Assembler::AVX_256bit;
2501   }
2502 
2503   for (int i = log2(vlen) - 1; i >=0; i--) {
2504     if (i == 0 && !is_dst_valid) {
2505       wdst = dst;
2506     }
2507     if (i == 3) {
2508       vextracti64x4_high(wtmp, wsrc);
2509     } else if (i == 2) {
2510       vextracti128_high(wtmp, wsrc);
2511     } else { // i = [0,1]
2512       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2513     }
2514     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2515     wsrc = wdst;
2516     vlen_enc = Assembler::AVX_128bit;
2517   }
2518   if (is_dst_valid) {
2519     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2520   }
2521 }
2522 
2523 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2524                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2525                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2526   XMMRegister wsrc = src;
2527   XMMRegister wdst = xmm_0;
2528   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2529   int vlen_enc = Assembler::AVX_128bit;
2530   if (vlen == 8) {
2531     vlen_enc = Assembler::AVX_256bit;
2532   }
2533   for (int i = log2(vlen) - 1; i >=0; i--) {
2534     if (i == 0 && !is_dst_valid) {
2535       wdst = dst;
2536     }
2537     if (i == 1) {
2538       vextracti128_high(wtmp, wsrc);
2539     } else if (i == 2) {
2540       vextracti64x4_high(wtmp, wsrc);
2541     } else {
2542       assert(i == 0, "%d", i);
2543       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2544     }
2545     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2546     wsrc = wdst;
2547     vlen_enc = Assembler::AVX_128bit;
2548   }
2549   if (is_dst_valid) {
2550     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2551   }
2552 }
2553 
2554 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2555   switch (bt) {
2556     case T_BYTE:  pextrb(dst, src, idx); break;
2557     case T_SHORT: pextrw(dst, src, idx); break;
2558     case T_INT:   pextrd(dst, src, idx); break;
2559     case T_LONG:  pextrq(dst, src, idx); break;
2560 
2561     default:
2562       assert(false,"Should not reach here.");
2563       break;
2564   }
2565 }
2566 
2567 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2568   int esize =  type2aelembytes(typ);
2569   int elem_per_lane = 16/esize;
2570   int lane = elemindex / elem_per_lane;
2571   int eindex = elemindex % elem_per_lane;
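       // e.g. for T_INT (esize == 4): elem_per_lane == 4, so elemindex 5 gives lane 1, eindex 1.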
2572 
2573   if (lane >= 2) {
2574     assert(UseAVX > 2, "required");
2575     vextractf32x4(dst, src, lane & 3);
2576     return dst;
2577   } else if (lane > 0) {
2578     assert(UseAVX > 0, "required");
2579     vextractf128(dst, src, lane);
2580     return dst;
2581   } else {
2582     return src;
2583   }
2584 }
2585 
2586 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2587   if (typ == T_BYTE) {
2588     movsbl(dst, dst);
2589   } else if (typ == T_SHORT) {
2590     movswl(dst, dst);
2591   }
2592 }
2593 
2594 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2595   int esize =  type2aelembytes(typ);
2596   int elem_per_lane = 16/esize;
2597   int eindex = elemindex % elem_per_lane;
2598   assert(is_integral_type(typ),"required");
2599 
2600   if (eindex == 0) {
2601     if (typ == T_LONG) {
2602       movq(dst, src);
2603     } else {
2604       movdl(dst, src);
2605       movsxl(typ, dst);
2606     }
2607   } else {
2608     extract(typ, dst, src, eindex);
2609     movsxl(typ, dst);
2610   }
2611 }
2612 
2613 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2614   int esize =  type2aelembytes(typ);
2615   int elem_per_lane = 16/esize;
2616   int eindex = elemindex % elem_per_lane;
2617   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2618 
2619   if (eindex == 0) {
2620     movq(dst, src);
2621   } else {
2622     if (typ == T_FLOAT) {
2623       if (UseAVX == 0) {
2624         movdqu(dst, src);
2625         shufps(dst, dst, eindex);
2626       } else {
2627         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2628       }
2629     } else {
2630       if (UseAVX == 0) {
2631         movdqu(dst, src);
2632         psrldq(dst, eindex*esize);
2633       } else {
2634         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2635       }
2636       movq(dst, dst);
2637     }
2638   }
2639   // Zero upper bits
2640   if (typ == T_FLOAT) {
2641     if (UseAVX == 0) {
2642       assert(vtmp != xnoreg, "required.");
2643       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2644       pand(dst, vtmp);
2645     } else {
2646       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2647     }
2648   }
2649 }
2650 
2651 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2652   switch(typ) {
2653     case T_BYTE:
2654     case T_BOOLEAN:
2655       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2656       break;
2657     case T_SHORT:
2658     case T_CHAR:
2659       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2660       break;
2661     case T_INT:
2662     case T_FLOAT:
2663       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2664       break;
2665     case T_LONG:
2666     case T_DOUBLE:
2667       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2668       break;
2669     default:
2670       assert(false,"Should not reach here.");
2671       break;
2672   }
2673 }
2674 
2675 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2676   assert(rscratch != noreg || always_reachable(src2), "missing");
2677 
2678   switch(typ) {
2679     case T_BOOLEAN:
2680     case T_BYTE:
2681       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2682       break;
2683     case T_CHAR:
2684     case T_SHORT:
2685       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2686       break;
2687     case T_INT:
2688     case T_FLOAT:
2689       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2690       break;
2691     case T_LONG:
2692     case T_DOUBLE:
2693       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2694       break;
2695     default:
2696       assert(false,"Should not reach here.");
2697       break;
2698   }
2699 }
2700 
2701 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2702   switch(typ) {
2703     case T_BYTE:
2704       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2705       break;
2706     case T_SHORT:
2707       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2708       break;
2709     case T_INT:
2710     case T_FLOAT:
2711       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2712       break;
2713     case T_LONG:
2714     case T_DOUBLE:
2715       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2716       break;
2717     default:
2718       assert(false,"Should not reach here.");
2719       break;
2720   }
2721 }
2722 
2723 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2724   assert(vlen_in_bytes <= 32, "");
2725   int esize = type2aelembytes(bt);
2726   if (vlen_in_bytes == 32) {
2727     assert(vtmp == xnoreg, "required.");
2728     if (esize >= 4) {
2729       vtestps(src1, src2, AVX_256bit);
2730     } else {
2731       vptest(src1, src2, AVX_256bit);
2732     }
2733     return;
2734   }
2735   if (vlen_in_bytes < 16) {
2736     // Duplicate the lower part of src1 to fill the whole register;
2737     // src2 does not need this treatment.
2738     assert(vtmp != xnoreg, "required");
2739     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2740     pshufd(vtmp, src1, shuffle_imm);
2741   } else {
2742     assert(vtmp == xnoreg, "required");
2743     vtmp = src1;
2744   }
2745   if (esize >= 4 && VM_Version::supports_avx()) {
2746     vtestps(vtmp, src2, AVX_128bit);
2747   } else {
2748     ptest(vtmp, src2);
2749   }
2750 }
2751 
2752 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2753 #ifdef ASSERT
2754   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2755   bool is_bw_supported = VM_Version::supports_avx512bw();
2756   if (is_bw && !is_bw_supported) {
2757     assert(vlen_enc != Assembler::AVX_512bit, "required");
2758     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2759            "XMM register should be 0-15");
2760   }
2761 #endif // ASSERT
2762   switch (elem_bt) {
2763     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2764     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2765     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2766     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2767     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2768     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2769     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2770   }
2771 }
2772 
2773 #ifdef _LP64
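     // Broadcast a scalar held in a general purpose register to every lane of dst.
     // With AVX-512 (plus BW/VL where the element type and vector length require
     // them) the GPR-source EVEX broadcast forms are used directly; otherwise the
     // value is moved into the XMM register first and then broadcast with AVX2.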
2774 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2775   assert(UseAVX >= 2, "required");
2776   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2777   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2778   if ((UseAVX > 2) &&
2779       (!is_bw || VM_Version::supports_avx512bw()) &&
2780       (!is_vl || VM_Version::supports_avx512vl())) {
2781     switch (elem_bt) {
2782       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2783       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2784       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2785       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2786       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2787     }
2788   } else {
2789     assert(vlen_enc != Assembler::AVX_512bit, "required");
2790     assert((dst->encoding() < 16),"XMM register should be 0-15");
2791     switch (elem_bt) {
2792       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2793       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2794       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2795       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2796       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2797       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2798       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2799     }
2800   }
2801 }
2802 #endif
2803 
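     // Widen byte lanes in src to the requested element type: a sign-extending
     // move for the integral types, plus an int-to-FP conversion for T_FLOAT and
     // T_DOUBLE (the T_DOUBLE case widens bytes to ints at half the target vector
     // length before converting).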
2804 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2805   switch (to_elem_bt) {
2806     case T_SHORT:
2807       vpmovsxbw(dst, src, vlen_enc);
2808       break;
2809     case T_INT:
2810       vpmovsxbd(dst, src, vlen_enc);
2811       break;
2812     case T_FLOAT:
2813       vpmovsxbd(dst, src, vlen_enc);
2814       vcvtdq2ps(dst, dst, vlen_enc);
2815       break;
2816     case T_LONG:
2817       vpmovsxbq(dst, src, vlen_enc);
2818       break;
2819     case T_DOUBLE: {
2820       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2821       vpmovsxbd(dst, src, mid_vlen_enc);
2822       vcvtdq2pd(dst, dst, vlen_enc);
2823       break;
2824     }
2825     default:
2826       fatal("Unsupported type %s", type2name(to_elem_bt));
2827       break;
2828   }
2829 }
2830 
2831 //-------------------------------------------------------------------------------------------
2832 
2833 // IndexOf for constant substrings with size >= 8 chars
2834 // which don't need to be loaded through the stack.
2835 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2836                                          Register cnt1, Register cnt2,
2837                                          int int_cnt2,  Register result,
2838                                          XMMRegister vec, Register tmp,
2839                                          int ae) {
2840   ShortBranchVerifier sbv(this);
2841   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2842   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2843 
2844   // This method uses the pcmpestri instruction with bound registers
2845   //   inputs:
2846   //     xmm - substring
2847   //     rax - substring length (elements count)
2848   //     mem - scanned string
2849   //     rdx - string length (elements count)
2850   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2851   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2852   //   outputs:
2853   //     rcx - matched index in string
2854   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2855   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2856   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2857   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2858   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2859 
2860   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2861         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2862         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2863 
2864   // Note, inline_string_indexOf() generates checks:
2865   // if (substr.count > string.count) return -1;
2866   // if (substr.count == 0) return 0;
2867   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
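       // Illustrative sketch of the search (not the generated code itself):
       //   for (int i = 0; i <= cnt1 - cnt2; i++) {
       //     if (str1[i .. i+int_cnt2-1] == str2[0 .. int_cnt2-1]) return i;
       //   }
       //   return -1;
       // pcmpestri scans 16-byte chunks of the string for a candidate head match;
       // for substrings longer than one vector the candidate's tail is then
       // compared vector by vector, reloading and rescanning on a mismatch.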
2868 
2869   // Load substring.
2870   if (ae == StrIntrinsicNode::UL) {
2871     pmovzxbw(vec, Address(str2, 0));
2872   } else {
2873     movdqu(vec, Address(str2, 0));
2874   }
2875   movl(cnt2, int_cnt2);
2876   movptr(result, str1); // string addr
2877 
2878   if (int_cnt2 > stride) {
2879     jmpb(SCAN_TO_SUBSTR);
2880 
2881     // Reload substr for rescan; this code
2882     // is executed only for large substrings (> 8 chars)
2883     bind(RELOAD_SUBSTR);
2884     if (ae == StrIntrinsicNode::UL) {
2885       pmovzxbw(vec, Address(str2, 0));
2886     } else {
2887       movdqu(vec, Address(str2, 0));
2888     }
2889     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2890 
2891     bind(RELOAD_STR);
2892     // We came here after the beginning of the substring was
2893     // matched but the rest of it was not, so we need to search
2894     // again. Start from the next element after the previous match.
2895 
2896     // cnt2 is the number of remaining substring elements and
2897     // cnt1 is the number of remaining string elements when the cmp failed.
2898     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2899     subl(cnt1, cnt2);
2900     addl(cnt1, int_cnt2);
2901     movl(cnt2, int_cnt2); // Now restore cnt2
2902 
2903     decrementl(cnt1);     // Shift to next element
2904     cmpl(cnt1, cnt2);
2905     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2906 
2907     addptr(result, (1<<scale1));
2908 
2909   } // (int_cnt2 > 8)
2910 
2911   // Scan string for start of substr in 16-byte vectors
2912   bind(SCAN_TO_SUBSTR);
2913   pcmpestri(vec, Address(result, 0), mode);
2914   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2915   subl(cnt1, stride);
2916   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2917   cmpl(cnt1, cnt2);
2918   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2919   addptr(result, 16);
2920   jmpb(SCAN_TO_SUBSTR);
2921 
2922   // Found a potential substr
2923   bind(FOUND_CANDIDATE);
2924   // Matched whole vector if first element matched (tmp(rcx) == 0).
2925   if (int_cnt2 == stride) {
2926     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2927   } else { // int_cnt2 > 8
2928     jccb(Assembler::overflow, FOUND_SUBSTR);
2929   }
2930   // After pcmpestri tmp(rcx) contains matched element index
2931   // Compute start addr of substr
2932   lea(result, Address(result, tmp, scale1));
2933 
2934   // Make sure string is still long enough
2935   subl(cnt1, tmp);
2936   cmpl(cnt1, cnt2);
2937   if (int_cnt2 == stride) {
2938     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2939   } else { // int_cnt2 > 8
2940     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2941   }
2942   // Left less than substring.
2943 
2944   bind(RET_NOT_FOUND);
2945   movl(result, -1);
2946   jmp(EXIT);
2947 
2948   if (int_cnt2 > stride) {
2949     // This code is optimized for the case when whole substring
2950     // is matched if its head is matched.
2951     bind(MATCH_SUBSTR_HEAD);
2952     pcmpestri(vec, Address(result, 0), mode);
2953     // Reload only the string if it does not match
2954     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2955 
2956     Label CONT_SCAN_SUBSTR;
2957     // Compare the rest of substring (> 8 chars).
2958     bind(FOUND_SUBSTR);
2959     // First 8 chars are already matched.
2960     negptr(cnt2);
2961     addptr(cnt2, stride);
2962 
2963     bind(SCAN_SUBSTR);
2964     subl(cnt1, stride);
2965     cmpl(cnt2, -stride); // Do not read beyond substring
2966     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2967     // Back-up strings to avoid reading beyond substring:
2968     // cnt1 = cnt1 - cnt2 + 8
2969     addl(cnt1, cnt2); // cnt2 is negative
2970     addl(cnt1, stride);
2971     movl(cnt2, stride); negptr(cnt2);
2972     bind(CONT_SCAN_SUBSTR);
2973     if (int_cnt2 < (int)G) {
2974       int tail_off1 = int_cnt2<<scale1;
2975       int tail_off2 = int_cnt2<<scale2;
2976       if (ae == StrIntrinsicNode::UL) {
2977         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2978       } else {
2979         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2980       }
2981       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2982     } else {
2983       // calculate index in register to avoid integer overflow (int_cnt2*2)
2984       movl(tmp, int_cnt2);
2985       addptr(tmp, cnt2);
2986       if (ae == StrIntrinsicNode::UL) {
2987         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2988       } else {
2989         movdqu(vec, Address(str2, tmp, scale2, 0));
2990       }
2991       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2992     }
2993     // Need to reload the string pointers if the whole vector did not match
2994     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2995     addptr(cnt2, stride);
2996     jcc(Assembler::negative, SCAN_SUBSTR);
2997     // Fall through if found full substring
2998 
2999   } // (int_cnt2 > 8)
3000 
3001   bind(RET_FOUND);
3002   // Found result if we matched full small substring.
3003   // Compute substr offset
3004   subptr(result, str1);
3005   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3006     shrl(result, 1); // index
3007   }
3008   bind(EXIT);
3009 
3010 } // string_indexofC8
3011 
3012 // Small strings are loaded through the stack if they cross a page boundary.
3013 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3014                                        Register cnt1, Register cnt2,
3015                                        int int_cnt2,  Register result,
3016                                        XMMRegister vec, Register tmp,
3017                                        int ae) {
3018   ShortBranchVerifier sbv(this);
3019   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3020   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3021 
3022   //
3023   // int_cnt2 is the length of a small (< 8 chars) constant substring
3024   // or (-1) for a non-constant substring, in which case its length
3025   // is in the cnt2 register.
3026   //
3027   // Note, inline_string_indexOf() generates checks:
3028   // if (substr.count > string.count) return -1;
3029   // if (substr.count == 0) return 0;
3030   //
3031   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3032   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3033   // This method uses the pcmpestri instruction with bound registers
3034   //   inputs:
3035   //     xmm - substring
3036   //     rax - substring length (elements count)
3037   //     mem - scanned string
3038   //     rdx - string length (elements count)
3039   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3040   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3041   //   outputs:
3042   //     rcx - matched index in string
3043   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3044   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3045   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3046   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
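       // Illustrative sketch (not the generated code): both the non-constant and
       // the small-constant cases reduce to
       //   for (int i = 0; i <= cnt1 - cnt2; i++) {
       //     if (str1[i .. i+cnt2-1] == str2[0 .. cnt2-1]) return i;
       //   }
       //   return -1;
       // with the extra twist that strings shorter than one 16-byte vector are
       // first copied to the stack whenever reading 16 bytes in place could cross
       // a page boundary.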
3047 
3048   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3049         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3050         FOUND_CANDIDATE;
3051 
3052   { //========================================================
3053     // We don't know where these strings are located
3054     // and we can't read beyond them. Load them through the stack.
3055     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3056 
3057     movptr(tmp, rsp); // save old SP
3058 
3059     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3060       if (int_cnt2 == (1>>scale2)) { // One byte
3061         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3062         load_unsigned_byte(result, Address(str2, 0));
3063         movdl(vec, result); // move 32 bits
3064       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3065         // Not enough header space in 32-bit VM: 12+3 = 15.
3066         movl(result, Address(str2, -1));
3067         shrl(result, 8);
3068         movdl(vec, result); // move 32 bits
3069       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3070         load_unsigned_short(result, Address(str2, 0));
3071         movdl(vec, result); // move 32 bits
3072       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3073         movdl(vec, Address(str2, 0)); // move 32 bits
3074       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3075         movq(vec, Address(str2, 0));  // move 64 bits
3076       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3077         // Array header size is 12 bytes in 32-bit VM
3078         // + 6 bytes for 3 chars == 18 bytes,
3079         // enough space to load vec and shift.
3080         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3081         if (ae == StrIntrinsicNode::UL) {
3082           int tail_off = int_cnt2-8;
3083           pmovzxbw(vec, Address(str2, tail_off));
3084           psrldq(vec, -2*tail_off);
3085         }
3086         else {
3087           int tail_off = int_cnt2*(1<<scale2);
3088           movdqu(vec, Address(str2, tail_off-16));
3089           psrldq(vec, 16-tail_off);
3090         }
3091       }
3092     } else { // not constant substring
3093       cmpl(cnt2, stride);
3094       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3095 
3096       // We can read beyond the string if str2+16 does not cross a page boundary
3097       // since heaps are aligned and mapped by pages.
3098       assert(os::vm_page_size() < (int)G, "default page should be small");
3099       movl(result, str2); // We need only low 32 bits
3100       andl(result, ((int)os::vm_page_size()-1));
3101       cmpl(result, ((int)os::vm_page_size()-16));
3102       jccb(Assembler::belowEqual, CHECK_STR);
3103 
3104       // Move small strings to the stack so we can load 16 bytes into vec.
3105       subptr(rsp, 16);
3106       int stk_offset = wordSize-(1<<scale2);
3107       push(cnt2);
3108 
3109       bind(COPY_SUBSTR);
3110       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3111         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3112         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3113       } else if (ae == StrIntrinsicNode::UU) {
3114         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3115         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3116       }
3117       decrement(cnt2);
3118       jccb(Assembler::notZero, COPY_SUBSTR);
3119 
3120       pop(cnt2);
3121       movptr(str2, rsp);  // New substring address
3122     } // non constant
3123 
3124     bind(CHECK_STR);
3125     cmpl(cnt1, stride);
3126     jccb(Assembler::aboveEqual, BIG_STRINGS);
3127 
3128     // Check cross page boundary.
3129     movl(result, str1); // We need only low 32 bits
3130     andl(result, ((int)os::vm_page_size()-1));
3131     cmpl(result, ((int)os::vm_page_size()-16));
3132     jccb(Assembler::belowEqual, BIG_STRINGS);
3133 
3134     subptr(rsp, 16);
3135     int stk_offset = -(1<<scale1);
3136     if (int_cnt2 < 0) { // not constant
3137       push(cnt2);
3138       stk_offset += wordSize;
3139     }
3140     movl(cnt2, cnt1);
3141 
3142     bind(COPY_STR);
3143     if (ae == StrIntrinsicNode::LL) {
3144       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3145       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3146     } else {
3147       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3148       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3149     }
3150     decrement(cnt2);
3151     jccb(Assembler::notZero, COPY_STR);
3152 
3153     if (int_cnt2 < 0) { // not constant
3154       pop(cnt2);
3155     }
3156     movptr(str1, rsp);  // New string address
3157 
3158     bind(BIG_STRINGS);
3159     // Load substring.
3160     if (int_cnt2 < 0) { // -1
3161       if (ae == StrIntrinsicNode::UL) {
3162         pmovzxbw(vec, Address(str2, 0));
3163       } else {
3164         movdqu(vec, Address(str2, 0));
3165       }
3166       push(cnt2);       // substr count
3167       push(str2);       // substr addr
3168       push(str1);       // string addr
3169     } else {
3170       // Small (< 8 chars) constant substrings are loaded already.
3171       movl(cnt2, int_cnt2);
3172     }
3173     push(tmp);  // original SP
3174 
3175   } // Finished loading
3176 
3177   //========================================================
3178   // Start search
3179   //
3180 
3181   movptr(result, str1); // string addr
3182 
3183   if (int_cnt2  < 0) {  // Only for non constant substring
3184     jmpb(SCAN_TO_SUBSTR);
3185 
3186     // SP saved at sp+0
3187     // String saved at sp+1*wordSize
3188     // Substr saved at sp+2*wordSize
3189     // Substr count saved at sp+3*wordSize
3190 
3191     // Reload substr for rescan; this code
3192     // is executed only for large substrings (> 8 chars)
3193     bind(RELOAD_SUBSTR);
3194     movptr(str2, Address(rsp, 2*wordSize));
3195     movl(cnt2, Address(rsp, 3*wordSize));
3196     if (ae == StrIntrinsicNode::UL) {
3197       pmovzxbw(vec, Address(str2, 0));
3198     } else {
3199       movdqu(vec, Address(str2, 0));
3200     }
3201     // We came here after the beginning of the substring was
3202     // matched but the rest of it was not, so we need to search
3203     // again. Start from the next element after the previous match.
3204     subptr(str1, result); // Restore counter
3205     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3206       shrl(str1, 1);
3207     }
3208     addl(cnt1, str1);
3209     decrementl(cnt1);   // Shift to next element
3210     cmpl(cnt1, cnt2);
3211     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3212 
3213     addptr(result, (1<<scale1));
3214   } // non constant
3215 
3216   // Scan string for start of substr in 16-byte vectors
3217   bind(SCAN_TO_SUBSTR);
3218   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3219   pcmpestri(vec, Address(result, 0), mode);
3220   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3221   subl(cnt1, stride);
3222   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3223   cmpl(cnt1, cnt2);
3224   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3225   addptr(result, 16);
3226 
3227   bind(ADJUST_STR);
3228   cmpl(cnt1, stride); // Do not read beyond string
3229   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3230   // Back-up string to avoid reading beyond string.
3231   lea(result, Address(result, cnt1, scale1, -16));
3232   movl(cnt1, stride);
3233   jmpb(SCAN_TO_SUBSTR);
3234 
3235   // Found a potential substr
3236   bind(FOUND_CANDIDATE);
3237   // After pcmpestri tmp(rcx) contains matched element index
3238 
3239   // Make sure string is still long enough
3240   subl(cnt1, tmp);
3241   cmpl(cnt1, cnt2);
3242   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3243   // Left less than substring.
3244 
3245   bind(RET_NOT_FOUND);
3246   movl(result, -1);
3247   jmp(CLEANUP);
3248 
3249   bind(FOUND_SUBSTR);
3250   // Compute start addr of substr
3251   lea(result, Address(result, tmp, scale1));
3252   if (int_cnt2 > 0) { // Constant substring
3253     // Repeat search for small substring (< 8 chars)
3254     // from new point without reloading substring.
3255     // Have to check that we don't read beyond string.
3256     cmpl(tmp, stride-int_cnt2);
3257     jccb(Assembler::greater, ADJUST_STR);
3258     // Fall through if matched whole substring.
3259   } else { // non constant
3260     assert(int_cnt2 == -1, "should be != 0");
3261 
3262     addl(tmp, cnt2);
3263     // Found result if we matched whole substring.
3264     cmpl(tmp, stride);
3265     jcc(Assembler::lessEqual, RET_FOUND);
3266 
3267     // Repeat search for small substring (<= 8 chars)
3268     // from new point 'str1' without reloading substring.
3269     cmpl(cnt2, stride);
3270     // Have to check that we don't read beyond string.
3271     jccb(Assembler::lessEqual, ADJUST_STR);
3272 
3273     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3274     // Compare the rest of substring (> 8 chars).
3275     movptr(str1, result);
3276 
3277     cmpl(tmp, cnt2);
3278     // First 8 chars are already matched.
3279     jccb(Assembler::equal, CHECK_NEXT);
3280 
3281     bind(SCAN_SUBSTR);
3282     pcmpestri(vec, Address(str1, 0), mode);
3283     // Need to reload the string pointers if the whole vector did not match
3284     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3285 
3286     bind(CHECK_NEXT);
3287     subl(cnt2, stride);
3288     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3289     addptr(str1, 16);
3290     if (ae == StrIntrinsicNode::UL) {
3291       addptr(str2, 8);
3292     } else {
3293       addptr(str2, 16);
3294     }
3295     subl(cnt1, stride);
3296     cmpl(cnt2, stride); // Do not read beyond substring
3297     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3298     // Back-up strings to avoid reading beyond substring.
3299 
3300     if (ae == StrIntrinsicNode::UL) {
3301       lea(str2, Address(str2, cnt2, scale2, -8));
3302       lea(str1, Address(str1, cnt2, scale1, -16));
3303     } else {
3304       lea(str2, Address(str2, cnt2, scale2, -16));
3305       lea(str1, Address(str1, cnt2, scale1, -16));
3306     }
3307     subl(cnt1, cnt2);
3308     movl(cnt2, stride);
3309     addl(cnt1, stride);
3310     bind(CONT_SCAN_SUBSTR);
3311     if (ae == StrIntrinsicNode::UL) {
3312       pmovzxbw(vec, Address(str2, 0));
3313     } else {
3314       movdqu(vec, Address(str2, 0));
3315     }
3316     jmp(SCAN_SUBSTR);
3317 
3318     bind(RET_FOUND_LONG);
3319     movptr(str1, Address(rsp, wordSize));
3320   } // non constant
3321 
3322   bind(RET_FOUND);
3323   // Compute substr offset
3324   subptr(result, str1);
3325   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3326     shrl(result, 1); // index
3327   }
3328   bind(CLEANUP);
3329   pop(rsp); // restore SP
3330 
3331 } // string_indexof
3332 
3333 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3334                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3335   ShortBranchVerifier sbv(this);
3336   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3337 
3338   int stride = 8;
3339 
3340   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3341         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3342         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3343         FOUND_SEQ_CHAR, DONE_LABEL;
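       // Equivalent scalar logic (sketch only), over a UTF-16 char[]:
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;
       // The vector loops below compare 16 (AVX2) or 8 (SSE) chars at a time and
       // fall back to a scalar tail loop for the remainder.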
3344 
3345   movptr(result, str1);
3346   if (UseAVX >= 2) {
3347     cmpl(cnt1, stride);
3348     jcc(Assembler::less, SCAN_TO_CHAR);
3349     cmpl(cnt1, 2*stride);
3350     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3351     movdl(vec1, ch);
3352     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3353     vpxor(vec2, vec2);
3354     movl(tmp, cnt1);
3355     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3356     andl(cnt1,0x0000000F);  //tail count (in chars)
3357 
3358     bind(SCAN_TO_16_CHAR_LOOP);
3359     vmovdqu(vec3, Address(result, 0));
3360     vpcmpeqw(vec3, vec3, vec1, 1);
3361     vptest(vec2, vec3);
3362     jcc(Assembler::carryClear, FOUND_CHAR);
3363     addptr(result, 32);
3364     subl(tmp, 2*stride);
3365     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3366     jmp(SCAN_TO_8_CHAR);
3367     bind(SCAN_TO_8_CHAR_INIT);
3368     movdl(vec1, ch);
3369     pshuflw(vec1, vec1, 0x00);
3370     pshufd(vec1, vec1, 0);
3371     pxor(vec2, vec2);
3372   }
3373   bind(SCAN_TO_8_CHAR);
3374   cmpl(cnt1, stride);
3375   jcc(Assembler::less, SCAN_TO_CHAR);
3376   if (UseAVX < 2) {
3377     movdl(vec1, ch);
3378     pshuflw(vec1, vec1, 0x00);
3379     pshufd(vec1, vec1, 0);
3380     pxor(vec2, vec2);
3381   }
3382   movl(tmp, cnt1);
3383   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3384   andl(cnt1,0x00000007);  //tail count (in chars)
3385 
3386   bind(SCAN_TO_8_CHAR_LOOP);
3387   movdqu(vec3, Address(result, 0));
3388   pcmpeqw(vec3, vec1);
3389   ptest(vec2, vec3);
3390   jcc(Assembler::carryClear, FOUND_CHAR);
3391   addptr(result, 16);
3392   subl(tmp, stride);
3393   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3394   bind(SCAN_TO_CHAR);
3395   testl(cnt1, cnt1);
3396   jcc(Assembler::zero, RET_NOT_FOUND);
3397   bind(SCAN_TO_CHAR_LOOP);
3398   load_unsigned_short(tmp, Address(result, 0));
3399   cmpl(ch, tmp);
3400   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3401   addptr(result, 2);
3402   subl(cnt1, 1);
3403   jccb(Assembler::zero, RET_NOT_FOUND);
3404   jmp(SCAN_TO_CHAR_LOOP);
3405 
3406   bind(RET_NOT_FOUND);
3407   movl(result, -1);
3408   jmpb(DONE_LABEL);
3409 
3410   bind(FOUND_CHAR);
3411   if (UseAVX >= 2) {
3412     vpmovmskb(tmp, vec3);
3413   } else {
3414     pmovmskb(tmp, vec3);
3415   }
3416   bsfl(ch, tmp);
3417   addptr(result, ch);
3418 
3419   bind(FOUND_SEQ_CHAR);
3420   subptr(result, str1);
3421   shrl(result, 1);
3422 
3423   bind(DONE_LABEL);
3424 } // string_indexof_char
3425 
3426 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3427                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3428   ShortBranchVerifier sbv(this);
3429   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3430 
3431   int stride = 16;
3432 
3433   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3434         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3435         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3436         FOUND_SEQ_CHAR, DONE_LABEL;
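       // Equivalent scalar logic (sketch only), over a Latin-1 byte[]:
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;
       // The vector loops below compare 32 (AVX2) or 16 (SSE) bytes at a time and
       // fall back to a scalar tail loop for the remainder.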
3437 
3438   movptr(result, str1);
3439   if (UseAVX >= 2) {
3440     cmpl(cnt1, stride);
3441     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3442     cmpl(cnt1, stride*2);
3443     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3444     movdl(vec1, ch);
3445     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3446     vpxor(vec2, vec2);
3447     movl(tmp, cnt1);
3448     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3449     andl(cnt1,0x0000001F);  //tail count (in chars)
3450 
3451     bind(SCAN_TO_32_CHAR_LOOP);
3452     vmovdqu(vec3, Address(result, 0));
3453     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3454     vptest(vec2, vec3);
3455     jcc(Assembler::carryClear, FOUND_CHAR);
3456     addptr(result, 32);
3457     subl(tmp, stride*2);
3458     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3459     jmp(SCAN_TO_16_CHAR);
3460 
3461     bind(SCAN_TO_16_CHAR_INIT);
3462     movdl(vec1, ch);
3463     pxor(vec2, vec2);
3464     pshufb(vec1, vec2);
3465   }
3466 
3467   bind(SCAN_TO_16_CHAR);
3468   cmpl(cnt1, stride);
3469   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3470   if (UseAVX < 2) {
3471     movdl(vec1, ch);
3472     pxor(vec2, vec2);
3473     pshufb(vec1, vec2);
3474   }
3475   movl(tmp, cnt1);
3476   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3477   andl(cnt1,0x0000000F);  //tail count (in bytes)
3478 
3479   bind(SCAN_TO_16_CHAR_LOOP);
3480   movdqu(vec3, Address(result, 0));
3481   pcmpeqb(vec3, vec1);
3482   ptest(vec2, vec3);
3483   jcc(Assembler::carryClear, FOUND_CHAR);
3484   addptr(result, 16);
3485   subl(tmp, stride);
3486   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3487 
3488   bind(SCAN_TO_CHAR_INIT);
3489   testl(cnt1, cnt1);
3490   jcc(Assembler::zero, RET_NOT_FOUND);
3491   bind(SCAN_TO_CHAR_LOOP);
3492   load_unsigned_byte(tmp, Address(result, 0));
3493   cmpl(ch, tmp);
3494   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3495   addptr(result, 1);
3496   subl(cnt1, 1);
3497   jccb(Assembler::zero, RET_NOT_FOUND);
3498   jmp(SCAN_TO_CHAR_LOOP);
3499 
3500   bind(RET_NOT_FOUND);
3501   movl(result, -1);
3502   jmpb(DONE_LABEL);
3503 
3504   bind(FOUND_CHAR);
3505   if (UseAVX >= 2) {
3506     vpmovmskb(tmp, vec3);
3507   } else {
3508     pmovmskb(tmp, vec3);
3509   }
3510   bsfl(ch, tmp);
3511   addptr(result, ch);
3512 
3513   bind(FOUND_SEQ_CHAR);
3514   subptr(result, str1);
3515 
3516   bind(DONE_LABEL);
3517 } // stringL_indexof_char
3518 
3519 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3520   switch (eltype) {
3521   case T_BOOLEAN: return sizeof(jboolean);
3522   case T_BYTE:  return sizeof(jbyte);
3523   case T_SHORT: return sizeof(jshort);
3524   case T_CHAR:  return sizeof(jchar);
3525   case T_INT:   return sizeof(jint);
3526   default:
3527     ShouldNotReachHere();
3528     return -1;
3529   }
3530 }
3531 
3532 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3533   switch (eltype) {
3534   // T_BOOLEAN used as surrogate for unsigned byte
3535   case T_BOOLEAN: movzbl(dst, src);   break;
3536   case T_BYTE:    movsbl(dst, src);   break;
3537   case T_SHORT:   movswl(dst, src);   break;
3538   case T_CHAR:    movzwl(dst, src);   break;
3539   case T_INT:     movl(dst, src);     break;
3540   default:
3541     ShouldNotReachHere();
3542   }
3543 }
3544 
3545 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3546   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3547 }
3548 
3549 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3550   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3551 }
3552 
3553 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3554   const int vlen = Assembler::AVX_256bit;
3555   switch (eltype) {
3556   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3557   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3558   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3559   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3560   case T_INT:
3561     // do nothing
3562     break;
3563   default:
3564     ShouldNotReachHere();
3565   }
3566 }
3567 
3568 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3569                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3570                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3571                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3572                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3573                                         BasicType eltype) {
3574   ShortBranchVerifier sbv(this);
3575   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3576   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3577   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3578 
3579   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3580         SHORT_UNROLLED_LOOP_EXIT,
3581         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3582         UNROLLED_VECTOR_LOOP_BEGIN,
3583         END;
3584   switch (eltype) {
3585   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3586   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3587   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3588   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3589   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3590   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3591   }
3592 
3593   // Register "renaming" for readability of the code
3594   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3595                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3596                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3597 
3598   const int elsize = arrays_hashcode_elsize(eltype);
3599 
3600   /*
3601     if (cnt1 >= 2) {
3602       if (cnt1 >= 32) {
3603         UNROLLED VECTOR LOOP
3604       }
3605       UNROLLED SCALAR LOOP
3606     }
3607     SINGLE SCALAR
3608    */
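       // A note on the math: this is the usual polynomial hash
       //   h = h0*31^n + a[0]*31^(n-1) + ... + a[n-1]
       // evaluated with a stride of 32. Each vector iteration multiplies the
       // running scalar result and every vector lane by power_of_31_backwards[0]
       // (31^32 for a 32-element stride) before accumulating the next 32 elements;
       // after the loop each lane is scaled by its remaining coefficient,
       // power_of_31_backwards[1..32], and the lanes are reduce-added into result.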
3609 
3610   cmpl(cnt1, 32);
3611   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3612 
3613   // cnt1 >= 32 && generate_vectorized_loop
3614   xorl(index, index);
3615 
3616   // vresult = IntVector.zero(I256);
3617   for (int idx = 0; idx < 4; idx++) {
3618     vpxor(vresult[idx], vresult[idx]);
3619   }
3620   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3621   Register bound = tmp2;
3622   Register next = tmp3;
3623   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3624   movl(next, Address(tmp2, 0));
3625   movdl(vnext, next);
3626   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3627 
3628   // index = 0;
3629   // bound = cnt1 & ~(32 - 1);
3630   movl(bound, cnt1);
3631   andl(bound, ~(32 - 1));
3632   // for (; index < bound; index += 32) {
3633   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3634   // result *= next;
3635   imull(result, next);
3636   // Loop fission to front-load the cost of fetching from memory; OOO execution
3637   // can then hopefully do a better job of prefetching.
3638   for (int idx = 0; idx < 4; idx++) {
3639     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3640   }
3641   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3642   for (int idx = 0; idx < 4; idx++) {
3643     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3644     arrays_hashcode_elvcast(vtmp[idx], eltype);
3645     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3646   }
3647   // index += 32;
3648   addl(index, 32);
3649   // index < bound;
3650   cmpl(index, bound);
3651   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3652   // }
3653 
3654   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3655   subl(cnt1, bound);
3656   // release bound
3657 
3658   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3659   for (int idx = 0; idx < 4; idx++) {
3660     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3661     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3662     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3663   }
3664   // result += vresult.reduceLanes(ADD);
3665   for (int idx = 0; idx < 4; idx++) {
3666     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3667   }
3668 
3669   // } else if (cnt1 < 32) {
3670 
3671   bind(SHORT_UNROLLED_BEGIN);
3672   // int i = 1;
3673   movl(index, 1);
3674   cmpl(index, cnt1);
3675   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3676 
3677   // for (; i < cnt1 ; i += 2) {
3678   bind(SHORT_UNROLLED_LOOP_BEGIN);
3679   movl(tmp3, 961);
3680   imull(result, tmp3);
3681   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3682   movl(tmp3, tmp2);
3683   shll(tmp3, 5);
3684   subl(tmp3, tmp2);
3685   addl(result, tmp3);
3686   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3687   addl(result, tmp3);
3688   addl(index, 2);
3689   cmpl(index, cnt1);
3690   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3691 
3692   // }
3693   // if (i >= cnt1) {
3694   bind(SHORT_UNROLLED_LOOP_EXIT);
3695   jccb(Assembler::greater, END);
3696   movl(tmp2, result);
3697   shll(result, 5);
3698   subl(result, tmp2);
3699   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3700   addl(result, tmp3);
3701   // }
3702   bind(END);
3703 
3704   BLOCK_COMMENT("} // arrays_hashcode");
3705 
3706 } // arrays_hashcode
3707 
3708 // helper function for string_compare
3709 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3710                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3711                                            Address::ScaleFactor scale2, Register index, int ae) {
3712   if (ae == StrIntrinsicNode::LL) {
3713     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3714     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3715   } else if (ae == StrIntrinsicNode::UU) {
3716     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3717     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3718   } else {
3719     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3720     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3721   }
3722 }
3723 
3724 // Compare strings, used for char[] and byte[].
3725 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3726                                        Register cnt1, Register cnt2, Register result,
3727                                        XMMRegister vec1, int ae, KRegister mask) {
3728   ShortBranchVerifier sbv(this);
3729   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3730   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3731   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3732   int stride2x2 = 0x40;
3733   Address::ScaleFactor scale = Address::no_scale;
3734   Address::ScaleFactor scale1 = Address::no_scale;
3735   Address::ScaleFactor scale2 = Address::no_scale;
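       // Equivalent scalar logic (sketch only):
       //   int lim = min(cnt1, cnt2);
       //   for (int k = 0; k < lim; k++) {
       //     if (str1[k] != str2[k]) return str1[k] - str2[k];
       //   }
       //   return cnt1 - cnt2;   // length difference
       // In the mixed-encoding cases the Latin-1 string is read through str1 and
       // the UTF-16 string through str2 (see load_next_elements); the result is
       // negated at DONE_LABEL for UL, presumably because the caller passes the
       // operands swapped for that encoding.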
3736 
3737   if (ae != StrIntrinsicNode::LL) {
3738     stride2x2 = 0x20;
3739   }
3740 
3741   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3742     shrl(cnt2, 1);
3743   }
3744   // Compute the minimum of the string lengths and the
3745   // difference of the string lengths (stack).
3746   // Do the conditional move stuff
3747   movl(result, cnt1);
3748   subl(cnt1, cnt2);
3749   push(cnt1);
3750   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3751 
3752   // Is the minimum length zero?
3753   testl(cnt2, cnt2);
3754   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3755   if (ae == StrIntrinsicNode::LL) {
3756     // Load first bytes
3757     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3758     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3759   } else if (ae == StrIntrinsicNode::UU) {
3760     // Load first characters
3761     load_unsigned_short(result, Address(str1, 0));
3762     load_unsigned_short(cnt1, Address(str2, 0));
3763   } else {
3764     load_unsigned_byte(result, Address(str1, 0));
3765     load_unsigned_short(cnt1, Address(str2, 0));
3766   }
3767   subl(result, cnt1);
3768   jcc(Assembler::notZero,  POP_LABEL);
3769 
3770   if (ae == StrIntrinsicNode::UU) {
3771     // Divide length by 2 to get number of chars
3772     shrl(cnt2, 1);
3773   }
3774   cmpl(cnt2, 1);
3775   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3776 
3777   // Check if the strings start at the same location and set up scale and stride
3778   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3779     cmpptr(str1, str2);
3780     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3781     if (ae == StrIntrinsicNode::LL) {
3782       scale = Address::times_1;
3783       stride = 16;
3784     } else {
3785       scale = Address::times_2;
3786       stride = 8;
3787     }
3788   } else {
3789     scale1 = Address::times_1;
3790     scale2 = Address::times_2;
3791     // scale not used
3792     stride = 8;
3793   }
3794 
3795   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3796     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3797     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3798     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3799     Label COMPARE_TAIL_LONG;
3800     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3801 
3802     int pcmpmask = 0x19;
3803     if (ae == StrIntrinsicNode::LL) {
3804       pcmpmask &= ~0x01;
3805     }
3806 
3807     // Set up to compare 16-char (32-byte) vectors,
3808     // starting from the first character again because it has an aligned address.
3809     if (ae == StrIntrinsicNode::LL) {
3810       stride2 = 32;
3811     } else {
3812       stride2 = 16;
3813     }
3814     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3815       adr_stride = stride << scale;
3816     } else {
3817       adr_stride1 = 8;  //stride << scale1;
3818       adr_stride2 = 16; //stride << scale2;
3819     }
3820 
3821     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3822     // rax and rdx are used by pcmpestri as element counters
3823     movl(result, cnt2);
3824     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3825     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3826 
3827     // fast path : compare first 2 8-char vectors.
3828     bind(COMPARE_16_CHARS);
3829     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3830       movdqu(vec1, Address(str1, 0));
3831     } else {
3832       pmovzxbw(vec1, Address(str1, 0));
3833     }
3834     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3835     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3836 
3837     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838       movdqu(vec1, Address(str1, adr_stride));
3839       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3840     } else {
3841       pmovzxbw(vec1, Address(str1, adr_stride1));
3842       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3843     }
3844     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3845     addl(cnt1, stride);
3846 
3847     // Compare the characters at index in cnt1
3848     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3849     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3850     subl(result, cnt2);
3851     jmp(POP_LABEL);
3852 
3853     // Set up the registers to start the vector comparison loop
3854     bind(COMPARE_WIDE_VECTORS);
3855     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3856       lea(str1, Address(str1, result, scale));
3857       lea(str2, Address(str2, result, scale));
3858     } else {
3859       lea(str1, Address(str1, result, scale1));
3860       lea(str2, Address(str2, result, scale2));
3861     }
3862     subl(result, stride2);
3863     subl(cnt2, stride2);
3864     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3865     negptr(result);
3866 
3867     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3868     bind(COMPARE_WIDE_VECTORS_LOOP);
3869 
3870 #ifdef _LP64
3871     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3872       cmpl(cnt2, stride2x2);
3873       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3874       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3875       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3876 
3877       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3878       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3879         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3880         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3881       } else {
3882         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3883         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3884       }
3885       kortestql(mask, mask);
3886       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3887       addptr(result, stride2x2);  // update since we already compared at this addr
3888       subl(cnt2, stride2x2);      // and sub the size too
3889       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3890 
3891       vpxor(vec1, vec1);
3892       jmpb(COMPARE_WIDE_TAIL);
3893     }//if (VM_Version::supports_avx512vlbw())
3894 #endif // _LP64
3895 
3896 
3897     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3898     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3899       vmovdqu(vec1, Address(str1, result, scale));
3900       vpxor(vec1, Address(str2, result, scale));
3901     } else {
3902       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3903       vpxor(vec1, Address(str2, result, scale2));
3904     }
3905     vptest(vec1, vec1);
3906     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3907     addptr(result, stride2);
3908     subl(cnt2, stride2);
3909     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3910     // clean upper bits of YMM registers
3911     vpxor(vec1, vec1);
3912 
3913     // compare wide vectors tail
3914     bind(COMPARE_WIDE_TAIL);
3915     testptr(result, result);
3916     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3917 
3918     movl(result, stride2);
3919     movl(cnt2, result);
3920     negptr(result);
3921     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3922 
3923     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3924     bind(VECTOR_NOT_EQUAL);
3925     // clean upper bits of YMM registers
3926     vpxor(vec1, vec1);
3927     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3928       lea(str1, Address(str1, result, scale));
3929       lea(str2, Address(str2, result, scale));
3930     } else {
3931       lea(str1, Address(str1, result, scale1));
3932       lea(str2, Address(str2, result, scale2));
3933     }
3934     jmp(COMPARE_16_CHARS);
3935 
3936     // Compare tail chars, length between 1 and 15 chars
3937     bind(COMPARE_TAIL_LONG);
3938     movl(cnt2, result);
3939     cmpl(cnt2, stride);
3940     jcc(Assembler::less, COMPARE_SMALL_STR);
3941 
3942     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3943       movdqu(vec1, Address(str1, 0));
3944     } else {
3945       pmovzxbw(vec1, Address(str1, 0));
3946     }
3947     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3948     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3949     subptr(cnt2, stride);
3950     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3951     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3952       lea(str1, Address(str1, result, scale));
3953       lea(str2, Address(str2, result, scale));
3954     } else {
3955       lea(str1, Address(str1, result, scale1));
3956       lea(str2, Address(str2, result, scale2));
3957     }
3958     negptr(cnt2);
3959     jmpb(WHILE_HEAD_LABEL);
3960 
3961     bind(COMPARE_SMALL_STR);
3962   } else if (UseSSE42Intrinsics) {
3963     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3964     int pcmpmask = 0x19;
3965     // Set up to compare 8-char (16-byte) vectors,
3966     // starting from the first character again because it has an aligned address.
3967     movl(result, cnt2);
3968     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3969     if (ae == StrIntrinsicNode::LL) {
3970       pcmpmask &= ~0x01;
3971     }
3972     jcc(Assembler::zero, COMPARE_TAIL);
3973     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3974       lea(str1, Address(str1, result, scale));
3975       lea(str2, Address(str2, result, scale));
3976     } else {
3977       lea(str1, Address(str1, result, scale1));
3978       lea(str2, Address(str2, result, scale2));
3979     }
3980     negptr(result);
3981 
3982     // pcmpestri
3983     //   inputs:
3984     //     vec1- substring
3985     //     rax - negative string length (elements count)
3986     //     mem - scanned string
3987     //     rdx - string length (elements count)
3988     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3989     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3990     //   outputs:
3991     //     rcx - first mismatched element index
3992     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3993 
3994     bind(COMPARE_WIDE_VECTORS);
3995     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3996       movdqu(vec1, Address(str1, result, scale));
3997       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3998     } else {
3999       pmovzxbw(vec1, Address(str1, result, scale1));
4000       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4001     }
4002     // After pcmpestri cnt1(rcx) contains mismatched element index
4003 
4004     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4005     addptr(result, stride);
4006     subptr(cnt2, stride);
4007     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4008 
4009     // compare wide vectors tail
4010     testptr(result, result);
4011     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4012 
4013     movl(cnt2, stride);
4014     movl(result, stride);
4015     negptr(result);
4016     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4017       movdqu(vec1, Address(str1, result, scale));
4018       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4019     } else {
4020       pmovzxbw(vec1, Address(str1, result, scale1));
4021       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4022     }
4023     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4024 
4025     // Mismatched characters in the vectors
4026     bind(VECTOR_NOT_EQUAL);
4027     addptr(cnt1, result);
4028     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4029     subl(result, cnt2);
4030     jmpb(POP_LABEL);
4031 
4032     bind(COMPARE_TAIL); // limit is zero
4033     movl(cnt2, result);
4034     // Fallthru to tail compare
4035   }
4036   // Shift str2 and str1 to the end of the arrays, negate min
4037   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4038     lea(str1, Address(str1, cnt2, scale));
4039     lea(str2, Address(str2, cnt2, scale));
4040   } else {
4041     lea(str1, Address(str1, cnt2, scale1));
4042     lea(str2, Address(str2, cnt2, scale2));
4043   }
4044   decrementl(cnt2);  // first character was compared already
4045   negptr(cnt2);
4046 
4047   // Compare the rest of the elements
4048   bind(WHILE_HEAD_LABEL);
4049   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4050   subl(result, cnt1);
4051   jccb(Assembler::notZero, POP_LABEL);
4052   increment(cnt2);
4053   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4054 
4055   // Strings are equal up to min length.  Return the length difference.
4056   bind(LENGTH_DIFF_LABEL);
4057   pop(result);
4058   if (ae == StrIntrinsicNode::UU) {
4059     // Divide diff by 2 to get number of chars
4060     sarl(result, 1);
4061   }
4062   jmpb(DONE_LABEL);
4063 
4064 #ifdef _LP64
4065   if (VM_Version::supports_avx512vlbw()) {
4066 
4067     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4068 
4069     kmovql(cnt1, mask);
4070     notq(cnt1);
4071     bsfq(cnt2, cnt1);
4072     if (ae != StrIntrinsicNode::LL) {
4073       // Divide diff by 2 to get number of chars
4074       sarl(cnt2, 1);
4075     }
4076     addq(result, cnt2);
4077     if (ae == StrIntrinsicNode::LL) {
4078       load_unsigned_byte(cnt1, Address(str2, result));
4079       load_unsigned_byte(result, Address(str1, result));
4080     } else if (ae == StrIntrinsicNode::UU) {
4081       load_unsigned_short(cnt1, Address(str2, result, scale));
4082       load_unsigned_short(result, Address(str1, result, scale));
4083     } else {
4084       load_unsigned_short(cnt1, Address(str2, result, scale2));
4085       load_unsigned_byte(result, Address(str1, result, scale1));
4086     }
4087     subl(result, cnt1);
4088     jmpb(POP_LABEL);
4089   }//if (VM_Version::supports_avx512vlbw())
4090 #endif // _LP64
4091 
4092   // Discard the stored length difference
4093   bind(POP_LABEL);
4094   pop(cnt1);
4095 
4096   // That's it
4097   bind(DONE_LABEL);
4098   if(ae == StrIntrinsicNode::UL) {
4099     negl(result);
4100   }
4101 
4102 }
4103 
4104 // Search for Non-ASCII character (Negative byte value) in a byte array,
4105 // return the index of the first such character, otherwise the length
4106 // of the array segment searched.
4107 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4108 //   @IntrinsicCandidate
4109 //   public static int countPositives(byte[] ba, int off, int len) {
4110 //     for (int i = off; i < off + len; i++) {
4111 //       if (ba[i] < 0) {
4112 //         return i - off;
4113 //       }
4114 //     }
4115 //     return len;
4116 //   }
4117 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4118   Register result, Register tmp1,
4119   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4120   // rsi: byte array
4121   // rcx: len
4122   // rax: result
4123   ShortBranchVerifier sbv(this);
4124   assert_different_registers(ary1, len, result, tmp1);
4125   assert_different_registers(vec1, vec2);
4126   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4127 
4128   movl(result, len); // copy
4129   // len == 0
4130   testl(len, len);
4131   jcc(Assembler::zero, DONE);
4132 
4133   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4134     VM_Version::supports_avx512vlbw() &&
4135     VM_Version::supports_bmi2()) {
4136 
4137     Label test_64_loop, test_tail, BREAK_LOOP;
4138     movl(tmp1, len);
4139     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4140 
4141     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4142     andl(len,  0xffffffc0); // vector count (in chars)
4143     jccb(Assembler::zero, test_tail);
4144 
4145     lea(ary1, Address(ary1, len, Address::times_1));
4146     negptr(len);
4147 
4148     bind(test_64_loop);
4149     // Check whether our 64 byte-sized elements contain any negatives
4150     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4151     kortestql(mask1, mask1);
4152     jcc(Assembler::notZero, BREAK_LOOP);
4153 
4154     addptr(len, 64);
4155     jccb(Assembler::notZero, test_64_loop);
4156 
4157     bind(test_tail);
4158     // bail out when there is nothing to be done
4159     testl(tmp1, -1);
4160     jcc(Assembler::zero, DONE);
4161 
4162 
4163     // check the tail for absence of negatives
4164     // ~(~0 << len) applied up to two times (for 32-bit scenario)
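         // For example, with 5 tail bytes (tmp1 == 5):
         //   ~0 << 5    = ...1111100000
         //   ~(~0 << 5) = ...0000011111   -> k-mask selecting exactly the 5 tail bytes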
4165 #ifdef _LP64
4166     {
4167       Register tmp3_aliased = len;
4168       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4169       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4170       notq(tmp3_aliased);
4171       kmovql(mask2, tmp3_aliased);
4172     }
4173 #else
4174     Label k_init;
4175     jmp(k_init);
4176 
4177     // We cannot load a 64-bit immediate into a general purpose register on
4178     // a 32-bit VM, so we place the data needed to compose the mask into the
4179     // instruction stream instead. We emit a 64-byte series of values 0..63
4180     // which is later used as the compare target against the tail count held
4181     // in the tmp1 register. The result is a k register with tmp1 consecutive
4182     // 1s, counting from the least significant bit.
4183     address tmp = pc();
4184     emit_int64(0x0706050403020100);
4185     emit_int64(0x0F0E0D0C0B0A0908);
4186     emit_int64(0x1716151413121110);
4187     emit_int64(0x1F1E1D1C1B1A1918);
4188     emit_int64(0x2726252423222120);
4189     emit_int64(0x2F2E2D2C2B2A2928);
4190     emit_int64(0x3736353433323130);
4191     emit_int64(0x3F3E3D3C3B3A3938);
4192 
4193     bind(k_init);
4194     lea(len, InternalAddress(tmp));
4195     // create mask to test for negative byte inside a vector
4196     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4197     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4198 
4199 #endif
4200     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4201     ktestq(mask1, mask2);
4202     jcc(Assembler::zero, DONE);
4203 
4204     // do a full check for negative bytes in the tail
4205     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4206                      // ary1 already points to the right place
4207     jmpb(TAIL_START);
4208 
4209     bind(BREAK_LOOP);
4210     // At least one byte in the last 64 byte block was negative.
4211     // Set up to look at the last 64 bytes as if they were a tail
4212     lea(ary1, Address(ary1, len, Address::times_1));
4213     addptr(result, len);
4214     // Ignore the very last byte: if all others are positive,
4215     // it must be negative, so we can skip right to the 2+1 byte
4216     // end comparison at this point
4217     orl(result, 63);
4218     movl(len, 63);
4219     // Fallthru to tail compare
4220   } else {
4221 
4222     if (UseAVX >= 2 && UseSSE >= 2) {
4223       // With AVX2, use 32-byte vector compare
4224       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4225 
4226       // Compare 32-byte vectors
4227       testl(len, 0xffffffe0);   // vector count (in bytes)
4228       jccb(Assembler::zero, TAIL_START);
4229 
4230       andl(len, 0xffffffe0);
4231       lea(ary1, Address(ary1, len, Address::times_1));
4232       negptr(len);
4233 
4234       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4235       movdl(vec2, tmp1);
4236       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4237 
4238       bind(COMPARE_WIDE_VECTORS);
4239       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4240       vptest(vec1, vec2);
4241       jccb(Assembler::notZero, BREAK_LOOP);
4242       addptr(len, 32);
4243       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4244 
4245       testl(result, 0x0000001f);   // any bytes remaining?
4246       jcc(Assembler::zero, DONE);
4247 
4248       // Quick test using the already prepared vector mask
4249       movl(len, result);
4250       andl(len, 0x0000001f);
4251       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4252       vptest(vec1, vec2);
4253       jcc(Assembler::zero, DONE);
4254       // There are zeros, jump to the tail to determine exactly where
4255       jmpb(TAIL_START);
4256 
4257       bind(BREAK_LOOP);
4258       // At least one byte in the last 32-byte vector is negative.
4259       // Set up to look at the last 32 bytes as if they were a tail
4260       lea(ary1, Address(ary1, len, Address::times_1));
4261       addptr(result, len);
4262       // Ignore the very last byte: if all others are positive,
4263       // it must be negative, so we can skip right to the 2+1 byte
4264       // end comparison at this point
4265       orl(result, 31);
4266       movl(len, 31);
4267       // Fallthru to tail compare
4268     } else if (UseSSE42Intrinsics) {
4269       // With SSE4.2, use double quad vector compare
4270       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4271 
4272       // Compare 16-byte vectors
4273       testl(len, 0xfffffff0);   // vector count (in bytes)
4274       jcc(Assembler::zero, TAIL_START);
4275 
4276       andl(len, 0xfffffff0);
4277       lea(ary1, Address(ary1, len, Address::times_1));
4278       negptr(len);
4279 
4280       movl(tmp1, 0x80808080);
4281       movdl(vec2, tmp1);
4282       pshufd(vec2, vec2, 0);
4283 
4284       bind(COMPARE_WIDE_VECTORS);
4285       movdqu(vec1, Address(ary1, len, Address::times_1));
4286       ptest(vec1, vec2);
4287       jccb(Assembler::notZero, BREAK_LOOP);
4288       addptr(len, 16);
4289       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4290 
4291       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4292       jcc(Assembler::zero, DONE);
4293 
4294       // Quick test using the already prepared vector mask
4295       movl(len, result);
4296       andl(len, 0x0000000f);   // tail count (in bytes)
4297       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4298       ptest(vec1, vec2);
4299       jcc(Assembler::zero, DONE);
4300       jmpb(TAIL_START);
4301 
4302       bind(BREAK_LOOP);
4303       // At least one byte in the last 16-byte vector is negative.
4304       // Set up and look at the last 16 bytes as if they were a tail
4305       lea(ary1, Address(ary1, len, Address::times_1));
4306       addptr(result, len);
4307       // Ignore the very last byte: if all others are positive,
4308       // it must be negative, so we can skip right to the 2+1 byte
4309       // end comparison at this point
4310       orl(result, 15);
4311       movl(len, 15);
4312       // Fallthru to tail compare
4313     }
4314   }
4315 
4316   bind(TAIL_START);
4317   // Compare 4-byte vectors
4318   andl(len, 0xfffffffc); // vector count (in bytes)
4319   jccb(Assembler::zero, COMPARE_CHAR);
4320 
4321   lea(ary1, Address(ary1, len, Address::times_1));
4322   negptr(len);
4323 
4324   bind(COMPARE_VECTORS);
4325   movl(tmp1, Address(ary1, len, Address::times_1));
4326   andl(tmp1, 0x80808080);
4327   jccb(Assembler::notZero, TAIL_ADJUST);
4328   addptr(len, 4);
4329   jccb(Assembler::notZero, COMPARE_VECTORS);
4330 
4331   // Compare trailing char (final 2-3 bytes), if any
4332   bind(COMPARE_CHAR);
4333 
4334   testl(result, 0x2);   // tail  char
4335   jccb(Assembler::zero, COMPARE_BYTE);
4336   load_unsigned_short(tmp1, Address(ary1, 0));
4337   andl(tmp1, 0x00008080);
4338   jccb(Assembler::notZero, CHAR_ADJUST);
4339   lea(ary1, Address(ary1, 2));
4340 
4341   bind(COMPARE_BYTE);
4342   testl(result, 0x1);   // tail  byte
4343   jccb(Assembler::zero, DONE);
4344   load_unsigned_byte(tmp1, Address(ary1, 0));
4345   testl(tmp1, 0x00000080);
4346   jccb(Assembler::zero, DONE);
4347   subptr(result, 1);
4348   jmpb(DONE);
4349 
4350   bind(TAIL_ADJUST);
4351   // There are negative bytes in the last 4-byte block.
4352   // Adjust result and check the next three bytes
4353   addptr(result, len);
4354   orl(result, 3);
4355   lea(ary1, Address(ary1, len, Address::times_1));
4356   jmpb(COMPARE_CHAR);
4357 
4358   bind(CHAR_ADJUST);
4359   // We are looking at a char + optional byte tail, and found that one
4360   // of the bytes in the char is negative. Adjust the result, check the
4361   // first byte and readjust if needed.
4362   andl(result, 0xfffffffc);
4363   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4364   jccb(Assembler::notZero, DONE);
4365   addptr(result, 1);
4366 
4367   // That's it
4368   bind(DONE);
4369   if (UseAVX >= 2 && UseSSE >= 2) {
4370     // clean upper bits of YMM registers
4371     vpxor(vec1, vec1);
4372     vpxor(vec2, vec2);
4373   }
4374 }
4375 
4376 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
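     //
     // Roughly equivalent scalar logic for the is_array_equ byte[] case
     // (illustrative sketch only, not the exact library source):
     //
     //   static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null || a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }
     //
     // The vector paths below perform the same comparison 64/32/16 bytes at a
     // time (with expand_ary2 additionally zero-extending ary2 bytes to chars).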
4377 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4378                                       Register limit, Register result, Register chr,
4379                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4380                                       KRegister mask, bool expand_ary2) {
4381   // for expand_ary2, limit is the (smaller) size of the second array.
4382   ShortBranchVerifier sbv(this);
4383   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4384 
4385   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4386          "Expansion only implemented for AVX2");
4387 
4388   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4389   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4390 
4391   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4392   int scaleIncr = expand_ary2 ? 8 : 16;
4393 
4394   if (is_array_equ) {
4395     // Check the input args
4396     cmpoop(ary1, ary2);
4397     jcc(Assembler::equal, TRUE_LABEL);
4398 
4399     // Need additional checks for arrays_equals.
4400     testptr(ary1, ary1);
4401     jcc(Assembler::zero, FALSE_LABEL);
4402     testptr(ary2, ary2);
4403     jcc(Assembler::zero, FALSE_LABEL);
4404 
4405     // Check the lengths
4406     movl(limit, Address(ary1, length_offset));
4407     cmpl(limit, Address(ary2, length_offset));
4408     jcc(Assembler::notEqual, FALSE_LABEL);
4409   }
4410 
4411   // count == 0
4412   testl(limit, limit);
4413   jcc(Assembler::zero, TRUE_LABEL);
4414 
4415   if (is_array_equ) {
4416     // Load array address
4417     lea(ary1, Address(ary1, base_offset));
4418     lea(ary2, Address(ary2, base_offset));
4419   }
4420 
4421   if (is_array_equ && is_char) {
4422     // arrays_equals when used for char[].
4423     shll(limit, 1);      // byte count != 0
4424   }
4425   movl(result, limit); // copy
4426 
4427   if (UseAVX >= 2) {
4428     // With AVX2, use 32-byte vector compare
4429     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4430 
4431     // Compare 32-byte vectors
4432     if (expand_ary2) {
4433       andl(result, 0x0000000f);  //   tail count (in bytes)
4434       andl(limit, 0xfffffff0);   // vector count (in bytes)
4435       jcc(Assembler::zero, COMPARE_TAIL);
4436     } else {
4437       andl(result, 0x0000001f);  //   tail count (in bytes)
4438       andl(limit, 0xffffffe0);   // vector count (in bytes)
4439       jcc(Assembler::zero, COMPARE_TAIL_16);
4440     }
4441 
4442     lea(ary1, Address(ary1, limit, scaleFactor));
4443     lea(ary2, Address(ary2, limit, Address::times_1));
4444     negptr(limit);
4445 
4446 #ifdef _LP64
4447     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4448       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4449 
4450       cmpl(limit, -64);
4451       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4452 
4453       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4454 
4455       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4456       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4457       kortestql(mask, mask);
4458       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4459       addptr(limit, 64);  // update since we already compared at this addr
4460       cmpl(limit, -64);
4461       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4462 
4463       // At this point we may still need to compare -limit+result bytes.
4464       // We could execute the next two instructions and just continue via the non-wide path:
4465       //  cmpl(limit, 0);
4466       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4467       // But since we stopped at the points ary{1,2}+limit which are
4468       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4469       // (|limit| <= 32 and result < 32),
4470       // we may just compare the last 64 bytes.
4471       //
4472       addptr(result, -64);   // it is safe, bc we just came from this area
4473       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4474       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4475       kortestql(mask, mask);
4476       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4477 
4478       jmp(TRUE_LABEL);
4479 
4480       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4481 
4482     }//if (VM_Version::supports_avx512vlbw())
4483 #endif //_LP64
4484     bind(COMPARE_WIDE_VECTORS);
4485     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4486     if (expand_ary2) {
4487       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4488     } else {
4489       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4490     }
4491     vpxor(vec1, vec2);
4492 
4493     vptest(vec1, vec1);
4494     jcc(Assembler::notZero, FALSE_LABEL);
4495     addptr(limit, scaleIncr * 2);
4496     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4497 
4498     testl(result, result);
4499     jcc(Assembler::zero, TRUE_LABEL);
4500 
4501     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4502     if (expand_ary2) {
4503       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4504     } else {
4505       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4506     }
4507     vpxor(vec1, vec2);
4508 
4509     vptest(vec1, vec1);
4510     jcc(Assembler::notZero, FALSE_LABEL);
4511     jmp(TRUE_LABEL);
4512 
4513     bind(COMPARE_TAIL_16); // limit is zero
4514     movl(limit, result);
4515 
4516     // Compare 16-byte chunks
4517     andl(result, 0x0000000f);  //   tail count (in bytes)
4518     andl(limit, 0xfffffff0);   // vector count (in bytes)
4519     jcc(Assembler::zero, COMPARE_TAIL);
4520 
4521     lea(ary1, Address(ary1, limit, scaleFactor));
4522     lea(ary2, Address(ary2, limit, Address::times_1));
4523     negptr(limit);
4524 
4525     bind(COMPARE_WIDE_VECTORS_16);
4526     movdqu(vec1, Address(ary1, limit, scaleFactor));
4527     if (expand_ary2) {
4528       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4529     } else {
4530       movdqu(vec2, Address(ary2, limit, Address::times_1));
4531     }
4532     pxor(vec1, vec2);
4533 
4534     ptest(vec1, vec1);
4535     jcc(Assembler::notZero, FALSE_LABEL);
4536     addptr(limit, scaleIncr);
4537     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4538 
4539     bind(COMPARE_TAIL); // limit is zero
4540     movl(limit, result);
4541     // Fallthru to tail compare
4542   } else if (UseSSE42Intrinsics) {
4543     // With SSE4.2, use double quad vector compare
4544     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4545 
4546     // Compare 16-byte vectors
4547     andl(result, 0x0000000f);  //   tail count (in bytes)
4548     andl(limit, 0xfffffff0);   // vector count (in bytes)
4549     jcc(Assembler::zero, COMPARE_TAIL);
4550 
4551     lea(ary1, Address(ary1, limit, Address::times_1));
4552     lea(ary2, Address(ary2, limit, Address::times_1));
4553     negptr(limit);
4554 
4555     bind(COMPARE_WIDE_VECTORS);
4556     movdqu(vec1, Address(ary1, limit, Address::times_1));
4557     movdqu(vec2, Address(ary2, limit, Address::times_1));
4558     pxor(vec1, vec2);
4559 
4560     ptest(vec1, vec1);
4561     jcc(Assembler::notZero, FALSE_LABEL);
4562     addptr(limit, 16);
4563     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4564 
4565     testl(result, result);
4566     jcc(Assembler::zero, TRUE_LABEL);
4567 
4568     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4569     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4570     pxor(vec1, vec2);
4571 
4572     ptest(vec1, vec1);
4573     jccb(Assembler::notZero, FALSE_LABEL);
4574     jmpb(TRUE_LABEL);
4575 
4576     bind(COMPARE_TAIL); // limit is zero
4577     movl(limit, result);
4578     // Fallthru to tail compare
4579   }
4580 
4581   // Compare 4-byte vectors
4582   if (expand_ary2) {
4583     testl(result, result);
4584     jccb(Assembler::zero, TRUE_LABEL);
4585   } else {
4586     andl(limit, 0xfffffffc); // vector count (in bytes)
4587     jccb(Assembler::zero, COMPARE_CHAR);
4588   }
4589 
4590   lea(ary1, Address(ary1, limit, scaleFactor));
4591   lea(ary2, Address(ary2, limit, Address::times_1));
4592   negptr(limit);
4593 
4594   bind(COMPARE_VECTORS);
4595   if (expand_ary2) {
4596     // There are no "vector" operations for bytes to shorts
4597     movzbl(chr, Address(ary2, limit, Address::times_1));
4598     cmpw(Address(ary1, limit, Address::times_2), chr);
4599     jccb(Assembler::notEqual, FALSE_LABEL);
4600     addptr(limit, 1);
4601     jcc(Assembler::notZero, COMPARE_VECTORS);
4602     jmp(TRUE_LABEL);
4603   } else {
4604     movl(chr, Address(ary1, limit, Address::times_1));
4605     cmpl(chr, Address(ary2, limit, Address::times_1));
4606     jccb(Assembler::notEqual, FALSE_LABEL);
4607     addptr(limit, 4);
4608     jcc(Assembler::notZero, COMPARE_VECTORS);
4609   }
4610 
4611   // Compare trailing char (final 2 bytes), if any
4612   bind(COMPARE_CHAR);
4613   testl(result, 0x2);   // tail  char
4614   jccb(Assembler::zero, COMPARE_BYTE);
4615   load_unsigned_short(chr, Address(ary1, 0));
4616   load_unsigned_short(limit, Address(ary2, 0));
4617   cmpl(chr, limit);
4618   jccb(Assembler::notEqual, FALSE_LABEL);
4619 
4620   if (is_array_equ && is_char) {
4621     bind(COMPARE_BYTE);
4622   } else {
4623     lea(ary1, Address(ary1, 2));
4624     lea(ary2, Address(ary2, 2));
4625 
4626     bind(COMPARE_BYTE);
4627     testl(result, 0x1);   // tail  byte
4628     jccb(Assembler::zero, TRUE_LABEL);
4629     load_unsigned_byte(chr, Address(ary1, 0));
4630     load_unsigned_byte(limit, Address(ary2, 0));
4631     cmpl(chr, limit);
4632     jccb(Assembler::notEqual, FALSE_LABEL);
4633   }
4634   bind(TRUE_LABEL);
4635   movl(result, 1);   // return true
4636   jmpb(DONE);
4637 
4638   bind(FALSE_LABEL);
4639   xorl(result, result); // return false
4640 
4641   // That's it
4642   bind(DONE);
4643   if (UseAVX >= 2) {
4644     // clean upper bits of YMM registers
4645     vpxor(vec1, vec1);
4646     vpxor(vec2, vec2);
4647   }
4648 }
4649 
4650 #ifdef _LP64
4651 
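     // convertF2I below emits a float/double -> int/long conversion. The cvtt*
     // instructions return the "integer indefinite" value (0x80000000 for int,
     // 0x8000000000000000 for long) when the source is NaN or out of range, so
     // the fast path only compares the result against that value and, on a
     // match, takes the out-of-line stub. The stub passes the source on the
     // stack to the matching fixup routine, which produces the Java-defined
     // result (0 for NaN, MIN/MAX on -/+ overflow).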
4652 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4653 #define __ masm.
4654   Register dst = stub.data<0>();
4655   XMMRegister src = stub.data<1>();
4656   address target = stub.data<2>();
4657   __ bind(stub.entry());
4658   __ subptr(rsp, 8);
4659   __ movdbl(Address(rsp), src);
4660   __ call(RuntimeAddress(target));
4661   __ pop(dst);
4662   __ jmp(stub.continuation());
4663 #undef __
4664 }
4665 
4666 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4667   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4668   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4669 
4670   address slowpath_target;
4671   if (dst_bt == T_INT) {
4672     if (src_bt == T_FLOAT) {
4673       cvttss2sil(dst, src);
4674       cmpl(dst, 0x80000000);
4675       slowpath_target = StubRoutines::x86::f2i_fixup();
4676     } else {
4677       cvttsd2sil(dst, src);
4678       cmpl(dst, 0x80000000);
4679       slowpath_target = StubRoutines::x86::d2i_fixup();
4680     }
4681   } else {
4682     if (src_bt == T_FLOAT) {
4683       cvttss2siq(dst, src);
4684       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4685       slowpath_target = StubRoutines::x86::f2l_fixup();
4686     } else {
4687       cvttsd2siq(dst, src);
4688       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4689       slowpath_target = StubRoutines::x86::d2l_fixup();
4690     }
4691   }
4692 
4693   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4694   jcc(Assembler::equal, stub->entry());
4695   bind(stub->continuation());
4696 }
4697 
4698 #endif // _LP64
4699 
4700 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4701                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4702   switch(ideal_opc) {
4703     case Op_LShiftVS:
4704       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4705     case Op_LShiftVI:
4706       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4707     case Op_LShiftVL:
4708       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4709     case Op_RShiftVS:
4710       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4711     case Op_RShiftVI:
4712       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4713     case Op_RShiftVL:
4714       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4715     case Op_URShiftVS:
4716       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4717     case Op_URShiftVI:
4718       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4719     case Op_URShiftVL:
4720       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4721     case Op_RotateRightV:
4722       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4723     case Op_RotateLeftV:
4724       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4725     default:
4726       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4727       break;
4728   }
4729 }
4730 
4731 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4732                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4733   if (is_unsigned) {
4734     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4735   } else {
4736     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4737   }
4738 }
4739 
4740 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4741                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4742   switch (elem_bt) {
4743     case T_BYTE:
4744       if (ideal_opc == Op_SaturatingAddV) {
4745         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4746       } else {
4747         assert(ideal_opc == Op_SaturatingSubV, "");
4748         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4749       }
4750       break;
4751     case T_SHORT:
4752       if (ideal_opc == Op_SaturatingAddV) {
4753         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4754       } else {
4755         assert(ideal_opc == Op_SaturatingSubV, "");
4756         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4757       }
4758       break;
4759     default:
4760       fatal("Unsupported type %s", type2name(elem_bt));
4761       break;
4762   }
4763 }
4764 
4765 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4766                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4767   switch (elem_bt) {
4768     case T_BYTE:
4769       if (ideal_opc == Op_SaturatingAddV) {
4770         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4771       } else {
4772         assert(ideal_opc == Op_SaturatingSubV, "");
4773         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4774       }
4775       break;
4776     case T_SHORT:
4777       if (ideal_opc == Op_SaturatingAddV) {
4778         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4779       } else {
4780         assert(ideal_opc == Op_SaturatingSubV, "");
4781         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4782       }
4783       break;
4784     default:
4785       fatal("Unsupported type %s", type2name(elem_bt));
4786       break;
4787   }
4788 }
4789 
4790 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4791                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4792   if (is_unsigned) {
4793     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4794   } else {
4795     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4796   }
4797 }
4798 
4799 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4800                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4801   switch (elem_bt) {
4802     case T_BYTE:
4803       if (ideal_opc == Op_SaturatingAddV) {
4804         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4805       } else {
4806         assert(ideal_opc == Op_SaturatingSubV, "");
4807         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4808       }
4809       break;
4810     case T_SHORT:
4811       if (ideal_opc == Op_SaturatingAddV) {
4812         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4813       } else {
4814         assert(ideal_opc == Op_SaturatingSubV, "");
4815         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4816       }
4817       break;
4818     default:
4819       fatal("Unsupported type %s", type2name(elem_bt));
4820       break;
4821   }
4822 }
4823 
4824 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4825                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4826   switch (elem_bt) {
4827     case T_BYTE:
4828       if (ideal_opc == Op_SaturatingAddV) {
4829         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4830       } else {
4831         assert(ideal_opc == Op_SaturatingSubV, "");
4832         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4833       }
4834       break;
4835     case T_SHORT:
4836       if (ideal_opc == Op_SaturatingAddV) {
4837         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4838       } else {
4839         assert(ideal_opc == Op_SaturatingSubV, "");
4840         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4841       }
4842       break;
4843     default:
4844       fatal("Unsupported type %s", type2name(elem_bt));
4845       break;
4846   }
4847 }
4848 
4849 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4850                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4851                                     bool is_varshift) {
4852   switch (ideal_opc) {
4853     case Op_AddVB:
4854       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4855     case Op_AddVS:
4856       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4857     case Op_AddVI:
4858       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4859     case Op_AddVL:
4860       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4861     case Op_AddVF:
4862       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4863     case Op_AddVD:
4864       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4865     case Op_SubVB:
4866       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4867     case Op_SubVS:
4868       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4869     case Op_SubVI:
4870       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4871     case Op_SubVL:
4872       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_SubVF:
4874       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_SubVD:
4876       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_MulVS:
4878       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_MulVI:
4880       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_MulVL:
4882       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_MulVF:
4884       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4885     case Op_MulVD:
4886       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4887     case Op_DivVF:
4888       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4889     case Op_DivVD:
4890       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4891     case Op_SqrtVF:
4892       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4893     case Op_SqrtVD:
4894       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4895     case Op_AbsVB:
4896       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4897     case Op_AbsVS:
4898       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4899     case Op_AbsVI:
4900       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4901     case Op_AbsVL:
4902       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4903     case Op_FmaVF:
4904       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4905     case Op_FmaVD:
4906       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4907     case Op_VectorRearrange:
4908       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4909     case Op_LShiftVS:
4910       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4911     case Op_LShiftVI:
4912       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4913     case Op_LShiftVL:
4914       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4915     case Op_RShiftVS:
4916       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4917     case Op_RShiftVI:
4918       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4919     case Op_RShiftVL:
4920       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4921     case Op_URShiftVS:
4922       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4923     case Op_URShiftVI:
4924       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4925     case Op_URShiftVL:
4926       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4927     case Op_RotateLeftV:
4928       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4929     case Op_RotateRightV:
4930       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4931     case Op_MaxV:
4932       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4933     case Op_MinV:
4934       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4935     case Op_UMinV:
4936       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4937     case Op_UMaxV:
4938       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4939     case Op_XorV:
4940       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4941     case Op_OrV:
4942       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4943     case Op_AndV:
4944       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4945     default:
4946       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4947       break;
4948   }
4949 }
4950 
4951 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4952                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4953   switch (ideal_opc) {
4954     case Op_AddVB:
4955       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4956     case Op_AddVS:
4957       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4958     case Op_AddVI:
4959       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4960     case Op_AddVL:
4961       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4962     case Op_AddVF:
4963       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4964     case Op_AddVD:
4965       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4966     case Op_SubVB:
4967       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4968     case Op_SubVS:
4969       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4970     case Op_SubVI:
4971       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4972     case Op_SubVL:
4973       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4974     case Op_SubVF:
4975       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4976     case Op_SubVD:
4977       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4978     case Op_MulVS:
4979       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4980     case Op_MulVI:
4981       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4982     case Op_MulVL:
4983       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4984     case Op_MulVF:
4985       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4986     case Op_MulVD:
4987       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4988     case Op_DivVF:
4989       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4990     case Op_DivVD:
4991       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4992     case Op_FmaVF:
4993       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4994     case Op_FmaVD:
4995       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4996     case Op_MaxV:
4997       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4998     case Op_MinV:
4999       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5000     case Op_UMaxV:
5001       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5002     case Op_UMinV:
5003       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5004     case Op_XorV:
5005       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5006     case Op_OrV:
5007       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5008     case Op_AndV:
5009       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5010     default:
5011       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5012       break;
5013   }
5014 }
5015 
5016 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5017                                   KRegister src1, KRegister src2) {
5018   BasicType etype = T_ILLEGAL;
5019   switch(mask_len) {
5020     case 2:
5021     case 4:
5022     case 8:  etype = T_BYTE; break;
5023     case 16: etype = T_SHORT; break;
5024     case 32: etype = T_INT; break;
5025     case 64: etype = T_LONG; break;
5026     default: fatal("Unsupported type"); break;
5027   }
5028   assert(etype != T_ILLEGAL, "");
5029   switch(ideal_opc) {
5030     case Op_AndVMask:
5031       kand(etype, dst, src1, src2); break;
5032     case Op_OrVMask:
5033       kor(etype, dst, src1, src2); break;
5034     case Op_XorVMask:
5035       kxor(etype, dst, src1, src2); break;
5036     default:
5037       fatal("Unsupported masked operation"); break;
5038   }
5039 }
5040 
5041 /*
5042  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
5043  * If src is NaN, the result is 0.
5044  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5045  * the result is equal to the value of Integer.MIN_VALUE.
5046  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5047  * the result is equal to the value of Integer.MAX_VALUE.
5048  */
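     // Scalar sketch of the per-lane semantics implemented below (illustrative
     // only; it simply restates the Java (int) cast rules described above):
     //
     //   static int f2i(float f) {
     //     if (Float.isNaN(f))          return 0;
     //     if (f <= Integer.MIN_VALUE)  return Integer.MIN_VALUE;
     //     if (f >= Integer.MAX_VALUE)  return Integer.MAX_VALUE;
     //     return (int) f;              // in-range values convert directly
     //   }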
5049 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5050                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5051                                                                    Register rscratch, AddressLiteral float_sign_flip,
5052                                                                    int vec_enc) {
5053   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5054   Label done;
5055   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5056   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5057   vptest(xtmp2, xtmp2, vec_enc);
5058   jccb(Assembler::equal, done);
5059 
5060   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5061   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5062 
5063   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5064   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5065   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5066 
5067   // Recompute the mask for the remaining special values.
5068   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5069   // Extract SRC values corresponding to TRUE mask lanes.
5070   vpand(xtmp4, xtmp2, src, vec_enc);
5071   // Flip the mask bits so that the MSB of the MASK lanes corresponding to
5072   // +ve special values is set.
5073   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5074 
5075   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5076   bind(done);
5077 }
5078 
5079 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5080                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5081                                                                     Register rscratch, AddressLiteral float_sign_flip,
5082                                                                     int vec_enc) {
5083   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5084   Label done;
5085   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5086   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5087   kortestwl(ktmp1, ktmp1);
5088   jccb(Assembler::equal, done);
5089 
5090   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5091   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5092   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5093 
5094   kxorwl(ktmp1, ktmp1, ktmp2);
5095   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5096   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5097   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5098   bind(done);
5099 }
5100 
5101 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5102                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5103                                                                      Register rscratch, AddressLiteral double_sign_flip,
5104                                                                      int vec_enc) {
5105   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5106 
5107   Label done;
5108   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5109   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5110   kortestwl(ktmp1, ktmp1);
5111   jccb(Assembler::equal, done);
5112 
5113   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5114   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5115   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5116 
5117   kxorwl(ktmp1, ktmp1, ktmp2);
5118   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5119   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5120   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5121   bind(done);
5122 }
5123 
5124 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5125                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5126                                                                      Register rscratch, AddressLiteral float_sign_flip,
5127                                                                      int vec_enc) {
5128   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5129   Label done;
5130   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5131   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5132   kortestwl(ktmp1, ktmp1);
5133   jccb(Assembler::equal, done);
5134 
5135   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5136   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5137   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5138 
5139   kxorwl(ktmp1, ktmp1, ktmp2);
5140   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5141   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5142   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5143   bind(done);
5144 }
5145 
5146 /*
5147  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
5148  * If src is NaN, the result is 0.
5149  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5150  * the result is equal to the value of Long.MIN_VALUE.
5151  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5152  * the result is equal to the value of Long.MAX_VALUE.
5153  */
5154 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5155                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5156                                                                       Register rscratch, AddressLiteral double_sign_flip,
5157                                                                       int vec_enc) {
5158   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5159 
5160   Label done;
5161   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5162   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5163   kortestwl(ktmp1, ktmp1);
5164   jccb(Assembler::equal, done);
5165 
5166   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5167   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5168   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5169 
5170   kxorwl(ktmp1, ktmp1, ktmp2);
5171   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5172   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5173   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5174   bind(done);
5175 }
5176 
5177 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5178                                                              XMMRegister xtmp, int index, int vec_enc) {
5179    assert(vec_enc < Assembler::AVX_512bit, "");
5180    if (vec_enc == Assembler::AVX_256bit) {
5181      vextractf128_high(xtmp, src);
5182      vshufps(dst, src, xtmp, index, vec_enc);
5183    } else {
5184      vshufps(dst, src, zero, index, vec_enc);
5185    }
5186 }
5187 
5188 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5189                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5190                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5191   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5192 
5193   Label done;
5194   // Compare the destination lanes with float_sign_flip
5195   // value to get mask for all special values.
5196   movdqu(xtmp1, float_sign_flip, rscratch);
5197   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5198   ptest(xtmp2, xtmp2);
5199   jccb(Assembler::equal, done);
5200 
5201   // Flip float_sign_flip to get max integer value.
5202   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5203   pxor(xtmp1, xtmp4);
5204 
5205   // Set destination lanes corresponding to unordered source lanes to zero.
5206   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5207   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5208 
5209   // Shuffle the mask vector and pack the lower double word from each quadword lane.
5210   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5211   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5212 
5213   // Recompute the mask for the remaining special values.
5214   pxor(xtmp2, xtmp3);
5215   // Extract mask corresponding to non-negative source lanes.
5216   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5217 
5218   // Shuffle the mask vector and pack the lower double word from each quadword lane.
5219   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5220   pand(xtmp3, xtmp2);
5221 
5222   // Replace destination lanes holding special value(0x80000000) with max int
5223   // if corresponding source lane holds a +ve value.
5224   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5225   bind(done);
5226 }
5227 
5228 
5229 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5230                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5231   switch(to_elem_bt) {
5232     case T_SHORT:
5233       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5234       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5235       vpackusdw(dst, dst, zero, vec_enc);
5236       if (vec_enc == Assembler::AVX_256bit) {
5237         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5238       }
5239       break;
5240     case  T_BYTE:
5241       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5242       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5243       vpackusdw(dst, dst, zero, vec_enc);
5244       if (vec_enc == Assembler::AVX_256bit) {
5245         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5246       }
5247       vpackuswb(dst, dst, zero, vec_enc);
5248       break;
5249     default: assert(false, "%s", type2name(to_elem_bt));
5250   }
5251 }
5252 
5253 /*
5254  * Algorithm for vector D2L and F2I conversions:-
5255  * a) Perform vector D2L/F2I cast.
5256  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5257  *    It signifies that source value could be any of the special floating point
5258  *    values(NaN,-Inf,Inf,Max,-Min).
5259  * c) Set destination to zero if source is NaN value.
5260  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5261  */
5262 
5263 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5264                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5265                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5266   int to_elem_sz = type2aelembytes(to_elem_bt);
5267   assert(to_elem_sz <= 4, "");
5268   vcvttps2dq(dst, src, vec_enc);
5269   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5270   if (to_elem_sz < 4) {
5271     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5272     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5273   }
5274 }
5275 
5276 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5277                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5278                                             Register rscratch, int vec_enc) {
5279   int to_elem_sz = type2aelembytes(to_elem_bt);
5280   assert(to_elem_sz <= 4, "");
5281   vcvttps2dq(dst, src, vec_enc);
5282   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5283   switch(to_elem_bt) {
5284     case T_INT:
5285       break;
5286     case T_SHORT:
5287       evpmovdw(dst, dst, vec_enc);
5288       break;
5289     case T_BYTE:
5290       evpmovdb(dst, dst, vec_enc);
5291       break;
5292     default: assert(false, "%s", type2name(to_elem_bt));
5293   }
5294 }
5295 
5296 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5297                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5298                                             Register rscratch, int vec_enc) {
5299   evcvttps2qq(dst, src, vec_enc);
5300   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5301 }
5302 
5303 // Handling for downcasting from double to integer or sub-word types on AVX2.
5304 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5305                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5306                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5307   int to_elem_sz = type2aelembytes(to_elem_bt);
5308   assert(to_elem_sz < 8, "");
5309   vcvttpd2dq(dst, src, vec_enc);
5310   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5311                                               float_sign_flip, vec_enc);
5312   if (to_elem_sz < 4) {
5313     // xtmp4 holds all zero lanes.
5314     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5315   }
5316 }
5317 
5318 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5319                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5320                                             KRegister ktmp2, AddressLiteral sign_flip,
5321                                             Register rscratch, int vec_enc) {
5322   if (VM_Version::supports_avx512dq()) {
5323     evcvttpd2qq(dst, src, vec_enc);
5324     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5325     switch(to_elem_bt) {
5326       case T_LONG:
5327         break;
5328       case T_INT:
5329         evpmovsqd(dst, dst, vec_enc);
5330         break;
5331       case T_SHORT:
5332         evpmovsqd(dst, dst, vec_enc);
5333         evpmovdw(dst, dst, vec_enc);
5334         break;
5335       case T_BYTE:
5336         evpmovsqd(dst, dst, vec_enc);
5337         evpmovdb(dst, dst, vec_enc);
5338         break;
5339       default: assert(false, "%s", type2name(to_elem_bt));
5340     }
5341   } else {
5342     assert(type2aelembytes(to_elem_bt) <= 4, "");
5343     vcvttpd2dq(dst, src, vec_enc);
5344     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5345     switch(to_elem_bt) {
5346       case T_INT:
5347         break;
5348       case T_SHORT:
5349         evpmovdw(dst, dst, vec_enc);
5350         break;
5351       case T_BYTE:
5352         evpmovdb(dst, dst, vec_enc);
5353         break;
5354       default: assert(false, "%s", type2name(to_elem_bt));
5355     }
5356   }
5357 }
5358 
5359 #ifdef _LP64
5360 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5361                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5362                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5363   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
5364   // then restore the original MXCSR.RC mode afterwards.
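       // For example, Math.round(2.5)  = floor(2.5 + 0.5)  = floor(3.0)  =  3
       //          and Math.round(-2.5) = floor(-2.5 + 0.5) = floor(-2.0) = -2.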
5365   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5366 
5367   mov64(tmp, julong_cast(0.5L));
5368   evpbroadcastq(xtmp1, tmp, vec_enc);
5369   vaddpd(xtmp1, src , xtmp1, vec_enc);
5370   evcvtpd2qq(dst, xtmp1, vec_enc);
5371   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5372                                                 double_sign_flip, vec_enc);
5373 
5374   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5375 }
5376 
5377 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5378                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5379                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5380   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
5381   // then restore the original MXCSR.RC mode afterwards.
5382   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5383 
5384   movl(tmp, jint_cast(0.5));
5385   movq(xtmp1, tmp);
5386   vbroadcastss(xtmp1, xtmp1, vec_enc);
5387   vaddps(xtmp1, src , xtmp1, vec_enc);
5388   vcvtps2dq(dst, xtmp1, vec_enc);
5389   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5390                                               float_sign_flip, vec_enc);
5391 
5392   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5393 }
5394 
5395 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5396                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5397                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5398   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
5399   // then restore the original MXCSR.RC mode afterwards.
5400   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5401 
5402   movl(tmp, jint_cast(0.5));
5403   movq(xtmp1, tmp);
5404   vbroadcastss(xtmp1, xtmp1, vec_enc);
5405   vaddps(xtmp1, src , xtmp1, vec_enc);
5406   vcvtps2dq(dst, xtmp1, vec_enc);
5407   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5408 
5409   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5410 }
5411 #endif // _LP64
5412 
5413 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5414                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5415   switch (from_elem_bt) {
5416     case T_BYTE:
5417       switch (to_elem_bt) {
5418         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5419         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5420         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5421         default: ShouldNotReachHere();
5422       }
5423       break;
5424     case T_SHORT:
5425       switch (to_elem_bt) {
5426         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5427         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5428         default: ShouldNotReachHere();
5429       }
5430       break;
5431     case T_INT:
5432       assert(to_elem_bt == T_LONG, "");
5433       vpmovzxdq(dst, src, vlen_enc);
5434       break;
5435     default:
5436       ShouldNotReachHere();
5437   }
5438 }
5439 
5440 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5441                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5442   switch (from_elem_bt) {
5443     case T_BYTE:
5444       switch (to_elem_bt) {
5445         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5446         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5447         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5448         default: ShouldNotReachHere();
5449       }
5450       break;
5451     case T_SHORT:
5452       switch (to_elem_bt) {
5453         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5454         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5455         default: ShouldNotReachHere();
5456       }
5457       break;
5458     case T_INT:
5459       assert(to_elem_bt == T_LONG, "");
5460       vpmovsxdq(dst, src, vlen_enc);
5461       break;
5462     default:
5463       ShouldNotReachHere();
5464   }
5465 }
5466 
5467 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5468                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5469   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5470   assert(vlen_enc != AVX_512bit, "");
5471 
5472   int dst_bt_size = type2aelembytes(dst_bt);
5473   int src_bt_size = type2aelembytes(src_bt);
5474   if (dst_bt_size > src_bt_size) {
5475     switch (dst_bt_size / src_bt_size) {
5476       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5477       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5478       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5479       default: ShouldNotReachHere();
5480     }
5481   } else {
5482     assert(dst_bt_size < src_bt_size, "");
5483     switch (src_bt_size / dst_bt_size) {
5484       case 2: {
5485         if (vlen_enc == AVX_128bit) {
5486           vpacksswb(dst, src, src, vlen_enc);
5487         } else {
5488           vpacksswb(dst, src, src, vlen_enc);
5489           vpermq(dst, dst, 0x08, vlen_enc);
5490         }
5491         break;
5492       }
5493       case 4: {
5494         if (vlen_enc == AVX_128bit) {
5495           vpackssdw(dst, src, src, vlen_enc);
5496           vpacksswb(dst, dst, dst, vlen_enc);
5497         } else {
5498           vpackssdw(dst, src, src, vlen_enc);
5499           vpermq(dst, dst, 0x08, vlen_enc);
5500           vpacksswb(dst, dst, dst, AVX_128bit);
5501         }
5502         break;
5503       }
5504       case 8: {
5505         if (vlen_enc == AVX_128bit) {
5506           vpshufd(dst, src, 0x08, vlen_enc);
5507           vpackssdw(dst, dst, dst, vlen_enc);
5508           vpacksswb(dst, dst, dst, vlen_enc);
5509         } else {
5510           vpshufd(dst, src, 0x08, vlen_enc);
5511           vpermq(dst, dst, 0x08, vlen_enc);
5512           vpackssdw(dst, dst, dst, AVX_128bit);
5513           vpacksswb(dst, dst, dst, AVX_128bit);
5514         }
5515         break;
5516       }
5517       default: ShouldNotReachHere();
5518     }
5519   }
5520 }
5521 
5522 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5523                                    bool merge, BasicType bt, int vlen_enc) {
5524   if (bt == T_INT) {
5525     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5526   } else {
5527     assert(bt == T_LONG, "");
5528     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5529   }
5530 }
5531 
5532 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5533                                    bool merge, BasicType bt, int vlen_enc) {
5534   if (bt == T_INT) {
5535     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5536   } else {
5537     assert(bt == T_LONG, "");
5538     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5539   }
5540 }
5541 
5542 #ifdef _LP64
5543 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5544                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5545                                                int vec_enc) {
5546   int index = 0;
5547   int vindex = 0;
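  // pdep with the 0x0101010101010101 selector distributes the low 8 bits of src
  // into the least significant bit of each byte of rtmp1, producing one 0x00/0x01
  // byte per mask bit.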
5548   mov64(rtmp1, 0x0101010101010101L);
5549   pdepq(rtmp1, src, rtmp1);
5550   if (mask_len > 8) {
5551     movq(rtmp2, src);
5552     vpxor(xtmp, xtmp, xtmp, vec_enc);
5553     movq(xtmp, rtmp1);
5554   }
5555   movq(dst, rtmp1);
5556 
5557   mask_len -= 8;
5558   while (mask_len > 0) {
5559     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5560     index++;
5561     if ((index % 2) == 0) {
5562       pxor(xtmp, xtmp);
5563     }
5564     mov64(rtmp1, 0x0101010101010101L);
5565     shrq(rtmp2, 8);
5566     pdepq(rtmp1, rtmp2, rtmp1);
5567     pinsrq(xtmp, rtmp1, index % 2);
5568     vindex = index / 2;
5569     if (vindex) {
      // Write the entire 16 byte vector only once both 64 bit
      // lanes have been updated, to avoid redundant instructions.
5572       if (index % 2) {
5573         vinsertf128(dst, dst, xtmp, vindex);
5574       }
5575     } else {
5576       vmovdqu(dst, xtmp);
5577     }
5578     mask_len -= 8;
5579   }
5580 }
5581 
5582 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5583   switch(opc) {
5584     case Op_VectorMaskTrueCount:
5585       popcntq(dst, tmp);
5586       break;
5587     case Op_VectorMaskLastTrue:
5588       if (VM_Version::supports_lzcnt()) {
5589         lzcntq(tmp, tmp);
5590         movl(dst, 63);
5591         subl(dst, tmp);
5592       } else {
5593         movl(dst, -1);
5594         bsrq(tmp, tmp);
5595         cmov32(Assembler::notZero, dst, tmp);
5596       }
5597       break;
5598     case Op_VectorMaskFirstTrue:
5599       if (VM_Version::supports_bmi1()) {
5600         if (masklen < 32) {
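          // Setting bit 'masklen' guarantees that tzcntl returns masklen
          // (i.e. "no lane set") when the mask is empty.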
5601           orl(tmp, 1 << masklen);
5602           tzcntl(dst, tmp);
5603         } else if (masklen == 32) {
5604           tzcntl(dst, tmp);
5605         } else {
5606           assert(masklen == 64, "");
5607           tzcntq(dst, tmp);
5608         }
5609       } else {
5610         if (masklen < 32) {
5611           orl(tmp, 1 << masklen);
5612           bsfl(dst, tmp);
5613         } else {
5614           assert(masklen == 32 || masklen == 64, "");
5615           movl(dst, masklen);
5616           if (masklen == 32)  {
5617             bsfl(tmp, tmp);
5618           } else {
5619             bsfq(tmp, tmp);
5620           }
5621           cmov32(Assembler::notZero, dst, tmp);
5622         }
5623       }
5624       break;
5625     case Op_VectorMaskToLong:
5626       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5627       break;
5628     default: assert(false, "Unhandled mask operation");
5629   }
5630 }
5631 
5632 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5633                                               int masklen, int masksize, int vec_enc) {
5634   assert(VM_Version::supports_popcnt(), "");
5635 
  if (VM_Version::supports_avx512bw()) {
5637     kmovql(tmp, mask);
5638   } else {
5639     assert(masklen <= 16, "");
5640     kmovwl(tmp, mask);
5641   }
5642 
  // A mask generated from partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5645   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5646     andq(tmp, (1 << masklen) - 1);
5647   }
5648 
5649   vector_mask_operation_helper(opc, dst, tmp, masklen);
5650 }
5651 
5652 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5653                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5654   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5655          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5656   assert(VM_Version::supports_popcnt(), "");
5657 
5658   bool need_clip = false;
5659   switch(bt) {
5660     case T_BOOLEAN:
      // Masks of other types contain lane values 0/-1, while boolean masks contain 0/1;
      // negate to turn 1 into -1 before extracting the sign bits.
5662       vpxor(xtmp, xtmp, xtmp, vec_enc);
5663       vpsubb(xtmp, xtmp, mask, vec_enc);
5664       vpmovmskb(tmp, xtmp, vec_enc);
5665       need_clip = masklen < 16;
5666       break;
5667     case T_BYTE:
5668       vpmovmskb(tmp, mask, vec_enc);
5669       need_clip = masklen < 16;
5670       break;
5671     case T_SHORT:
5672       vpacksswb(xtmp, mask, mask, vec_enc);
5673       if (masklen >= 16) {
5674         vpermpd(xtmp, xtmp, 8, vec_enc);
5675       }
5676       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5677       need_clip = masklen < 16;
5678       break;
5679     case T_INT:
5680     case T_FLOAT:
5681       vmovmskps(tmp, mask, vec_enc);
5682       need_clip = masklen < 4;
5683       break;
5684     case T_LONG:
5685     case T_DOUBLE:
5686       vmovmskpd(tmp, mask, vec_enc);
5687       need_clip = masklen < 2;
5688       break;
5689     default: assert(false, "Unhandled type, %s", type2name(bt));
5690   }
5691 
  // A mask generated from partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5694   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5695     // need_clip implies masklen < 32
5696     andq(tmp, (1 << masklen) - 1);
5697   }
5698 
5699   vector_mask_operation_helper(opc, dst, tmp, masklen);
5700 }
5701 
5702 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5703                                              Register rtmp2, int mask_len) {
5704   kmov(rtmp1, src);
5705   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5706   mov64(rtmp2, -1L);
5707   pextq(rtmp2, rtmp2, rtmp1);
5708   kmov(dst, rtmp2);
5709 }
5710 
5711 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5712                                                     XMMRegister mask, Register rtmp, Register rscratch,
5713                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5714                                                     int vec_enc) {
5715   assert(type2aelembytes(bt) >= 4, "");
5716   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5717   address compress_perm_table = nullptr;
5718   address expand_perm_table = nullptr;
5719   if (type2aelembytes(bt) == 8) {
5720     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5721     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5722     vmovmskpd(rtmp, mask, vec_enc);
5723   } else {
5724     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5725     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5726     vmovmskps(rtmp, mask, vec_enc);
5727   }
5728   shlq(rtmp, 5); // for 32 byte permute row.
5729   if (opcode == Op_CompressV) {
5730     lea(rscratch, ExternalAddress(compress_perm_table));
5731   } else {
5732     lea(rscratch, ExternalAddress(expand_perm_table));
5733   }
5734   addptr(rtmp, rscratch);
5735   vmovdqu(permv, Address(rtmp));
5736   vpermps(dst, permv, src, Assembler::AVX_256bit);
5737   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default
  // value), so the row doubles as a blending mask after
  // compressing/expanding the source vector lanes.
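  // Illustrative example (assuming the permute tables are laid out as 32 byte rows
  // indexed by the mask value, as the shlq(rtmp, 5) below implies): for T_INT with
  // vec_enc == AVX_256bit and mask lanes {1,0,1,0,0,0,0,0} (mask value 0b101), the
  // compress row would be {0, 2, -1, -1, -1, -1, -1, -1}, producing
  // dst = {src[0], src[2], 0, 0, 0, 0, 0, 0} after the blend.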
5742   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5743 }
5744 
5745 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5746                                                bool merge, BasicType bt, int vec_enc) {
5747   if (opcode == Op_CompressV) {
5748     switch(bt) {
5749     case T_BYTE:
5750       evpcompressb(dst, mask, src, merge, vec_enc);
5751       break;
5752     case T_CHAR:
5753     case T_SHORT:
5754       evpcompressw(dst, mask, src, merge, vec_enc);
5755       break;
5756     case T_INT:
5757       evpcompressd(dst, mask, src, merge, vec_enc);
5758       break;
5759     case T_FLOAT:
5760       evcompressps(dst, mask, src, merge, vec_enc);
5761       break;
5762     case T_LONG:
5763       evpcompressq(dst, mask, src, merge, vec_enc);
5764       break;
5765     case T_DOUBLE:
5766       evcompresspd(dst, mask, src, merge, vec_enc);
5767       break;
5768     default:
5769       fatal("Unsupported type %s", type2name(bt));
5770       break;
5771     }
5772   } else {
5773     assert(opcode == Op_ExpandV, "");
5774     switch(bt) {
5775     case T_BYTE:
5776       evpexpandb(dst, mask, src, merge, vec_enc);
5777       break;
5778     case T_CHAR:
5779     case T_SHORT:
5780       evpexpandw(dst, mask, src, merge, vec_enc);
5781       break;
5782     case T_INT:
5783       evpexpandd(dst, mask, src, merge, vec_enc);
5784       break;
5785     case T_FLOAT:
5786       evexpandps(dst, mask, src, merge, vec_enc);
5787       break;
5788     case T_LONG:
5789       evpexpandq(dst, mask, src, merge, vec_enc);
5790       break;
5791     case T_DOUBLE:
5792       evexpandpd(dst, mask, src, merge, vec_enc);
5793       break;
5794     default:
5795       fatal("Unsupported type %s", type2name(bt));
5796       break;
5797     }
5798   }
5799 }
5800 #endif
5801 
5802 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5803                                            KRegister ktmp1, int vec_enc) {
5804   if (opcode == Op_SignumVD) {
5805     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or +0.0, return src.
5810     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5811     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5812   } else {
5813     assert(opcode == Op_SignumVF, "");
5814     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or +0.0, return src.
5819     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5820     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5821   }
5822 }
5823 
5824 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5825                                           XMMRegister xtmp1, int vec_enc) {
5826   if (opcode == Op_SignumVD) {
5827     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or +0.0, return src.
5831     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5832     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5833   } else {
5834     assert(opcode == Op_SignumVF, "");
5835     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or +0.0, return src.
5839     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5840     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5841   }
5842 }
5843 
5844 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5845   if (VM_Version::supports_avx512bw()) {
5846     if (mask_len > 32) {
5847       kmovql(dst, src);
5848     } else {
5849       kmovdl(dst, src);
5850       if (mask_len != 32) {
5851         kshiftrdl(dst, dst, 32 - mask_len);
5852       }
5853     }
5854   } else {
5855     assert(mask_len <= 16, "");
5856     kmovwl(dst, src);
5857     if (mask_len != 16) {
5858       kshiftrwl(dst, dst, 16 - mask_len);
5859     }
5860   }
5861 }
5862 
5863 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5864   int lane_size = type2aelembytes(bt);
5865   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5866   if ((is_LP64 || lane_size < 8) &&
5867       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5868        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5869     movptr(rtmp, imm32);
5870     switch(lane_size) {
5871       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5872       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5873       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5874       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5875       fatal("Unsupported lane size %d", lane_size);
5876       break;
5877     }
5878   } else {
5879     movptr(rtmp, imm32);
5880     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5881     switch(lane_size) {
5882       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5883       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5884       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5885       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5886       fatal("Unsupported lane size %d", lane_size);
5887       break;
5888     }
5889   }
5890 }
5891 
5892 //
// The following is a lookup-table based popcount computation algorithm:
5894 //       Index   Bit set count
5895 //     [ 0000 ->   0,
5896 //       0001 ->   1,
5897 //       0010 ->   1,
5898 //       0011 ->   2,
5899 //       0100 ->   1,
5900 //       0101 ->   2,
5901 //       0110 ->   2,
5902 //       0111 ->   3,
5903 //       1000 ->   1,
5904 //       1001 ->   2,
5905 //       1010 ->   3,
5906 //       1011 ->   3,
5907 //       1100 ->   2,
5908 //       1101 ->   3,
5909 //       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
5915 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5916 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5917 //     count of all the bytes of a quadword.
5918 //  f. Perform step e. for upper 128bit vector lane.
5919 //  g. Pack the bitset count of quadwords back to double word.
5920 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
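//
// Illustrative scalar sketch of steps a-d for a single byte (not used by the
// assembler; the 16-entry table mirrors the bit counts listed above and is only
// assumed to match the layout of StubRoutines::x86::vector_popcount_lut()):
//
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[b >> 4];
//   }
//
// e.g. popcount_byte(0xB6) = lut[0x6] + lut[0xB] = 2 + 3 = 5.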
5921 
5922 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5923                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5924   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5925   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5926   vpsrlw(dst, src, 4, vec_enc);
5927   vpand(dst, dst, xtmp1, vec_enc);
5928   vpand(xtmp1, src, xtmp1, vec_enc);
5929   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5930   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5931   vpshufb(dst, xtmp2, dst, vec_enc);
5932   vpaddb(dst, dst, xtmp1, vec_enc);
5933 }
5934 
5935 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5936                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5937   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5938   // Following code is as per steps e,f,g and h of above algorithm.
5939   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5940   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5941   vpsadbw(dst, dst, xtmp2, vec_enc);
5942   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5943   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5944   vpackuswb(dst, xtmp1, dst, vec_enc);
5945 }
5946 
5947 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5948                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5949   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5950   // Add the popcount of upper and lower bytes of word.
5951   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5952   vpsrlw(dst, xtmp1, 8, vec_enc);
5953   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5954   vpaddw(dst, dst, xtmp1, vec_enc);
5955 }
5956 
5957 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5958                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5959   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5960   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5961   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5962 }
5963 
5964 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5965                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5966   switch(bt) {
5967     case T_LONG:
5968       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5969       break;
5970     case T_INT:
5971       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5972       break;
5973     case T_CHAR:
5974     case T_SHORT:
5975       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5976       break;
5977     case T_BYTE:
5978     case T_BOOLEAN:
5979       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5980       break;
5981     default:
5982       fatal("Unsupported type %s", type2name(bt));
5983       break;
5984   }
5985 }
5986 
5987 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5988                                                       KRegister mask, bool merge, int vec_enc) {
5989   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5990   switch(bt) {
5991     case T_LONG:
5992       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5993       evpopcntq(dst, mask, src, merge, vec_enc);
5994       break;
5995     case T_INT:
5996       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5997       evpopcntd(dst, mask, src, merge, vec_enc);
5998       break;
5999     case T_CHAR:
6000     case T_SHORT:
6001       assert(VM_Version::supports_avx512_bitalg(), "");
6002       evpopcntw(dst, mask, src, merge, vec_enc);
6003       break;
6004     case T_BYTE:
6005     case T_BOOLEAN:
6006       assert(VM_Version::supports_avx512_bitalg(), "");
6007       evpopcntb(dst, mask, src, merge, vec_enc);
6008       break;
6009     default:
6010       fatal("Unsupported type %s", type2name(bt));
6011       break;
6012   }
6013 }
6014 
6015 #ifndef _LP64
6016 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
6017   assert(VM_Version::supports_avx512bw(), "");
6018   kmovdl(tmp, src);
6019   kunpckdql(dst, tmp, tmp);
6020 }
6021 #endif
6022 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value; the reversed bit sequence for a byte is then
// obtained by swapping the reversed bit sequences of its upper and lower
// nibbles.
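//
// Illustrative scalar sketch of the nibble lookup scheme (not used by the
// assembler; the 16-entry table is a plain bit-reversed nibble table and is only
// assumed to match the layout of StubRoutines::x86::vector_reverse_bit_lut()):
//
//   static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                    0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//   uint8_t reverse_byte_bits(uint8_t b) {
//     return (rev4[b & 0x0F] << 4) | rev4[b >> 4];   // swap reversed nibbles
//   }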
6029 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6030                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6031   if (VM_Version::supports_avx512vlbw()) {
6032 
6033     // Get the reverse bit sequence of lower nibble of each byte.
6034     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6035     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6036     evpandq(dst, xtmp2, src, vec_enc);
6037     vpshufb(dst, xtmp1, dst, vec_enc);
6038     vpsllq(dst, dst, 4, vec_enc);
6039 
6040     // Get the reverse bit sequence of upper nibble of each byte.
6041     vpandn(xtmp2, xtmp2, src, vec_enc);
6042     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6043     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6044 
    // OR the left shifted reversed bit sequence of the lower nibble with the right
    // shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
6047     evporq(xtmp2, dst, xtmp2, vec_enc);
6048     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6049 
  } else if (vec_enc == Assembler::AVX_512bit) {
6051     // Shift based bit reversal.
6052     assert(bt == T_LONG || bt == T_INT, "");
6053 
6054     // Swap lower and upper nibble of each byte.
6055     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6056 
6057     // Swap two least and most significant bits of each nibble.
6058     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6059 
6060     // Swap adjacent pair of bits.
6061     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6062     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6063 
6064     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6065     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6066   } else {
6067     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6068     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6069 
6070     // Get the reverse bit sequence of lower nibble of each byte.
6071     vpand(dst, xtmp2, src, vec_enc);
6072     vpshufb(dst, xtmp1, dst, vec_enc);
6073     vpsllq(dst, dst, 4, vec_enc);
6074 
6075     // Get the reverse bit sequence of upper nibble of each byte.
6076     vpandn(xtmp2, xtmp2, src, vec_enc);
6077     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6078     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6079 
    // OR the left shifted reversed bit sequence of the lower nibble with the right
    // shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
6082     vpor(xtmp2, dst, xtmp2, vec_enc);
6083     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6084   }
6085 }
6086 
6087 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6088                                                 XMMRegister xtmp, Register rscratch) {
6089   assert(VM_Version::supports_gfni(), "");
6090   assert(rscratch != noreg || always_reachable(mask), "missing");
6091 
6092   // Galois field instruction based bit reversal based on following algorithm.
6093   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6094   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6095   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6096   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6097 }
6098 
6099 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6100                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6101   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6102   evpandq(dst, xtmp1, src, vec_enc);
6103   vpsllq(dst, dst, nbits, vec_enc);
6104   vpandn(xtmp1, xtmp1, src, vec_enc);
6105   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6106   evporq(dst, dst, xtmp1, vec_enc);
6107 }
6108 
6109 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6110                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6111   // Shift based bit reversal.
6112   assert(VM_Version::supports_evex(), "");
6113   switch(bt) {
6114     case T_LONG:
6115       // Swap upper and lower double word of each quad word.
6116       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6117       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6118       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6119       break;
6120     case T_INT:
6121       // Swap upper and lower word of each double word.
6122       evprord(xtmp1, k0, src, 16, true, vec_enc);
6123       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6124       break;
6125     case T_CHAR:
6126     case T_SHORT:
6127       // Swap upper and lower byte of each word.
6128       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6129       break;
6130     case T_BYTE:
6131       evmovdquq(dst, k0, src, true, vec_enc);
6132       break;
6133     default:
6134       fatal("Unsupported type %s", type2name(bt));
6135       break;
6136   }
6137 }
6138 
6139 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6140   if (bt == T_BYTE) {
6141     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6142       evmovdquq(dst, k0, src, true, vec_enc);
6143     } else {
6144       vmovdqu(dst, src);
6145     }
6146     return;
6147   }
6148   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6149   // pre-computed shuffle indices.
6150   switch(bt) {
6151     case T_LONG:
6152       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6153       break;
6154     case T_INT:
6155       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6156       break;
6157     case T_CHAR:
6158     case T_SHORT:
6159       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6160       break;
6161     default:
6162       fatal("Unsupported type %s", type2name(bt));
6163       break;
6164   }
6165   vpshufb(dst, src, dst, vec_enc);
6166 }
6167 
6168 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6169                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6170                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6171   assert(is_integral_type(bt), "");
6172   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6173   assert(VM_Version::supports_avx512cd(), "");
6174   switch(bt) {
6175     case T_LONG:
6176       evplzcntq(dst, ktmp, src, merge, vec_enc);
6177       break;
6178     case T_INT:
6179       evplzcntd(dst, ktmp, src, merge, vec_enc);
6180       break;
6181     case T_SHORT:
6182       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6183       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6184       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6185       vpunpckhwd(dst, xtmp1, src, vec_enc);
6186       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6187       vpackusdw(dst, xtmp2, dst, vec_enc);
6188       break;
6189     case T_BYTE:
6190       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6191       // accessing the lookup table.
6192       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6193       // accessing the lookup table.
6194       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6195       assert(VM_Version::supports_avx512bw(), "");
6196       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6197       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6198       vpand(xtmp2, dst, src, vec_enc);
6199       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6200       vpsrlw(xtmp3, src, 4, vec_enc);
6201       vpand(xtmp3, dst, xtmp3, vec_enc);
6202       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6203       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6204       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6205       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6206       break;
6207     default:
6208       fatal("Unsupported type %s", type2name(bt));
6209       break;
6210   }
6211 }
6212 
6213 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6214                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6215   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6216   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6217   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6218   // accessing the lookup table.
6219   vpand(dst, xtmp2, src, vec_enc);
6220   vpshufb(dst, xtmp1, dst, vec_enc);
6221   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6222   // accessing the lookup table.
6223   vpsrlw(xtmp3, src, 4, vec_enc);
6224   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6225   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6226   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6227   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6228   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6229   vpaddb(dst, dst, xtmp2, vec_enc);
6230   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6231 }
6232 
6233 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6234                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6235   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6236   // Add zero counts of lower byte and upper byte of a word if
6237   // upper byte holds a zero value.
6238   vpsrlw(xtmp3, src, 8, vec_enc);
6239   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6240   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6241   vpsllw(xtmp2, dst, 8, vec_enc);
6242   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6243   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6244   vpsrlw(dst, dst, 8, vec_enc);
6245 }
6246 
6247 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6248                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized
  // 1.x form, the biased exponent can be used to compute the leading zero count as
  // per the following formula:
  //   LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, MAX_INT and negative source values.
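  // Worked example (illustrative): src = 0x00001000 (only bit 12 set).
  //   float(0x00001000) = 2^12 => biased_exp = 127 + 12 = 139,
  //   exponent_plus_one = 12 + 1 = 13, LZCNT = 32 - 13 = 19 = 31 - 12.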
6254 
6255   // Broadcast 0xFF
6256   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6257   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6258 
6259   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6260   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6261   // contributes to the leading number of zeros.
6262   vpsrld(xtmp2, src, 1, vec_enc);
6263   vpandn(xtmp3, xtmp2, src, vec_enc);
6264 
6265   // Extract biased exponent.
6266   vcvtdq2ps(dst, xtmp3, vec_enc);
6267   vpsrld(dst, dst, 23, vec_enc);
6268   vpand(dst, dst, xtmp1, vec_enc);
6269 
6270   // Broadcast 127.
6271   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6272   // Exponent = biased_exp - 127
6273   vpsubd(dst, dst, xtmp1, vec_enc);
6274 
6275   // Exponent_plus_one = Exponent + 1
6276   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6277   vpaddd(dst, dst, xtmp3, vec_enc);
6278 
6279   // Replace -ve exponent with zero, exponent is -ve when src
6280   // lane contains a zero value.
6281   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6282   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6283 
6284   // Rematerialize broadcast 32.
6285   vpslld(xtmp1, xtmp3, 5, vec_enc);
6286   // Exponent is 32 if corresponding source lane contains max_int value.
6287   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6288   // LZCNT = 32 - exponent_plus_one
6289   vpsubd(dst, xtmp1, dst, vec_enc);
6290 
6291   // Replace LZCNT with a value 1 if corresponding source lane
6292   // contains max_int value.
6293   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6294 
6295   // Replace biased_exp with 0 if source lane value is less than zero.
6296   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6297   vblendvps(dst, dst, xtmp2, src, vec_enc);
6298 }
6299 
6300 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6301                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6302   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6303   // Add zero counts of lower word and upper word of a double word if
6304   // upper word holds a zero value.
6305   vpsrld(xtmp3, src, 16, vec_enc);
6306   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6307   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6308   vpslld(xtmp2, dst, 16, vec_enc);
6309   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6310   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6311   vpsrld(dst, dst, 16, vec_enc);
6312   // Add zero counts of lower doubleword and upper doubleword of a
6313   // quadword if upper doubleword holds a zero value.
6314   vpsrlq(xtmp3, src, 32, vec_enc);
6315   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6316   vpsllq(xtmp2, dst, 32, vec_enc);
6317   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6318   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6319   vpsrlq(dst, dst, 32, vec_enc);
6320 }
6321 
6322 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6323                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6324                                                        Register rtmp, int vec_enc) {
6325   assert(is_integral_type(bt), "unexpected type");
6326   assert(vec_enc < Assembler::AVX_512bit, "");
6327   switch(bt) {
6328     case T_LONG:
6329       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6330       break;
6331     case T_INT:
6332       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6333       break;
6334     case T_SHORT:
6335       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6336       break;
6337     case T_BYTE:
6338       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6339       break;
6340     default:
6341       fatal("Unsupported type %s", type2name(bt));
6342       break;
6343   }
6344 }
6345 
6346 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6347   switch(bt) {
6348     case T_BYTE:
6349       vpsubb(dst, src1, src2, vec_enc);
6350       break;
6351     case T_SHORT:
6352       vpsubw(dst, src1, src2, vec_enc);
6353       break;
6354     case T_INT:
6355       vpsubd(dst, src1, src2, vec_enc);
6356       break;
6357     case T_LONG:
6358       vpsubq(dst, src1, src2, vec_enc);
6359       break;
6360     default:
6361       fatal("Unsupported type %s", type2name(bt));
6362       break;
6363   }
6364 }
6365 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
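// Worked example (illustrative, 32 bit lane): x = 0x00000028, CTZ(x) = 3.
//   x - 1        = 0x00000027
//   (x - 1) & ~x = 0x00000007  =>  CLZ = 29
//   CTZ          = 32 - 29 = 3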
6370 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6371                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6372                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6373   assert(is_integral_type(bt), "");
6374   // xtmp = -1
6375   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6376   // xtmp = xtmp + src
6377   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6378   // xtmp = xtmp & ~src
6379   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6380   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6381   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6382   vpsub(bt, dst, xtmp4, dst, vec_enc);
6383 }
6384 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation.
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
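// Worked example (illustrative, 32 bit lane): x = 0x00000028, CTZ(x) = 3.
//   -x     = 0xFFFFFFD8
//   x | -x = 0xFFFFFFF8  =>  POPC = 29
//   CTZ    = 32 - 29 = 3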
6387 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6388                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6389   assert(is_integral_type(bt), "");
6390   // xtmp = 0
6391   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6392   // xtmp = 0 - src
6393   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6394   // xtmp = xtmp | src
6395   vpor(xtmp3, xtmp3, src, vec_enc);
6396   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6397   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6398   vpsub(bt, dst, xtmp1, dst, vec_enc);
6399 }
6400 
6401 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6402   Label done;
6403   Label neg_divisor_fastpath;
6404   cmpl(divisor, 0);
6405   jccb(Assembler::less, neg_divisor_fastpath);
6406   xorl(rdx, rdx);
6407   divl(divisor);
6408   jmpb(done);
6409   bind(neg_divisor_fastpath);
6410   // Fastpath for divisor < 0:
6411   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6412   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
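  // Illustrative scalar sketch of the fastpath (not part of the generated code):
  // a divisor with its sign bit set is >= 2^31 as an unsigned value, so the
  // quotient can only be 0 or 1, and it is 1 exactly when dividend >=u divisor:
  //   uint32_t udiv_neg_divisor(uint32_t dividend, uint32_t divisor) {
  //     return (dividend & ~(dividend - divisor)) >> 31;
  //   }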
6413   movl(rdx, rax);
6414   subl(rdx, divisor);
6415   if (VM_Version::supports_bmi1()) {
6416     andnl(rax, rdx, rax);
6417   } else {
6418     notl(rdx);
6419     andl(rax, rdx);
6420   }
6421   shrl(rax, 31);
6422   bind(done);
6423 }
6424 
6425 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6426   Label done;
6427   Label neg_divisor_fastpath;
6428   cmpl(divisor, 0);
6429   jccb(Assembler::less, neg_divisor_fastpath);
6430   xorl(rdx, rdx);
6431   divl(divisor);
6432   jmpb(done);
6433   bind(neg_divisor_fastpath);
6434   // Fastpath when divisor < 0:
6435   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6436   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6437   movl(rdx, rax);
6438   subl(rax, divisor);
6439   if (VM_Version::supports_bmi1()) {
6440     andnl(rax, rax, rdx);
6441   } else {
6442     notl(rax);
6443     andl(rax, rdx);
6444   }
6445   sarl(rax, 31);
6446   andl(rax, divisor);
6447   subl(rdx, rax);
6448   bind(done);
6449 }
6450 
6451 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6452   Label done;
6453   Label neg_divisor_fastpath;
6454 
6455   cmpl(divisor, 0);
6456   jccb(Assembler::less, neg_divisor_fastpath);
6457   xorl(rdx, rdx);
6458   divl(divisor);
6459   jmpb(done);
6460   bind(neg_divisor_fastpath);
6461   // Fastpath for divisor < 0:
6462   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6463   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6464   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6465   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6466   movl(rdx, rax);
6467   subl(rax, divisor);
6468   if (VM_Version::supports_bmi1()) {
6469     andnl(rax, rax, rdx);
6470   } else {
6471     notl(rax);
6472     andl(rax, rdx);
6473   }
6474   movl(tmp, rax);
6475   shrl(rax, 31); // quotient
6476   sarl(tmp, 31);
6477   andl(tmp, divisor);
6478   subl(rdx, tmp); // remainder
6479   bind(done);
6480 }
6481 
6482 #ifdef _LP64
6483 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6484                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6486     // Galois field instruction based bit reversal based on following algorithm.
6487     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
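    // With this affine matrix constant, gf2p8affineqb reverses the bit order within
    // every byte of the source operand; the bswapl() at the end of this method then
    // reverses the byte order, completing the 32 bit bit reversal.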
6488     mov64(rtmp, 0x8040201008040201L);
6489     movq(xtmp1, src);
6490     movq(xtmp2, rtmp);
6491     gf2p8affineqb(xtmp1, xtmp2, 0);
6492     movq(dst, xtmp1);
6493   } else {
6494     // Swap even and odd numbered bits.
6495     movl(rtmp, src);
6496     andl(rtmp, 0x55555555);
6497     shll(rtmp, 1);
6498     movl(dst, src);
6499     andl(dst, 0xAAAAAAAA);
6500     shrl(dst, 1);
6501     orl(dst, rtmp);
6502 
6503     // Swap LSB and MSB 2 bits of each nibble.
6504     movl(rtmp, dst);
6505     andl(rtmp, 0x33333333);
6506     shll(rtmp, 2);
6507     andl(dst, 0xCCCCCCCC);
6508     shrl(dst, 2);
6509     orl(dst, rtmp);
6510 
6511     // Swap LSB and MSB 4 bits of each byte.
6512     movl(rtmp, dst);
6513     andl(rtmp, 0x0F0F0F0F);
6514     shll(rtmp, 4);
6515     andl(dst, 0xF0F0F0F0);
6516     shrl(dst, 4);
6517     orl(dst, rtmp);
6518   }
6519   bswapl(dst);
6520 }
6521 
6522 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6523                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6525     // Galois field instruction based bit reversal based on following algorithm.
6526     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6527     mov64(rtmp1, 0x8040201008040201L);
6528     movq(xtmp1, src);
6529     movq(xtmp2, rtmp1);
6530     gf2p8affineqb(xtmp1, xtmp2, 0);
6531     movq(dst, xtmp1);
6532   } else {
6533     // Swap even and odd numbered bits.
6534     movq(rtmp1, src);
6535     mov64(rtmp2, 0x5555555555555555L);
6536     andq(rtmp1, rtmp2);
6537     shlq(rtmp1, 1);
6538     movq(dst, src);
6539     notq(rtmp2);
6540     andq(dst, rtmp2);
6541     shrq(dst, 1);
6542     orq(dst, rtmp1);
6543 
6544     // Swap LSB and MSB 2 bits of each nibble.
6545     movq(rtmp1, dst);
6546     mov64(rtmp2, 0x3333333333333333L);
6547     andq(rtmp1, rtmp2);
6548     shlq(rtmp1, 2);
6549     notq(rtmp2);
6550     andq(dst, rtmp2);
6551     shrq(dst, 2);
6552     orq(dst, rtmp1);
6553 
6554     // Swap LSB and MSB 4 bits of each byte.
6555     movq(rtmp1, dst);
6556     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6557     andq(rtmp1, rtmp2);
6558     shlq(rtmp1, 4);
6559     notq(rtmp2);
6560     andq(dst, rtmp2);
6561     shrq(dst, 4);
6562     orq(dst, rtmp1);
6563   }
6564   bswapq(dst);
6565 }
6566 
6567 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6568   Label done;
6569   Label neg_divisor_fastpath;
6570   cmpq(divisor, 0);
6571   jccb(Assembler::less, neg_divisor_fastpath);
6572   xorl(rdx, rdx);
6573   divq(divisor);
6574   jmpb(done);
6575   bind(neg_divisor_fastpath);
6576   // Fastpath for divisor < 0:
6577   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6578   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6579   movq(rdx, rax);
6580   subq(rdx, divisor);
6581   if (VM_Version::supports_bmi1()) {
6582     andnq(rax, rdx, rax);
6583   } else {
6584     notq(rdx);
6585     andq(rax, rdx);
6586   }
6587   shrq(rax, 63);
6588   bind(done);
6589 }
6590 
6591 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6592   Label done;
6593   Label neg_divisor_fastpath;
6594   cmpq(divisor, 0);
6595   jccb(Assembler::less, neg_divisor_fastpath);
6596   xorq(rdx, rdx);
6597   divq(divisor);
6598   jmp(done);
6599   bind(neg_divisor_fastpath);
6600   // Fastpath when divisor < 0:
6601   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6602   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6603   movq(rdx, rax);
6604   subq(rax, divisor);
6605   if (VM_Version::supports_bmi1()) {
6606     andnq(rax, rax, rdx);
6607   } else {
6608     notq(rax);
6609     andq(rax, rdx);
6610   }
6611   sarq(rax, 63);
6612   andq(rax, divisor);
6613   subq(rdx, rax);
6614   bind(done);
6615 }
6616 
6617 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6618   Label done;
6619   Label neg_divisor_fastpath;
6620   cmpq(divisor, 0);
6621   jccb(Assembler::less, neg_divisor_fastpath);
6622   xorq(rdx, rdx);
6623   divq(divisor);
6624   jmp(done);
6625   bind(neg_divisor_fastpath);
6626   // Fastpath for divisor < 0:
6627   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6628   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6629   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6630   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6631   movq(rdx, rax);
6632   subq(rax, divisor);
6633   if (VM_Version::supports_bmi1()) {
6634     andnq(rax, rax, rdx);
6635   } else {
6636     notq(rax);
6637     andq(rax, rdx);
6638   }
6639   movq(tmp, rax);
6640   shrq(rax, 63); // quotient
6641   sarq(tmp, 63);
6642   andq(tmp, divisor);
6643   subq(rdx, tmp); // remainder
6644   bind(done);
6645 }
6646 #endif
6647 
6648 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6649                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6650                                         int vlen_enc) {
6651   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle byte, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which
  // differ by a multiple of 16 select the same relative position within a
  // 128 bit lane, i.e. shuffle indices 16, 32 and 48 all select element 0
  // of their respective 128 bit lanes.
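  // Illustrative example: for shuffle index 37, 37 lies in [32, 48), so the third
  // 128 bit source lane is broadcast in that step, and the in-lane pshufb index
  // 37 & 0xF = 5 selects byte 5 of that lane, i.e. source byte 37.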
6658   movl(rtmp, 16);
6659   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6660 
  // Compute a mask for the shuffle vector by comparing indices against the expression INDEX < 16,
  // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices and move the shuffled lanes corresponding to a true
  // mask into the destination vector.
6665   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6666   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6667   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6668 
6669   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6670   // and broadcasting second 128 bit lane.
6671   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6672   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6673   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6674   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6675   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6676 
6677   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6678   // and broadcasting third 128 bit lane.
6679   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6680   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6681   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6682   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6683   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6684 
6685   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6687   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6688   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6689   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6690   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6691   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6692 }
6693 
6694 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6695                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6696   if (vlen_enc == AVX_128bit) {
6697     vpermilps(dst, src, shuffle, vlen_enc);
6698   } else if (bt == T_INT) {
6699     vpermd(dst, shuffle, src, vlen_enc);
6700   } else {
6701     assert(bt == T_FLOAT, "");
6702     vpermps(dst, shuffle, src, vlen_enc);
6703   }
6704 }
6705 
6706 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6707   switch(opcode) {
6708     case Op_AddHF: vaddsh(dst, src1, src2); break;
6709     case Op_SubHF: vsubsh(dst, src1, src2); break;
6710     case Op_MulHF: vmulsh(dst, src1, src2); break;
6711     case Op_DivHF: vdivsh(dst, src1, src2); break;
6712     case Op_MaxHF: vmaxsh(dst, src1, src2); break;
6713     case Op_MinHF: vminsh(dst, src1, src2); break;
6714     default: assert(false, "%s", NodeClassNames[opcode]); break;
6715   }
6716 }
6717 
6718 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6719   switch(elem_bt) {
6720     case T_BYTE:
6721       if (ideal_opc == Op_SaturatingAddV) {
6722         vpaddsb(dst, src1, src2, vlen_enc);
6723       } else {
6724         assert(ideal_opc == Op_SaturatingSubV, "");
6725         vpsubsb(dst, src1, src2, vlen_enc);
6726       }
6727       break;
6728     case T_SHORT:
6729       if (ideal_opc == Op_SaturatingAddV) {
6730         vpaddsw(dst, src1, src2, vlen_enc);
6731       } else {
6732         assert(ideal_opc == Op_SaturatingSubV, "");
6733         vpsubsw(dst, src1, src2, vlen_enc);
6734       }
6735       break;
6736     default:
6737       fatal("Unsupported type %s", type2name(elem_bt));
6738       break;
6739   }
6740 }
6741 
6742 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6743   switch(elem_bt) {
6744     case T_BYTE:
6745       if (ideal_opc == Op_SaturatingAddV) {
6746         vpaddusb(dst, src1, src2, vlen_enc);
6747       } else {
6748         assert(ideal_opc == Op_SaturatingSubV, "");
6749         vpsubusb(dst, src1, src2, vlen_enc);
6750       }
6751       break;
6752     case T_SHORT:
6753       if (ideal_opc == Op_SaturatingAddV) {
6754         vpaddusw(dst, src1, src2, vlen_enc);
6755       } else {
6756         assert(ideal_opc == Op_SaturatingSubV, "");
6757         vpsubusw(dst, src1, src2, vlen_enc);
6758       }
6759       break;
6760     default:
6761       fatal("Unsupported type %s", type2name(elem_bt));
6762       break;
6763   }
6764 }
6765 
6766 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6767                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater (as an unsigned value) than the first input.
6769   // overflow_mask = Inp1 <u Inp2
6770   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6771   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6772   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6773 }
6774 
6775 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6776                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate the unsigned comparison using a signed comparison:
  // Mask = Inp1 <u Inp2  <=>  (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
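  // e.g. for 32 bit lanes: a <u b  <=>  (a ^ 0x80000000) <s (b ^ 0x80000000), and
  // XOR-ing the sign bit is equivalent to adding MIN_VALUE with wrap-around.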
6779   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6780   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6781   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6782 
6783   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6784 
6785   // Res = INP1 - INP2 (non-commutative and non-associative)
6786   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6787   // Res = Mask ? Zero : Res
6788   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6789   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6790 }
6791 
6792 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6793                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6795   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6796   // Res = Signed Add INP1, INP2
6797   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6798   // T1 = SRC1 | SRC2
6799   vpor(xtmp1, src1, src2, vlen_enc);
6800   // Max_Unsigned = -1
6801   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6802   // Unsigned compare:  Mask = Res <u T1
6803   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6804   // res  = Mask ? Max_Unsigned : Res
6805   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6806 }
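// Lane-wise scalar sketch of the addition above (illustrative only, not emitted code):
//   sum    = src1[i] + src2[i]                      // wrapping unsigned add
//   dst[i] = (sum <u (src1[i] | src2[i])) ? ~0 : sum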
6807 
6808 //
6809 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the
6810 // saturating unsigned addition operation.
6811 //    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6812 //
6813 // We empirically determined its semantic equivalence to the following reduced expression
6814 //    overflow_mask = (a + b) <u (a | b)
6815 //
6816 // and also verified it through the Alive2 solver.
6817 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6818 //
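// A short justification and worked example (added note):
//   a + b == (a | b) + (a & b), so without wraparound the sum is always >= (a | b),
//   while a wrapped sum has dropped 2^w and therefore falls strictly below (a | b).
//   E.g. for w == 32, a == 0xFFFFFFF0 and b == 0x20: the wrapped sum is 0x10 and
//   (a | b) is 0xFFFFFFF0, so (a + b) <u (a | b) correctly flags the overflow.
//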
6819 
6820 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6821                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6822   // Res = Signed Add INP1, INP2
6823   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6824   // Compute T1 = INP1 | INP2
6825   vpor(xtmp3, src1, src2, vlen_enc);
6826   // xtmp2 = Minimum signed value (MIN_VALUE); xtmp1 is left holding -1 in every lane for the final blend.
6827   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6828   // Convert T1 to a biased signed value, T1 = T1 + MIN_VALUE
6829   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6830   // Convert Res to a biased signed value, Res<s> = Res + MIN_VALUE
6831   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6832   // Compute overflow detection mask = Res<s> <s T1
6833   if (elem_bt == T_INT) {
6834     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6835   } else {
6836     assert(elem_bt == T_LONG, "");
6837     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6838   }
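  // Res = overflow_mask ? Max_Unsigned : Res (xtmp1 still holds -1, i.e. Max_Unsigned, in every lane)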
6839   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6840 }
6841 
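// Emulates evpmovq2m (copy the sign bit of every long lane into a mask register) on targets
// without AVX512DQ: an arithmetic right shift by 63 broadcasts each lane's sign across the
// lane, and comparing the result against all-ones sets the mask bits of the negative lanes.
// xtmp2_hold_M1 signals that xtmp2 already holds -1 in every lane, so it need not be recomputed.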
6842 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6843                                       int vlen_enc, bool xtmp2_hold_M1) {
6844   if (VM_Version::supports_avx512dq()) {
6845     evpmovq2m(ktmp, src, vlen_enc);
6846   } else {
6847     assert(VM_Version::supports_evex(), "");
6848     if (!xtmp2_hold_M1) {
6849       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6850     }
6851     evpsraq(xtmp1, src, 63, vlen_enc);
6852     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6853   }
6854 }
6855 
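// 32-bit counterpart of the emulation above: shift right by 31 and compare against all-ones
// to derive the sign mask when AVX512DQ is unavailable.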
6856 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6857                                       int vlen_enc, bool xtmp2_hold_M1) {
6858   if (VM_Version::supports_avx512dq()) {
6859     evpmovd2m(ktmp, src, vlen_enc);
6860   } else {
6861     assert(VM_Version::supports_evex(), "");
6862     if (!xtmp2_hold_M1) {
6863       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6864     }
6865     vpsrad(xtmp1, src, 31, vlen_enc);
6866     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6867   }
6868 }
6869 
6870 
6871 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6872   if (elem_bt == T_LONG) {
6873     if (VM_Version::supports_evex()) {
6874       evpsraq(dst, src, 63, vlen_enc);
6875     } else {
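      // No 64-bit arithmetic shift without EVEX: shift every dword right by 31, then replicate
      // the upper (sign-holding) dword of each qword lane; imm8 0xF5 selects dwords {1,1,3,3}.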
6876       vpsrad(dst, src, 31, vlen_enc);
6877       vpshufd(dst, dst, 0xF5, vlen_enc);
6878     }
6879   } else {
6880     assert(elem_bt == T_INT, "");
6881     vpsrad(dst, src, 31, vlen_enc);
6882   }
6883 }
6884 
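// Materializes the maximum signed value (0x7FFF...F) in every int/long lane by logically
// shifting an all-ones vector right by one bit; 'allones' is (re)computed when compute_allones is set.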
6885 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6886   if (compute_allones) {
6887     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6888       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6889     } else {
6890       vpcmpeqq(allones, allones, allones, vlen_enc);
6891     }
6892   }
6893   if (elem_bt == T_LONG) {
6894     vpsrlq(dst, allones, 1, vlen_enc);
6895   } else {
6896     assert(elem_bt == T_INT, "");
6897     vpsrld(dst, allones, 1, vlen_enc);
6898   }
6899 }
6900 
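// Materializes the minimum signed value (0x8000...0) in every int/long lane by shifting an
// all-ones vector left by 31/63 bits; 'allones' is (re)computed when compute_allones is set.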
6901 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6902   if (compute_allones) {
6903     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6904       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6905     } else {
6906       vpcmpeqq(allones, allones, allones, vlen_enc);
6907     }
6908   }
6909   if (elem_bt == T_LONG) {
6910     vpsllq(dst, allones, 63, vlen_enc);
6911   } else {
6912     assert(elem_bt == T_INT, "");
6913     vpslld(dst, allones, 31, vlen_enc);
6914   }
6915 }
6916 
6917 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6918                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6919   switch(elem_bt) {
6920     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6921     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6922     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6923     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6924     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6925   }
6926 }
6927 
6928 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6929   switch(elem_bt) {
6930     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6931     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6932     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6933     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6934     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6935   }
6936 }
6937 
6938 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6939                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6940   if (elem_bt == T_LONG) {
6941     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6942   } else {
6943     assert(elem_bt == T_INT, "");
6944     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6945   }
6946 }
6947 
6948 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6949                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6950                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6951   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6952   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6953   // Overflow detection is based on Hacker's Delight, section 2-13.
6954   if (ideal_opc == Op_SaturatingAddV) {
6955     // res = src1 + src2
6956     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6957     // Overflow occurs if the result's polarity differs from that of two same-polarity inputs.
6958     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6959     vpxor(xtmp1, dst, src1, vlen_enc);
6960     vpxor(xtmp2, dst, src2, vlen_enc);
6961     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6962   } else {
6963     assert(ideal_opc == Op_SaturatingSubV, "");
6964     // res = src1 - src2
6965     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6966     // Overflow occurs when the inputs have opposite polarities and the
6967     // result's polarity differs from the first input's polarity.
6968     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6969     vpxor(xtmp1, src1, src2, vlen_enc);
6970     vpxor(xtmp2, dst, src1, vlen_enc);
6971     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6972   }
6973 
6974   // Compute overflow detection mask.
6975   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6976   // Note: xtmp1 holds -1 in all its lanes after the above call.
6977 
6978   // Compute mask based on first input polarity.
6979   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6980 
6981   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6982   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6983 
6984   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6985   // set bits in the first input polarity mask hold the MIN value.
6986   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6987   // Blend destination lanes with saturated values using overflow detection mask.
6988   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6989 }
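// Lane-wise scalar sketch of the signed saturating logic above (illustrative only, not emitted code):
//   res      = a + b   (or a - b)
//   overflow = ((res ^ a) & (res ^ b)) < 0     // addition
//   overflow = ((a ^ b) & (res ^ a)) < 0       // subtraction
//   dst[i]   = overflow ? ((a < 0) ? MIN_VALUE : MAX_VALUE) : res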
6990 
6991 
6992 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6993                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6994                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6995   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6996   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6997   // Overflow detection is based on Hacker's Delight, section 2-13.
6998   if (ideal_opc == Op_SaturatingAddV) {
6999     // res = src1 + src2
7000     vpadd(elem_bt, dst, src1, src2, vlen_enc);
7001     // Overflow occurs if the result's polarity differs from that of two same-polarity inputs.
7002     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
7003     vpxor(xtmp1, dst, src1, vlen_enc);
7004     vpxor(xtmp2, dst, src2, vlen_enc);
7005     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7006   } else {
7007     assert(ideal_opc == Op_SaturatingSubV, "");
7008     // res = src1 - src2
7009     vpsub(elem_bt, dst, src1, src2, vlen_enc);
7010     // Overflow occurs when the inputs have opposite polarities and the
7011     // result's polarity differs from the first input's polarity.
7012     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
7013     vpxor(xtmp1, src1, src2, vlen_enc);
7014     vpxor(xtmp2, dst, src1, vlen_enc);
7015     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7016   }
7017 
7018   // Sign-extend to compute overflow detection mask.
7019   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
7020 
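  // xtmp1 = -1 in every lane; it is consumed as the all-ones input of the MAX/MIN generators below.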
7021   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
7022   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
7023   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7024 
7025   // Compose saturating min/max vector using first input polarity mask.
7026   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7027   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7028 
7029   // Blend result with saturating vector using overflow detection mask.
7030   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7031 }
7032 
7033 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7034   switch(elem_bt) {
7035     case T_BYTE:
7036       if (ideal_opc == Op_SaturatingAddV) {
7037         vpaddsb(dst, src1, src2, vlen_enc);
7038       } else {
7039         assert(ideal_opc == Op_SaturatingSubV, "");
7040         vpsubsb(dst, src1, src2, vlen_enc);
7041       }
7042       break;
7043     case T_SHORT:
7044       if (ideal_opc == Op_SaturatingAddV) {
7045         vpaddsw(dst, src1, src2, vlen_enc);
7046       } else {
7047         assert(ideal_opc == Op_SaturatingSubV, "");
7048         vpsubsw(dst, src1, src2, vlen_enc);
7049       }
7050       break;
7051     default:
7052       fatal("Unsupported type %s", type2name(elem_bt));
7053       break;
7054   }
7055 }
7056 
7057 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7058   switch(elem_bt) {
7059     case T_BYTE:
7060       if (ideal_opc == Op_SaturatingAddV) {
7061         vpaddusb(dst, src1, src2, vlen_enc);
7062       } else {
7063         assert(ideal_opc == Op_SaturatingSubV, "");
7064         vpsubusb(dst, src1, src2, vlen_enc);
7065       }
7066       break;
7067     case T_SHORT:
7068       if (ideal_opc == Op_SaturatingAddV) {
7069         vpaddusw(dst, src1, src2, vlen_enc);
7070       } else {
7071         assert(ideal_opc == Op_SaturatingSubV, "");
7072         vpsubusw(dst, src1, src2, vlen_enc);
7073       }
7074       break;
7075     default:
7076       fatal("Unsupported type %s", type2name(elem_bt));
7077       break;
7078   }
7079 }
7080 
7081 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7082                                                      XMMRegister src2, int vlen_enc) {
7083   switch(elem_bt) {
7084     case T_BYTE:
7085       evpermi2b(dst, src1, src2, vlen_enc);
7086       break;
7087     case T_SHORT:
7088       evpermi2w(dst, src1, src2, vlen_enc);
7089       break;
7090     case T_INT:
7091       evpermi2d(dst, src1, src2, vlen_enc);
7092       break;
7093     case T_LONG:
7094       evpermi2q(dst, src1, src2, vlen_enc);
7095       break;
7096     case T_FLOAT:
7097       evpermi2ps(dst, src1, src2, vlen_enc);
7098       break;
7099     case T_DOUBLE:
7100       evpermi2pd(dst, src1, src2, vlen_enc);
7101       break;
7102     default:
7103       fatal("Unsupported type %s", type2name(elem_bt));
7104       break;
7105   }
7106 }
7107 
7108 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7109   if (is_unsigned) {
7110     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7111   } else {
7112     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7113   }
7114 }
7115 
7116 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7117   if (is_unsigned) {
7118     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7119   } else {
7120     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7121   }
7122 }