/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barrier or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.
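//
// Illustrative only (not code emitted here): the cmpFastLock/cmpFastUnlock
// users in the .ad file effectively expand to
//    fast_lock(obj, box, rax, scr, ...);   // sets ZF per the protocol above
//    jne   slow_case;                      // ZF == 0 -> call the runtime helper
// which is why every exit path below is careful to leave ZF in the agreed state.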


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                 Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
    jcc(Assembler::equal, COUNT);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
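    // (Illustrative, assuming a 4K page: the LP64 mask is then 0xFFFFFFFFFFFFF007,
    //  so the AND result is zero only when the difference is non-negative, below
    //  one page, and 8-byte aligned; that zero stored in the BasicLock is what
    //  fast_unlock later recognizes as the recursive case.)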
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value.

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
#else
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
#ifdef _LP64
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
#endif
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);
#else
  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);
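  // (Without the fence, the loads of entry_list/succ below could be satisfied
  //  before other threads observe the cleared owner; a thread that enqueued
  //  itself in that window could be missed and left stranded.)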

  // Check if the entry_list is empty.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);

  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);
#endif  // _LP64

  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);

  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
#ifdef _LP64
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif
  }

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

#ifndef _LP64
    // Just take slow path to avoid dealing with 64 bit atomic instructions here.
    orl(box, 1);  // set ICC.ZF=0 to indicate failure
    jmpb(slow_path);
#else
    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
#endif  // _LP64
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifndef _LP64
    // Just take slow path to avoid dealing with 64 bit atomic instructions here.
    orl(t, 1);  // set ICC.ZF=0 to indicate failure
    jmpb(slow_path);
#else
    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
#endif  // _LP64
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
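    // Worked example of the bias above (illustrative, 64-bit lanes): for
    // src1 = 0xFFFFFFFFFFFFFFFF (unsigned max) and src2 = 0x0000000000000001,
    // adding T1 = 0x8000000000000000 yields 0x7FFFFFFFFFFFFFFF (signed max)
    // and 0x8000000000000001 (near signed min), so the signed compare below
    // orders the biased values exactly as an unsigned compare would order
    // the originals.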
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
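  // For example (Java semantics): Math.max(-0.0f, +0.0f) must be +0.0f and
  // Math.max(1.0f, Float.NaN) must be NaN, which is why the blend/compare
  // fix-ups around vminps/vmaxps below are needed.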

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

1269 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1270   if (opcode == Op_RShiftVI) {
1271     psrad(dst, shift);
1272   } else if (opcode == Op_LShiftVI) {
1273     pslld(dst, shift);
1274   } else {
1275     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1276     psrld(dst, shift);
1277   }
1278 }
1279 
1280 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1281   switch (opcode) {
1282     case Op_RShiftVI:  psrad(dst, shift); break;
1283     case Op_LShiftVI:  pslld(dst, shift); break;
1284     case Op_URShiftVI: psrld(dst, shift); break;
1285 
1286     default: assert(false, "%s", NodeClassNames[opcode]);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1291   if (opcode == Op_RShiftVI) {
1292     vpsrad(dst, nds, shift, vector_len);
1293   } else if (opcode == Op_LShiftVI) {
1294     vpslld(dst, nds, shift, vector_len);
1295   } else {
1296     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1297     vpsrld(dst, nds, shift, vector_len);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1304     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1305     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1306 
1307     default: assert(false, "%s", NodeClassNames[opcode]);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1312   switch (opcode) {
1313     case Op_RShiftVB:  // fall-through
1314     case Op_RShiftVS:  psraw(dst, shift); break;
1315 
1316     case Op_LShiftVB:  // fall-through
1317     case Op_LShiftVS:  psllw(dst, shift);   break;
1318 
1319     case Op_URShiftVS: // fall-through
1320     case Op_URShiftVB: psrlw(dst, shift);  break;
1321 
1322     default: assert(false, "%s", NodeClassNames[opcode]);
1323   }
1324 }
1325 
1326 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1327   switch (opcode) {
1328     case Op_RShiftVB:  // fall-through
1329     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1330 
1331     case Op_LShiftVB:  // fall-through
1332     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1333 
1334     case Op_URShiftVS: // fall-through
1335     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1336 
1337     default: assert(false, "%s", NodeClassNames[opcode]);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1342   switch (opcode) {
1343     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1344     case Op_LShiftVL:  psllq(dst, shift); break;
1345     case Op_URShiftVL: psrlq(dst, shift); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1352   if (opcode == Op_RShiftVL) {
1353     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1354   } else if (opcode == Op_LShiftVL) {
1355     psllq(dst, shift);
1356   } else {
1357     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1358     psrlq(dst, shift);
1359   }
1360 }
1361 
1362 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1363   switch (opcode) {
1364     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1365     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1366     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1367 
1368     default: assert(false, "%s", NodeClassNames[opcode]);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1373   if (opcode == Op_RShiftVL) {
1374     evpsraq(dst, nds, shift, vector_len);
1375   } else if (opcode == Op_LShiftVL) {
1376     vpsllq(dst, nds, shift, vector_len);
1377   } else {
1378     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1379     vpsrlq(dst, nds, shift, vector_len);
1380   }
1381 }
1382 
1383 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1384   switch (opcode) {
1385     case Op_RShiftVB:  // fall-through
1386     case Op_RShiftVS:  // fall-through
1387     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1388 
1389     case Op_LShiftVB:  // fall-through
1390     case Op_LShiftVS:  // fall-through
1391     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1392 
1393     case Op_URShiftVB: // fall-through
1394     case Op_URShiftVS: // fall-through
1395     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1402   switch (opcode) {
1403     case Op_RShiftVB:  // fall-through
1404     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1405 
1406     case Op_LShiftVB:  // fall-through
1407     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1408 
1409     case Op_URShiftVB: // fall-through
1410     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1411 
1412     default: assert(false, "%s", NodeClassNames[opcode]);
1413   }
1414 }
1415 
1416 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1417   assert(UseAVX >= 2, "required");
1418   switch (opcode) {
1419     case Op_RShiftVL: {
1420       if (UseAVX > 2) {
1421         assert(tmp == xnoreg, "not used");
1422         if (!VM_Version::supports_avx512vl()) {
1423           vlen_enc = Assembler::AVX_512bit;
1424         }
1425         evpsravq(dst, src, shift, vlen_enc);
1426       } else {
1427         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1428         vpsrlvq(dst, src, shift, vlen_enc);
1429         vpsrlvq(tmp, tmp, shift, vlen_enc);
1430         vpxor(dst, dst, tmp, vlen_enc);
1431         vpsubq(dst, dst, tmp, vlen_enc);
1432       }
1433       break;
1434     }
1435     case Op_LShiftVL: {
1436       assert(tmp == xnoreg, "not used");
1437       vpsllvq(dst, src, shift, vlen_enc);
1438       break;
1439     }
1440     case Op_URShiftVL: {
1441       assert(tmp == xnoreg, "not used");
1442       vpsrlvq(dst, src, shift, vlen_enc);
1443       break;
1444     }
1445     default: assert(false, "%s", NodeClassNames[opcode]);
1446   }
1447 }
1448 
1449 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1450 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1451   assert(opcode == Op_LShiftVB ||
1452          opcode == Op_RShiftVB ||
1453          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1454   bool sign = (opcode != Op_URShiftVB);
1455   assert(vector_len == 0, "required");
1456   vextendbd(sign, dst, src, 1);
1457   vpmovzxbd(vtmp, shift, 1);
1458   varshiftd(opcode, dst, dst, vtmp, 1);
1459   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1460   vextracti128_high(vtmp, dst);
1461   vpackusdw(dst, dst, vtmp, 0);
1462 }
1463 
1464 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1465 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1466   assert(opcode == Op_LShiftVB ||
1467          opcode == Op_RShiftVB ||
1468          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1469   bool sign = (opcode != Op_URShiftVB);
1470   int ext_vector_len = vector_len + 1;
1471   vextendbw(sign, dst, src, ext_vector_len);
1472   vpmovzxbw(vtmp, shift, ext_vector_len);
1473   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1474   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1475   if (vector_len == 0) {
1476     vextracti128_high(vtmp, dst);
1477     vpackuswb(dst, dst, vtmp, vector_len);
1478   } else {
1479     vextracti64x4_high(vtmp, dst);
1480     vpackuswb(dst, dst, vtmp, vector_len);
1481     vpermq(dst, dst, 0xD8, vector_len);
1482   }
1483 }
1484 
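     // Inserts the GPR value 'val' into element 'idx' of dst, dispatching on the element
     // type to pinsrb/pinsrw/pinsrd/pinsrq.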
1485 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1486   switch(typ) {
1487     case T_BYTE:
1488       pinsrb(dst, val, idx);
1489       break;
1490     case T_SHORT:
1491       pinsrw(dst, val, idx);
1492       break;
1493     case T_INT:
1494       pinsrd(dst, val, idx);
1495       break;
1496     case T_LONG:
1497       pinsrq(dst, val, idx);
1498       break;
1499     default:
1500       assert(false,"Should not reach here.");
1501       break;
1502   }
1503 }
1504 
1505 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1506   switch(typ) {
1507     case T_BYTE:
1508       vpinsrb(dst, src, val, idx);
1509       break;
1510     case T_SHORT:
1511       vpinsrw(dst, src, val, idx);
1512       break;
1513     case T_INT:
1514       vpinsrd(dst, src, val, idx);
1515       break;
1516     case T_LONG:
1517       vpinsrq(dst, src, val, idx);
1518       break;
1519     default:
1520       assert(false,"Should not reach here.");
1521       break;
1522   }
1523 }
1524 
1525 #ifdef _LP64
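     // Masked scalar gather of one 64-bit slice: for each of the 8 byte (or 4 short) lanes,
     // load src[offset + idx_base[i]] into lane i of dst when bit 'mask_idx' of 'mask' is set,
     // leaving the lane zero otherwise; 'mask_idx' advances by one per lane.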
1526 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1527                                                 XMMRegister dst, Register base,
1528                                                 Register idx_base,
1529                                                 Register offset, Register mask,
1530                                                 Register mask_idx, Register rtmp,
1531                                                 int vlen_enc) {
1532   vpxor(dst, dst, dst, vlen_enc);
1533   if (elem_bt == T_SHORT) {
1534     for (int i = 0; i < 4; i++) {
1535       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1536       Label skip_load;
1537       btq(mask, mask_idx);
1538       jccb(Assembler::carryClear, skip_load);
1539       movl(rtmp, Address(idx_base, i * 4));
1540       if (offset != noreg) {
1541         addl(rtmp, offset);
1542       }
1543       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1544       bind(skip_load);
1545       incq(mask_idx);
1546     }
1547   } else {
1548     assert(elem_bt == T_BYTE, "");
1549     for (int i = 0; i < 8; i++) {
1550       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1551       Label skip_load;
1552       btq(mask, mask_idx);
1553       jccb(Assembler::carryClear, skip_load);
1554       movl(rtmp, Address(idx_base, i * 4));
1555       if (offset != noreg) {
1556         addl(rtmp, offset);
1557       }
1558       pinsrb(dst, Address(base, rtmp), i);
1559       bind(skip_load);
1560       incq(mask_idx);
1561     }
1562   }
1563 }
1564 #endif // _LP64
1565 
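     // Unmasked scalar gather of one 64-bit slice: accumulates 8 byte (or 4 short) elements
     // addressed by idx_base (plus an optional offset) into the low 64 bits of dst.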
1566 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1567                                          Register base, Register idx_base,
1568                                          Register offset, Register rtmp,
1569                                          int vlen_enc) {
1570   vpxor(dst, dst, dst, vlen_enc);
1571   if (elem_bt == T_SHORT) {
1572     for (int i = 0; i < 4; i++) {
1573       // dst[i] = src[offset + idx_base[i]]
1574       movl(rtmp, Address(idx_base, i * 4));
1575       if (offset != noreg) {
1576         addl(rtmp, offset);
1577       }
1578       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1579     }
1580   } else {
1581     assert(elem_bt == T_BYTE, "");
1582     for (int i = 0; i < 8; i++) {
1583       // dst[i] = src[offset + idx_base[i]]
1584       movl(rtmp, Address(idx_base, i * 4));
1585       if (offset != noreg) {
1586         addl(rtmp, offset);
1587       }
1588       pinsrb(dst, Address(base, rtmp), i);
1589     }
1590   }
1591 }
1592 
1593 /*
1594  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1595  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1596  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1597  * permutation to place the slice into the appropriate vector lane
1598  * locations in the destination vector. The following pseudo code describes the
1599  * algorithm in detail:
1600  *
1601  * DST_VEC = ZERO_VEC
1602  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1603  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1604  * FOREACH_ITER:
1605  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1606  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1607  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1608  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1609  *
1610  * With each iteration, the doubleword permute indices (0,1) corresponding
1611  * to the gathered quadword are right shifted by two lane positions.
1612  *
1613  */
1614 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1615                                         Register base, Register idx_base,
1616                                         Register offset, Register mask,
1617                                         XMMRegister xtmp1, XMMRegister xtmp2,
1618                                         XMMRegister temp_dst, Register rtmp,
1619                                         Register mask_idx, Register length,
1620                                         int vector_len, int vlen_enc) {
1621   Label GATHER8_LOOP;
1622   assert(is_subword_type(elem_ty), "");
1623   movl(length, vector_len);
1624   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1625   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1626   vallones(xtmp2, vlen_enc);
1627   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1628   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1629   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1630 
1631   bind(GATHER8_LOOP);
1632     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1633     if (mask == noreg) {
1634       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1635     } else {
1636       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1637     }
1638     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1639     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1640     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1641     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1642     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1643     vpor(dst, dst, temp_dst, vlen_enc);
1644     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1645     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1646     jcc(Assembler::notEqual, GATHER8_LOOP);
1647 }
1648 
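     // AVX2 gather: loads each element of dst from base + idx[i] (scaled by the element size)
     // under control of the XMM 'mask', dispatching on the element type.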
1649 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1650   switch(typ) {
1651     case T_INT:
1652       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1653       break;
1654     case T_FLOAT:
1655       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1656       break;
1657     case T_LONG:
1658       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1659       break;
1660     case T_DOUBLE:
1661       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1662       break;
1663     default:
1664       assert(false,"Should not reach here.");
1665       break;
1666   }
1667 }
1668 
1669 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1670   switch(typ) {
1671     case T_INT:
1672       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1673       break;
1674     case T_FLOAT:
1675       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1676       break;
1677     case T_LONG:
1678       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1679       break;
1680     case T_DOUBLE:
1681       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1682       break;
1683     default:
1684       assert(false,"Should not reach here.");
1685       break;
1686   }
1687 }
1688 
1689 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1690   switch(typ) {
1691     case T_INT:
1692       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1693       break;
1694     case T_FLOAT:
1695       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1696       break;
1697     case T_LONG:
1698       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1699       break;
1700     case T_DOUBLE:
1701       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1702       break;
1703     default:
1704       assert(false,"Should not reach here.");
1705       break;
1706   }
1707 }
1708 
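     // Expands a vector of booleans (one byte per element, 0 or 1) into an element-wide mask:
     // each byte is negated to 0 or -1 and then sign-extended to the width of elem_bt.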
1709 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1710   if (vlen_in_bytes <= 16) {
1711     pxor (dst, dst);
1712     psubb(dst, src);
1713     switch (elem_bt) {
1714       case T_BYTE:   /* nothing to do */ break;
1715       case T_SHORT:  pmovsxbw(dst, dst); break;
1716       case T_INT:    pmovsxbd(dst, dst); break;
1717       case T_FLOAT:  pmovsxbd(dst, dst); break;
1718       case T_LONG:   pmovsxbq(dst, dst); break;
1719       case T_DOUBLE: pmovsxbq(dst, dst); break;
1720 
1721       default: assert(false, "%s", type2name(elem_bt));
1722     }
1723   } else {
1724     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1725     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1726 
1727     vpxor (dst, dst, dst, vlen_enc);
1728     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1729 
1730     switch (elem_bt) {
1731       case T_BYTE:   /* nothing to do */            break;
1732       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1733       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1734       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1735       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1736       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1737 
1738       default: assert(false, "%s", type2name(elem_bt));
1739     }
1740   }
1741 }
1742 
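     // As above, but producing an opmask register: either negate the boolean bytes and use
     // evpmovb2m, or, when the caller signals missing AVX512VL/BW/DQ support (novlbwdq),
     // widen to dwords and compare against the mask-bit constant.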
1743 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1744   if (novlbwdq) {
1745     vpmovsxbd(xtmp, src, vlen_enc);
1746     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1747             Assembler::eq, true, vlen_enc, noreg);
1748   } else {
1749     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1750     vpsubb(xtmp, xtmp, src, vlen_enc);
1751     evpmovb2m(dst, xtmp, vlen_enc);
1752   }
1753 }
1754 
1755 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1756   if (is_integral_type(bt)) {
1757     switch (vlen_in_bytes) {
1758       case 4:  movdl(dst, src);   break;
1759       case 8:  movq(dst, src);    break;
1760       case 16: movdqu(dst, src);  break;
1761       case 32: vmovdqu(dst, src); break;
1762       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1763       default: ShouldNotReachHere();
1764     }
1765   } else {
1766     switch (vlen_in_bytes) {
1767       case 4:  movflt(dst, src); break;
1768       case 8:  movdbl(dst, src); break;
1769       case 16: movups(dst, src); break;
1770       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1771       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1772       default: ShouldNotReachHere();
1773     }
1774   }
1775 }
1776 
1777 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1778   assert(rscratch != noreg || always_reachable(src), "missing");
1779 
1780   if (reachable(src)) {
1781     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1782   } else {
1783     lea(rscratch, src);
1784     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1785   }
1786 }
1787 
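     // Materializes a constant vector from the constant table entry at 'src': with AVX the
     // element is broadcast across dst; otherwise a movddup (SSE3) or plain vector load is used.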
1788 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1789   int vlen_enc = vector_length_encoding(vlen);
1790   if (VM_Version::supports_avx()) {
1791     if (bt == T_LONG) {
1792       if (VM_Version::supports_avx2()) {
1793         vpbroadcastq(dst, src, vlen_enc);
1794       } else {
1795         vmovddup(dst, src, vlen_enc);
1796       }
1797     } else if (bt == T_DOUBLE) {
1798       if (vlen_enc != Assembler::AVX_128bit) {
1799         vbroadcastsd(dst, src, vlen_enc, noreg);
1800       } else {
1801         vmovddup(dst, src, vlen_enc);
1802       }
1803     } else {
1804       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1805         vpbroadcastd(dst, src, vlen_enc);
1806       } else {
1807         vbroadcastss(dst, src, vlen_enc);
1808       }
1809     }
1810   } else if (VM_Version::supports_sse3()) {
1811     movddup(dst, src);
1812   } else {
1813     load_vector(bt, dst, src, vlen);
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1818   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
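       // For example, T_INT: exact_log2(4) << 6 = 128; T_FLOAT: 128 + 128 = 256.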
1819   int offset = exact_log2(type2aelembytes(bt)) << 6;
1820   if (is_floating_point_type(bt)) {
1821     offset += 128;
1822   }
1823   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1824   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1825 }
1826 
1827 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1828 
1829 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1830   int vector_len = Assembler::AVX_128bit;
1831 
1832   switch (opcode) {
1833     case Op_AndReductionV:  pand(dst, src); break;
1834     case Op_OrReductionV:   por (dst, src); break;
1835     case Op_XorReductionV:  pxor(dst, src); break;
1836     case Op_MinReductionV:
1837       switch (typ) {
1838         case T_BYTE:        pminsb(dst, src); break;
1839         case T_SHORT:       pminsw(dst, src); break;
1840         case T_INT:         pminsd(dst, src); break;
1841         case T_LONG:        assert(UseAVX > 2, "required");
1842                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1843         default:            assert(false, "wrong type");
1844       }
1845       break;
1846     case Op_MaxReductionV:
1847       switch (typ) {
1848         case T_BYTE:        pmaxsb(dst, src); break;
1849         case T_SHORT:       pmaxsw(dst, src); break;
1850         case T_INT:         pmaxsd(dst, src); break;
1851         case T_LONG:        assert(UseAVX > 2, "required");
1852                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1853         default:            assert(false, "wrong type");
1854       }
1855       break;
1856     case Op_AddReductionVF: addss(dst, src); break;
1857     case Op_AddReductionVD: addsd(dst, src); break;
1858     case Op_AddReductionVI:
1859       switch (typ) {
1860         case T_BYTE:        paddb(dst, src); break;
1861         case T_SHORT:       paddw(dst, src); break;
1862         case T_INT:         paddd(dst, src); break;
1863         default:            assert(false, "wrong type");
1864       }
1865       break;
1866     case Op_AddReductionVL: paddq(dst, src); break;
1867     case Op_MulReductionVF: mulss(dst, src); break;
1868     case Op_MulReductionVD: mulsd(dst, src); break;
1869     case Op_MulReductionVI:
1870       switch (typ) {
1871         case T_SHORT:       pmullw(dst, src); break;
1872         case T_INT:         pmulld(dst, src); break;
1873         default:            assert(false, "wrong type");
1874       }
1875       break;
1876     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1877                             evpmullq(dst, dst, src, vector_len); break;
1878     default:                assert(false, "wrong opcode");
1879   }
1880 }
1881 
1882 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1883   switch (opcode) {
1884     case Op_AddReductionVF: addps(dst, src); break;
1885     case Op_AddReductionVD: addpd(dst, src); break;
1886     case Op_MulReductionVF: mulps(dst, src); break;
1887     case Op_MulReductionVD: mulpd(dst, src); break;
1888     default:                assert(false, "%s", NodeClassNames[opcode]);
1889   }
1890 }
1891 
1892 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1893   int vector_len = Assembler::AVX_256bit;
1894 
1895   switch (opcode) {
1896     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1897     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1898     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1899     case Op_MinReductionV:
1900       switch (typ) {
1901         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1902         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1903         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1904         case T_LONG:        assert(UseAVX > 2, "required");
1905                             vpminsq(dst, src1, src2, vector_len); break;
1906         default:            assert(false, "wrong type");
1907       }
1908       break;
1909     case Op_MaxReductionV:
1910       switch (typ) {
1911         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1912         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1913         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1914         case T_LONG:        assert(UseAVX > 2, "required");
1915                             vpmaxsq(dst, src1, src2, vector_len); break;
1916         default:            assert(false, "wrong type");
1917       }
1918       break;
1919     case Op_AddReductionVI:
1920       switch (typ) {
1921         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1922         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1923         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1924         default:            assert(false, "wrong type");
1925       }
1926       break;
1927     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1928     case Op_MulReductionVI:
1929       switch (typ) {
1930         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1931         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1932         default:            assert(false, "wrong type");
1933       }
1934       break;
1935     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1936     default:                assert(false, "wrong opcode");
1937   }
1938 }
1939 
1940 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1941   int vector_len = Assembler::AVX_256bit;
1942 
1943   switch (opcode) {
1944     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1945     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1946     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1947     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1948     default:                assert(false, "%s", NodeClassNames[opcode]);
1949   }
1950 }
1951 
1952 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1953                                   XMMRegister dst, XMMRegister src,
1954                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1955   switch (opcode) {
1956     case Op_AddReductionVF:
1957     case Op_MulReductionVF:
1958       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1959       break;
1960 
1961     case Op_AddReductionVD:
1962     case Op_MulReductionVD:
1963       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1964       break;
1965 
1966     default: assert(false, "wrong opcode");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1971                                             XMMRegister dst, XMMRegister src,
1972                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1973   switch (opcode) {
1974     case Op_AddReductionVF:
1975     case Op_MulReductionVF:
1976       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1977       break;
1978 
1979     case Op_AddReductionVD:
1980     case Op_MulReductionVD:
1981       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1982       break;
1983 
1984     default: assert(false, "%s", NodeClassNames[opcode]);
1985   }
1986 }
1987 
1988 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1989                              Register dst, Register src1, XMMRegister src2,
1990                              XMMRegister vtmp1, XMMRegister vtmp2) {
1991   switch (vlen) {
1992     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1993     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1994     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1995     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1996 
1997     default: assert(false, "wrong vector length");
1998   }
1999 }
2000 
2001 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2002                              Register dst, Register src1, XMMRegister src2,
2003                              XMMRegister vtmp1, XMMRegister vtmp2) {
2004   switch (vlen) {
2005     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2006     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2007     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2008     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2009 
2010     default: assert(false, "wrong vector length");
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2015                              Register dst, Register src1, XMMRegister src2,
2016                              XMMRegister vtmp1, XMMRegister vtmp2) {
2017   switch (vlen) {
2018     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2019     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2020     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2021     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2022 
2023     default: assert(false, "wrong vector length");
2024   }
2025 }
2026 
2027 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2028                              Register dst, Register src1, XMMRegister src2,
2029                              XMMRegister vtmp1, XMMRegister vtmp2) {
2030   switch (vlen) {
2031     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2032     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2033     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2034     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2035 
2036     default: assert(false, "wrong vector length");
2037   }
2038 }
2039 
2040 #ifdef _LP64
2041 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2042                              Register dst, Register src1, XMMRegister src2,
2043                              XMMRegister vtmp1, XMMRegister vtmp2) {
2044   switch (vlen) {
2045     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2046     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2047     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2048 
2049     default: assert(false, "wrong vector length");
2050   }
2051 }
2052 #endif // _LP64
2053 
2054 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2055   switch (vlen) {
2056     case 2:
2057       assert(vtmp2 == xnoreg, "");
2058       reduce2F(opcode, dst, src, vtmp1);
2059       break;
2060     case 4:
2061       assert(vtmp2 == xnoreg, "");
2062       reduce4F(opcode, dst, src, vtmp1);
2063       break;
2064     case 8:
2065       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2066       break;
2067     case 16:
2068       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2069       break;
2070     default: assert(false, "wrong vector length");
2071   }
2072 }
2073 
2074 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2075   switch (vlen) {
2076     case 2:
2077       assert(vtmp2 == xnoreg, "");
2078       reduce2D(opcode, dst, src, vtmp1);
2079       break;
2080     case 4:
2081       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2082       break;
2083     case 8:
2084       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2085       break;
2086     default: assert(false, "wrong vector length");
2087   }
2088 }
2089 
2090 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   switch (vlen) {
2092     case 2:
2093       assert(vtmp1 == xnoreg, "");
2094       assert(vtmp2 == xnoreg, "");
2095       unorderedReduce2F(opcode, dst, src);
2096       break;
2097     case 4:
2098       assert(vtmp2 == xnoreg, "");
2099       unorderedReduce4F(opcode, dst, src, vtmp1);
2100       break;
2101     case 8:
2102       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2103       break;
2104     case 16:
2105       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2106       break;
2107     default: assert(false, "wrong vector length");
2108   }
2109 }
2110 
2111 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2112   switch (vlen) {
2113     case 2:
2114       assert(vtmp1 == xnoreg, "");
2115       assert(vtmp2 == xnoreg, "");
2116       unorderedReduce2D(opcode, dst, src);
2117       break;
2118     case 4:
2119       assert(vtmp2 == xnoreg, "");
2120       unorderedReduce4D(opcode, dst, src, vtmp1);
2121       break;
2122     case 8:
2123       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2124       break;
2125     default: assert(false, "wrong vector length");
2126   }
2127 }
2128 
2129 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130   if (opcode == Op_AddReductionVI) {
2131     if (vtmp1 != src2) {
2132       movdqu(vtmp1, src2);
2133     }
2134     phaddd(vtmp1, vtmp1);
2135   } else {
2136     pshufd(vtmp1, src2, 0x1);
2137     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2138   }
2139   movdl(vtmp2, src1);
2140   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2141   movdl(dst, vtmp1);
2142 }
2143 
2144 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2145   if (opcode == Op_AddReductionVI) {
2146     if (vtmp1 != src2) {
2147       movdqu(vtmp1, src2);
2148     }
2149     phaddd(vtmp1, src2);
2150     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151   } else {
2152     pshufd(vtmp2, src2, 0xE);
2153     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2154     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2155   }
2156 }
2157 
2158 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2159   if (opcode == Op_AddReductionVI) {
2160     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2161     vextracti128_high(vtmp2, vtmp1);
2162     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2163     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2164   } else {
2165     vextracti128_high(vtmp1, src2);
2166     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2167     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2168   }
2169 }
2170 
2171 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172   vextracti64x4_high(vtmp2, src2);
2173   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2174   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2175 }
2176 
2177 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2178   pshufd(vtmp2, src2, 0x1);
2179   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2180   movdqu(vtmp1, vtmp2);
2181   psrldq(vtmp1, 2);
2182   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2183   movdqu(vtmp2, vtmp1);
2184   psrldq(vtmp2, 1);
2185   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2186   movdl(vtmp2, src1);
2187   pmovsxbd(vtmp1, vtmp1);
2188   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2189   pextrb(dst, vtmp1, 0x0);
2190   movsbl(dst, dst);
2191 }
2192 
2193 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2194   pshufd(vtmp1, src2, 0xE);
2195   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2196   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2197 }
2198 
2199 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2200   vextracti128_high(vtmp2, src2);
2201   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2202   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2203 }
2204 
2205 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2206   vextracti64x4_high(vtmp1, src2);
2207   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2208   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2209 }
2210 
2211 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2212   pmovsxbw(vtmp2, src2);
2213   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2214 }
2215 
2216 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2217   if (UseAVX > 1) {
2218     int vector_len = Assembler::AVX_256bit;
2219     vpmovsxbw(vtmp1, src2, vector_len);
2220     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2221   } else {
2222     pmovsxbw(vtmp2, src2);
2223     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2224     pshufd(vtmp2, src2, 0x1);
2225     pmovsxbw(vtmp2, src2);
2226     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2227   }
2228 }
2229 
2230 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2231   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2232     int vector_len = Assembler::AVX_512bit;
2233     vpmovsxbw(vtmp1, src2, vector_len);
2234     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2235   } else {
2236     assert(UseAVX >= 2,"Should not reach here.");
2237     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2238     vextracti128_high(vtmp2, src2);
2239     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2240   }
2241 }
2242 
2243 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2244   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2245   vextracti64x4_high(vtmp2, src2);
2246   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2247 }
2248 
2249 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2250   if (opcode == Op_AddReductionVI) {
2251     if (vtmp1 != src2) {
2252       movdqu(vtmp1, src2);
2253     }
2254     phaddw(vtmp1, vtmp1);
2255     phaddw(vtmp1, vtmp1);
2256   } else {
2257     pshufd(vtmp2, src2, 0x1);
2258     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2259     movdqu(vtmp1, vtmp2);
2260     psrldq(vtmp1, 2);
2261     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2262   }
2263   movdl(vtmp2, src1);
2264   pmovsxwd(vtmp1, vtmp1);
2265   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2266   pextrw(dst, vtmp1, 0x0);
2267   movswl(dst, dst);
2268 }
2269 
2270 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2271   if (opcode == Op_AddReductionVI) {
2272     if (vtmp1 != src2) {
2273       movdqu(vtmp1, src2);
2274     }
2275     phaddw(vtmp1, src2);
2276   } else {
2277     pshufd(vtmp1, src2, 0xE);
2278     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2279   }
2280   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2281 }
2282 
2283 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2284   if (opcode == Op_AddReductionVI) {
2285     int vector_len = Assembler::AVX_256bit;
2286     vphaddw(vtmp2, src2, src2, vector_len);
2287     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2288   } else {
2289     vextracti128_high(vtmp2, src2);
2290     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2291   }
2292   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2293 }
2294 
2295 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2296   int vector_len = Assembler::AVX_256bit;
2297   vextracti64x4_high(vtmp1, src2);
2298   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2299   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2300 }
2301 
2302 #ifdef _LP64
2303 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2304   pshufd(vtmp2, src2, 0xE);
2305   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2306   movdq(vtmp1, src1);
2307   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2308   movdq(dst, vtmp1);
2309 }
2310 
2311 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2312   vextracti128_high(vtmp1, src2);
2313   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2314   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2315 }
2316 
2317 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2318   vextracti64x4_high(vtmp2, src2);
2319   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2320   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2321 }
2322 
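     // Builds an opmask with the low 'len' bits set: start from all ones, clear the bits at
     // and above 'len' with BZHI, and move the result into the mask register.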
2323 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2324   mov64(temp, -1L);
2325   bzhiq(temp, temp, len);
2326   kmovql(dst, temp);
2327 }
2328 #endif // _LP64
2329 
2330 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2331   reduce_operation_128(T_FLOAT, opcode, dst, src);
2332   pshufd(vtmp, src, 0x1);
2333   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2334 }
2335 
2336 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2337   reduce2F(opcode, dst, src, vtmp);
2338   pshufd(vtmp, src, 0x2);
2339   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2340   pshufd(vtmp, src, 0x3);
2341   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2342 }
2343 
2344 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2345   reduce4F(opcode, dst, src, vtmp2);
2346   vextractf128_high(vtmp2, src);
2347   reduce4F(opcode, dst, vtmp2, vtmp1);
2348 }
2349 
2350 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2351   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2352   vextracti64x4_high(vtmp1, src);
2353   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2354 }
2355 
2356 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2357   pshufd(dst, src, 0x1);
2358   reduce_operation_128(T_FLOAT, opcode, dst, src);
2359 }
2360 
2361 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2362   pshufd(vtmp, src, 0xE);
2363   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2364   unorderedReduce2F(opcode, dst, vtmp);
2365 }
2366 
2367 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2368   vextractf128_high(vtmp1, src);
2369   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2370   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2371 }
2372 
2373 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2374   vextractf64x4_high(vtmp2, src);
2375   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2376   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2377 }
2378 
2379 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2380   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2381   pshufd(vtmp, src, 0xE);
2382   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2383 }
2384 
2385 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2386   reduce2D(opcode, dst, src, vtmp2);
2387   vextractf128_high(vtmp2, src);
2388   reduce2D(opcode, dst, vtmp2, vtmp1);
2389 }
2390 
2391 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2392   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2393   vextracti64x4_high(vtmp1, src);
2394   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2395 }
2396 
2397 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2398   pshufd(dst, src, 0xE);
2399   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2400 }
2401 
2402 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2403   vextractf128_high(vtmp, src);
2404   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2405   unorderedReduce2D(opcode, dst, vtmp);
2406 }
2407 
2408 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2409   vextractf64x4_high(vtmp2, src);
2410   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2411   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2412 }
2413 
2414 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2415   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2416 }
2417 
2418 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2419   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2420 }
2421 
2422 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2423   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2424 }
2425 
2426 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2427                                  int vec_enc) {
2428   switch(elem_bt) {
2429     case T_INT:
2430     case T_FLOAT:
2431       vmaskmovps(dst, src, mask, vec_enc);
2432       break;
2433     case T_LONG:
2434     case T_DOUBLE:
2435       vmaskmovpd(dst, src, mask, vec_enc);
2436       break;
2437     default:
2438       fatal("Unsupported type %s", type2name(elem_bt));
2439       break;
2440   }
2441 }
2442 
2443 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2444                                  int vec_enc) {
2445   switch(elem_bt) {
2446     case T_INT:
2447     case T_FLOAT:
2448       vmaskmovps(dst, src, mask, vec_enc);
2449       break;
2450     case T_LONG:
2451     case T_DOUBLE:
2452       vmaskmovpd(dst, src, mask, vec_enc);
2453       break;
2454     default:
2455       fatal("Unsupported type %s", type2name(elem_bt));
2456       break;
2457   }
2458 }
2459 
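     // Min/max reduction over a float vector: repeatedly folds the upper half (via lane
     // extraction or an in-lane permute) into the lower half with vminmax_fp until a single
     // element remains; when is_dst_valid, the incoming value in dst is folded in last.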
2460 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2461                                           XMMRegister dst, XMMRegister src,
2462                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2463                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2464   const int permconst[] = {1, 14};
2465   XMMRegister wsrc = src;
2466   XMMRegister wdst = xmm_0;
2467   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2468 
2469   int vlen_enc = Assembler::AVX_128bit;
2470   if (vlen == 16) {
2471     vlen_enc = Assembler::AVX_256bit;
2472   }
2473 
2474   for (int i = log2(vlen) - 1; i >=0; i--) {
2475     if (i == 0 && !is_dst_valid) {
2476       wdst = dst;
2477     }
2478     if (i == 3) {
2479       vextracti64x4_high(wtmp, wsrc);
2480     } else if (i == 2) {
2481       vextracti128_high(wtmp, wsrc);
2482     } else { // i = [0,1]
2483       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2484     }
2485     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2486     wsrc = wdst;
2487     vlen_enc = Assembler::AVX_128bit;
2488   }
2489   if (is_dst_valid) {
2490     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2491   }
2492 }
2493 
2494 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2495                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2496                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2497   XMMRegister wsrc = src;
2498   XMMRegister wdst = xmm_0;
2499   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2500   int vlen_enc = Assembler::AVX_128bit;
2501   if (vlen == 8) {
2502     vlen_enc = Assembler::AVX_256bit;
2503   }
2504   for (int i = log2(vlen) - 1; i >=0; i--) {
2505     if (i == 0 && !is_dst_valid) {
2506       wdst = dst;
2507     }
2508     if (i == 1) {
2509       vextracti128_high(wtmp, wsrc);
2510     } else if (i == 2) {
2511       vextracti64x4_high(wtmp, wsrc);
2512     } else {
2513       assert(i == 0, "%d", i);
2514       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2515     }
2516     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2517     wsrc = wdst;
2518     vlen_enc = Assembler::AVX_128bit;
2519   }
2520   if (is_dst_valid) {
2521     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2522   }
2523 }
2524 
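     // Extracts integral element 'idx' from src into the GPR dst via pextrb/pextrw/pextrd/pextrq.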
2525 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2526   switch (bt) {
2527     case T_BYTE:  pextrb(dst, src, idx); break;
2528     case T_SHORT: pextrw(dst, src, idx); break;
2529     case T_INT:   pextrd(dst, src, idx); break;
2530     case T_LONG:  pextrq(dst, src, idx); break;
2531 
2532     default:
2533       assert(false,"Should not reach here.");
2534       break;
2535   }
2536 }
2537 
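     // Returns the XMM register holding the 128-bit lane containing 'elemindex': the source
     // itself for lane 0, otherwise the lane is extracted into dst first.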
2538 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2539   int esize =  type2aelembytes(typ);
2540   int elem_per_lane = 16/esize;
2541   int lane = elemindex / elem_per_lane;
2542   int eindex = elemindex % elem_per_lane;
2543 
2544   if (lane >= 2) {
2545     assert(UseAVX > 2, "required");
2546     vextractf32x4(dst, src, lane & 3);
2547     return dst;
2548   } else if (lane > 0) {
2549     assert(UseAVX > 0, "required");
2550     vextractf128(dst, src, lane);
2551     return dst;
2552   } else {
2553     return src;
2554   }
2555 }
2556 
2557 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2558   if (typ == T_BYTE) {
2559     movsbl(dst, dst);
2560   } else if (typ == T_SHORT) {
2561     movswl(dst, dst);
2562   }
2563 }
2564 
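     // Loads integral element 'elemindex' (taken modulo the 128-bit lane size) from src into
     // the GPR dst, sign-extending sub-int types; element 0 is read with a plain move.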
2565 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2566   int esize =  type2aelembytes(typ);
2567   int elem_per_lane = 16/esize;
2568   int eindex = elemindex % elem_per_lane;
2569   assert(is_integral_type(typ),"required");
2570 
2571   if (eindex == 0) {
2572     if (typ == T_LONG) {
2573       movq(dst, src);
2574     } else {
2575       movdl(dst, src);
2576       movsxl(typ, dst);
2577     }
2578   } else {
2579     extract(typ, dst, src, eindex);
2580     movsxl(typ, dst);
2581   }
2582 }
2583 
2584 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2585   int esize =  type2aelembytes(typ);
2586   int elem_per_lane = 16/esize;
2587   int eindex = elemindex % elem_per_lane;
2588   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2589 
2590   if (eindex == 0) {
2591     movq(dst, src);
2592   } else {
2593     if (typ == T_FLOAT) {
2594       if (UseAVX == 0) {
2595         movdqu(dst, src);
2596         shufps(dst, dst, eindex);
2597       } else {
2598         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2599       }
2600     } else {
2601       if (UseAVX == 0) {
2602         movdqu(dst, src);
2603         psrldq(dst, eindex*esize);
2604       } else {
2605         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2606       }
2607       movq(dst, dst);
2608     }
2609   }
2610   // Zero upper bits
2611   if (typ == T_FLOAT) {
2612     if (UseAVX == 0) {
2613       assert(vtmp != xnoreg, "required.");
2614       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2615       pand(dst, vtmp);
2616     } else {
2617       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2618     }
2619   }
2620 }
2621 
2622 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2623   switch(typ) {
2624     case T_BYTE:
2625     case T_BOOLEAN:
2626       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2627       break;
2628     case T_SHORT:
2629     case T_CHAR:
2630       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2631       break;
2632     case T_INT:
2633     case T_FLOAT:
2634       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2635       break;
2636     case T_LONG:
2637     case T_DOUBLE:
2638       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2639       break;
2640     default:
2641       assert(false,"Should not reach here.");
2642       break;
2643   }
2644 }
2645 
2646 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2647   assert(rscratch != noreg || always_reachable(src2), "missing");
2648 
2649   switch(typ) {
2650     case T_BOOLEAN:
2651     case T_BYTE:
2652       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2653       break;
2654     case T_CHAR:
2655     case T_SHORT:
2656       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2657       break;
2658     case T_INT:
2659     case T_FLOAT:
2660       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2661       break;
2662     case T_LONG:
2663     case T_DOUBLE:
2664       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2665       break;
2666     default:
2667       assert(false,"Should not reach here.");
2668       break;
2669   }
2670 }
2671 
2672 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2673   switch(typ) {
2674     case T_BYTE:
2675       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2676       break;
2677     case T_SHORT:
2678       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2679       break;
2680     case T_INT:
2681     case T_FLOAT:
2682       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2683       break;
2684     case T_LONG:
2685     case T_DOUBLE:
2686       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2687       break;
2688     default:
2689       assert(false,"Should not reach here.");
2690       break;
2691   }
2692 }
2693 
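     // Sets the flags by testing src1 against src2 over 'vlen_in_bytes' bytes with
     // ptest/vptest (vtestps for 4- and 8-byte elements); vectors shorter than 128 bits
     // first duplicate their valid lanes into vtmp so the unused upper bytes are ignored.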
2694 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2695   assert(vlen_in_bytes <= 32, "");
2696   int esize = type2aelembytes(bt);
2697   if (vlen_in_bytes == 32) {
2698     assert(vtmp == xnoreg, "required.");
2699     if (esize >= 4) {
2700       vtestps(src1, src2, AVX_256bit);
2701     } else {
2702       vptest(src1, src2, AVX_256bit);
2703     }
2704     return;
2705   }
2706   if (vlen_in_bytes < 16) {
2707     // Duplicate the lower part to fill the whole register;
2708     // there is no need to do so for src2.
2709     assert(vtmp != xnoreg, "required");
2710     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2711     pshufd(vtmp, src1, shuffle_imm);
2712   } else {
2713     assert(vtmp == xnoreg, "required");
2714     vtmp = src1;
2715   }
2716   if (esize >= 4 && VM_Version::supports_avx()) {
2717     vtestps(vtmp, src2, AVX_128bit);
2718   } else {
2719     ptest(vtmp, src2);
2720   }
2721 }
2722 
2723 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2724 #ifdef ASSERT
2725   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2726   bool is_bw_supported = VM_Version::supports_avx512bw();
2727   if (is_bw && !is_bw_supported) {
2728     assert(vlen_enc != Assembler::AVX_512bit, "required");
2729     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2730            "XMM register should be 0-15");
2731   }
2732 #endif // ASSERT
2733   switch (elem_bt) {
2734     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2735     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2736     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2737     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2738     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2739     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2740     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2741   }
2742 }
2743 
2744 #ifdef _LP64
2745 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2746   assert(UseAVX >= 2, "required");
2747   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2748   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2749   if ((UseAVX > 2) &&
2750       (!is_bw || VM_Version::supports_avx512bw()) &&
2751       (!is_vl || VM_Version::supports_avx512vl())) {
2752     switch (elem_bt) {
2753       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2754       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2755       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2756       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2757       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2758     }
2759   } else {
2760     assert(vlen_enc != Assembler::AVX_512bit, "required");
2761     assert((dst->encoding() < 16),"XMM register should be 0-15");
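         // (A note on the assert above: the broadcasts on this fallback path use VEX
         //  encodings, which can only address xmm0-15; xmm16-31 would require the
         //  EVEX (AVX-512) encodings that are unavailable here.)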
2762     switch (elem_bt) {
2763       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2764       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2765       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2766       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2767       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2768       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2769       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2770     }
2771   }
2772 }
2773 #endif
2774 
2775 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2776   switch (to_elem_bt) {
2777     case T_SHORT:
2778       vpmovsxbw(dst, src, vlen_enc);
2779       break;
2780     case T_INT:
2781       vpmovsxbd(dst, src, vlen_enc);
2782       break;
2783     case T_FLOAT:
2784       vpmovsxbd(dst, src, vlen_enc);
2785       vcvtdq2ps(dst, dst, vlen_enc);
2786       break;
2787     case T_LONG:
2788       vpmovsxbq(dst, src, vlen_enc);
2789       break;
2790     case T_DOUBLE: {
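           // vcvtdq2pd doubles the element width, so the sign-extended int source
           // only needs half of the destination vector length.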
2791       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2792       vpmovsxbd(dst, src, mid_vlen_enc);
2793       vcvtdq2pd(dst, dst, vlen_enc);
2794       break;
2795     }
2796     default:
2797       fatal("Unsupported type %s", type2name(to_elem_bt));
2798       break;
2799   }
2800 }
2801 
2802 //-------------------------------------------------------------------------------------------
2803 
2804 // IndexOf for constant substrings with size >= 8 chars
2805 // which don't need to be loaded through the stack.
2806 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2807                                          Register cnt1, Register cnt2,
2808                                          int int_cnt2,  Register result,
2809                                          XMMRegister vec, Register tmp,
2810                                          int ae) {
2811   ShortBranchVerifier sbv(this);
2812   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2813   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2814 
2815   // This method uses the pcmpestri instruction with bound registers
2816   //   inputs:
2817   //     xmm - substring
2818   //     rax - substring length (elements count)
2819   //     mem - scanned string
2820   //     rdx - string length (elements count)
2821   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2822   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2823   //   outputs:
2824   //     rcx - matched index in string
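       // (For reference, per the SSE4.2 definition the pcmpestri imm8 decomposes as:
       //  bits [1:0] element format (00 = unsigned bytes, 01 = unsigned words),
       //  bits [3:2] aggregation    (11 = equal ordered, i.e. substring search),
       //  bits [5:4] polarity, bit 6 = returned index selection;
       //  hence the 0x0c / 0x0d modes above.)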
2825   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2826   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2827   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2828   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2829   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2830 
2831   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2832         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2833         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2834 
2835   // Note, inline_string_indexOf() generates checks:
2836   // if (substr.count > string.count) return -1;
2837   // if (substr.count == 0) return 0;
2838   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2839 
2840   // Load substring.
2841   if (ae == StrIntrinsicNode::UL) {
2842     pmovzxbw(vec, Address(str2, 0));
2843   } else {
2844     movdqu(vec, Address(str2, 0));
2845   }
2846   movl(cnt2, int_cnt2);
2847   movptr(result, str1); // string addr
2848 
2849   if (int_cnt2 > stride) {
2850     jmpb(SCAN_TO_SUBSTR);
2851 
2852     // Reload substr for rescan; this code
2853     // is executed only for large substrings (> 8 chars).
2854     bind(RELOAD_SUBSTR);
2855     if (ae == StrIntrinsicNode::UL) {
2856       pmovzxbw(vec, Address(str2, 0));
2857     } else {
2858       movdqu(vec, Address(str2, 0));
2859     }
2860     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2861 
2862     bind(RELOAD_STR);
2863     // We came here after the beginning of the substring was
2864     // matched but the rest of it was not, so we need to search
2865     // again. Start from the next element after the previous match.
2866 
2867     // cnt2 is the number of remaining substring elements and
2868     // cnt1 is the number of remaining string elements when the compare failed.
2869     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2870     subl(cnt1, cnt2);
2871     addl(cnt1, int_cnt2);
2872     movl(cnt2, int_cnt2); // Now restore cnt2
2873 
2874     decrementl(cnt1);     // Shift to next element
2875     cmpl(cnt1, cnt2);
2876     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2877 
2878     addptr(result, (1<<scale1));
2879 
2880   } // (int_cnt2 > 8)
2881 
2882   // Scan string for start of substr in 16-byte vectors
2883   bind(SCAN_TO_SUBSTR);
2884   pcmpestri(vec, Address(result, 0), mode);
2885   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2886   subl(cnt1, stride);
2887   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2888   cmpl(cnt1, cnt2);
2889   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2890   addptr(result, 16);
2891   jmpb(SCAN_TO_SUBSTR);
2892 
2893   // Found a potential substr
2894   bind(FOUND_CANDIDATE);
2895   // Matched whole vector if first element matched (tmp(rcx) == 0).
2896   if (int_cnt2 == stride) {
2897     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2898   } else { // int_cnt2 > 8
2899     jccb(Assembler::overflow, FOUND_SUBSTR);
2900   }
2901   // After pcmpestri tmp(rcx) contains matched element index
2902   // Compute start addr of substr
2903   lea(result, Address(result, tmp, scale1));
2904 
2905   // Make sure string is still long enough
2906   subl(cnt1, tmp);
2907   cmpl(cnt1, cnt2);
2908   if (int_cnt2 == stride) {
2909     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2910   } else { // int_cnt2 > 8
2911     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2912   }
2913   // Left less than substring.
2914 
2915   bind(RET_NOT_FOUND);
2916   movl(result, -1);
2917   jmp(EXIT);
2918 
2919   if (int_cnt2 > stride) {
2920     // This code is optimized for the case when whole substring
2921     // is matched if its head is matched.
2922     bind(MATCH_SUBSTR_HEAD);
2923     pcmpestri(vec, Address(result, 0), mode);
2924     // Reload only the string if it does not match
2925     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2926 
2927     Label CONT_SCAN_SUBSTR;
2928     // Compare the rest of substring (> 8 chars).
2929     bind(FOUND_SUBSTR);
2930     // First 8 chars are already matched.
2931     negptr(cnt2);
2932     addptr(cnt2, stride);
2933 
2934     bind(SCAN_SUBSTR);
2935     subl(cnt1, stride);
2936     cmpl(cnt2, -stride); // Do not read beyond substring
2937     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2938     // Back-up strings to avoid reading beyond substring:
2939     // cnt1 = cnt1 - cnt2 + 8
2940     addl(cnt1, cnt2); // cnt2 is negative
2941     addl(cnt1, stride);
2942     movl(cnt2, stride); negptr(cnt2);
2943     bind(CONT_SCAN_SUBSTR);
2944     if (int_cnt2 < (int)G) {
2945       int tail_off1 = int_cnt2<<scale1;
2946       int tail_off2 = int_cnt2<<scale2;
2947       if (ae == StrIntrinsicNode::UL) {
2948         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2949       } else {
2950         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2951       }
2952       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2953     } else {
2954       // calculate index in register to avoid integer overflow (int_cnt2*2)
2955       movl(tmp, int_cnt2);
2956       addptr(tmp, cnt2);
2957       if (ae == StrIntrinsicNode::UL) {
2958         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2959       } else {
2960         movdqu(vec, Address(str2, tmp, scale2, 0));
2961       }
2962       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2963     }
2964     // Need to reload the string pointers if the whole vector did not match
2965     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2966     addptr(cnt2, stride);
2967     jcc(Assembler::negative, SCAN_SUBSTR);
2968     // Fall through if found full substring
2969 
2970   } // (int_cnt2 > 8)
2971 
2972   bind(RET_FOUND);
2973   // Found result if we matched full small substring.
2974   // Compute substr offset
2975   subptr(result, str1);
2976   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2977     shrl(result, 1); // index
2978   }
2979   bind(EXIT);
2980 
2981 } // string_indexofC8
2982 
2983 // Small strings are loaded through the stack if they cross a page boundary.
2984 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2985                                        Register cnt1, Register cnt2,
2986                                        int int_cnt2,  Register result,
2987                                        XMMRegister vec, Register tmp,
2988                                        int ae) {
2989   ShortBranchVerifier sbv(this);
2990   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2991   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2992 
2993   //
2994   // int_cnt2 is length of small (< 8 chars) constant substring
2995   // or (-1) for non constant substring in which case its length
2996   // is in cnt2 register.
2997   //
2998   // Note, inline_string_indexOf() generates checks:
2999   // if (substr.count > string.count) return -1;
3000   // if (substr.count == 0) return 0;
3001   //
3002   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3003   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3004   // This method uses the pcmpestri instruction with bound registers
3005   //   inputs:
3006   //     xmm - substring
3007   //     rax - substring length (elements count)
3008   //     mem - scanned string
3009   //     rdx - string length (elements count)
3010   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3011   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3012   //   outputs:
3013   //     rcx - matched index in string
3014   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3015   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3016   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3017   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3018 
3019   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3020         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3021         FOUND_CANDIDATE;
3022 
3023   { //========================================================
3024     // We don't know where these strings are located
3025     // and we can't read beyond them. Load them through the stack.
3026     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3027 
3028     movptr(tmp, rsp); // save old SP
3029 
3030     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3031       if (int_cnt2 == (1>>scale2)) { // One byte
3032         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3033         load_unsigned_byte(result, Address(str2, 0));
3034         movdl(vec, result); // move 32 bits
3035       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3036         // Not enough header space in 32-bit VM: 12+3 = 15.
3037         movl(result, Address(str2, -1));
3038         shrl(result, 8);
3039         movdl(vec, result); // move 32 bits
3040       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3041         load_unsigned_short(result, Address(str2, 0));
3042         movdl(vec, result); // move 32 bits
3043       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3044         movdl(vec, Address(str2, 0)); // move 32 bits
3045       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3046         movq(vec, Address(str2, 0));  // move 64 bits
3047       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3048         // Array header size is 12 bytes in 32-bit VM
3049         // + 6 bytes for 3 chars == 18 bytes,
3050         // enough space to load vec and shift.
3051         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3052         if (ae == StrIntrinsicNode::UL) {
3053           int tail_off = int_cnt2-8;
3054           pmovzxbw(vec, Address(str2, tail_off));
3055           psrldq(vec, -2*tail_off);
3056         }
3057         else {
3058           int tail_off = int_cnt2*(1<<scale2);
3059           movdqu(vec, Address(str2, tail_off-16));
3060           psrldq(vec, 16-tail_off);
3061         }
3062       }
3063     } else { // not constant substring
3064       cmpl(cnt2, stride);
3065       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3066 
3067       // We can read beyond the string if str+16 does not cross a page boundary,
3068       // since heaps are aligned and mapped by pages.
3069       assert(os::vm_page_size() < (int)G, "default page should be small");
3070       movl(result, str2); // We need only low 32 bits
3071       andl(result, ((int)os::vm_page_size()-1));
3072       cmpl(result, ((int)os::vm_page_size()-16));
3073       jccb(Assembler::belowEqual, CHECK_STR);
3074 
3075       // Move small strings to the stack to allow loading 16 bytes into vec.
3076       subptr(rsp, 16);
3077       int stk_offset = wordSize-(1<<scale2);
3078       push(cnt2);
3079 
3080       bind(COPY_SUBSTR);
3081       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3082         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3083         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3084       } else if (ae == StrIntrinsicNode::UU) {
3085         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3086         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3087       }
3088       decrement(cnt2);
3089       jccb(Assembler::notZero, COPY_SUBSTR);
3090 
3091       pop(cnt2);
3092       movptr(str2, rsp);  // New substring address
3093     } // non constant
3094 
3095     bind(CHECK_STR);
3096     cmpl(cnt1, stride);
3097     jccb(Assembler::aboveEqual, BIG_STRINGS);
3098 
3099     // Check for a page-boundary crossing.
3100     movl(result, str1); // We need only low 32 bits
3101     andl(result, ((int)os::vm_page_size()-1));
3102     cmpl(result, ((int)os::vm_page_size()-16));
3103     jccb(Assembler::belowEqual, BIG_STRINGS);
3104 
3105     subptr(rsp, 16);
3106     int stk_offset = -(1<<scale1);
3107     if (int_cnt2 < 0) { // not constant
3108       push(cnt2);
3109       stk_offset += wordSize;
3110     }
3111     movl(cnt2, cnt1);
3112 
3113     bind(COPY_STR);
3114     if (ae == StrIntrinsicNode::LL) {
3115       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3116       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3117     } else {
3118       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3119       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3120     }
3121     decrement(cnt2);
3122     jccb(Assembler::notZero, COPY_STR);
3123 
3124     if (int_cnt2 < 0) { // not constant
3125       pop(cnt2);
3126     }
3127     movptr(str1, rsp);  // New string address
3128 
3129     bind(BIG_STRINGS);
3130     // Load substring.
3131     if (int_cnt2 < 0) { // -1
3132       if (ae == StrIntrinsicNode::UL) {
3133         pmovzxbw(vec, Address(str2, 0));
3134       } else {
3135         movdqu(vec, Address(str2, 0));
3136       }
3137       push(cnt2);       // substr count
3138       push(str2);       // substr addr
3139       push(str1);       // string addr
3140     } else {
3141       // Small (< 8 chars) constant substrings are loaded already.
3142       movl(cnt2, int_cnt2);
3143     }
3144     push(tmp);  // original SP
3145 
3146   } // Finished loading
3147 
3148   //========================================================
3149   // Start search
3150   //
3151 
3152   movptr(result, str1); // string addr
3153 
3154   if (int_cnt2  < 0) {  // Only for non constant substring
3155     jmpb(SCAN_TO_SUBSTR);
3156 
3157     // SP saved at sp+0
3158     // String saved at sp+1*wordSize
3159     // Substr saved at sp+2*wordSize
3160     // Substr count saved at sp+3*wordSize
3161 
3162     // Reload substr for rescan; this code
3163     // is executed only for large substrings (> 8 chars).
3164     bind(RELOAD_SUBSTR);
3165     movptr(str2, Address(rsp, 2*wordSize));
3166     movl(cnt2, Address(rsp, 3*wordSize));
3167     if (ae == StrIntrinsicNode::UL) {
3168       pmovzxbw(vec, Address(str2, 0));
3169     } else {
3170       movdqu(vec, Address(str2, 0));
3171     }
3172     // We came here after the beginning of the substring was
3173     // matched but the rest of it was not, so we need to search
3174     // again. Start from the next element after the previous match.
3175     subptr(str1, result); // Restore counter
3176     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3177       shrl(str1, 1);
3178     }
3179     addl(cnt1, str1);
3180     decrementl(cnt1);   // Shift to next element
3181     cmpl(cnt1, cnt2);
3182     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3183 
3184     addptr(result, (1<<scale1));
3185   } // non constant
3186 
3187   // Scan string for start of substr in 16-byte vectors
3188   bind(SCAN_TO_SUBSTR);
3189   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3190   pcmpestri(vec, Address(result, 0), mode);
3191   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3192   subl(cnt1, stride);
3193   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3194   cmpl(cnt1, cnt2);
3195   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3196   addptr(result, 16);
3197 
3198   bind(ADJUST_STR);
3199   cmpl(cnt1, stride); // Do not read beyond string
3200   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3201   // Back-up string to avoid reading beyond string.
3202   lea(result, Address(result, cnt1, scale1, -16));
3203   movl(cnt1, stride);
3204   jmpb(SCAN_TO_SUBSTR);
3205 
3206   // Found a potential substr
3207   bind(FOUND_CANDIDATE);
3208   // After pcmpestri tmp(rcx) contains matched element index
3209 
3210   // Make sure string is still long enough
3211   subl(cnt1, tmp);
3212   cmpl(cnt1, cnt2);
3213   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3214   // Left less than substring.
3215 
3216   bind(RET_NOT_FOUND);
3217   movl(result, -1);
3218   jmp(CLEANUP);
3219 
3220   bind(FOUND_SUBSTR);
3221   // Compute start addr of substr
3222   lea(result, Address(result, tmp, scale1));
3223   if (int_cnt2 > 0) { // Constant substring
3224     // Repeat search for small substring (< 8 chars)
3225     // from new point without reloading substring.
3226     // Have to check that we don't read beyond string.
3227     cmpl(tmp, stride-int_cnt2);
3228     jccb(Assembler::greater, ADJUST_STR);
3229     // Fall through if matched whole substring.
3230   } else { // non constant
3231     assert(int_cnt2 == -1, "should be != 0");
3232 
3233     addl(tmp, cnt2);
3234     // Found result if we matched whole substring.
3235     cmpl(tmp, stride);
3236     jcc(Assembler::lessEqual, RET_FOUND);
3237 
3238     // Repeat search for small substring (<= 8 chars)
3239     // from new point 'str1' without reloading substring.
3240     cmpl(cnt2, stride);
3241     // Have to check that we don't read beyond string.
3242     jccb(Assembler::lessEqual, ADJUST_STR);
3243 
3244     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3245     // Compare the rest of substring (> 8 chars).
3246     movptr(str1, result);
3247 
3248     cmpl(tmp, cnt2);
3249     // First 8 chars are already matched.
3250     jccb(Assembler::equal, CHECK_NEXT);
3251 
3252     bind(SCAN_SUBSTR);
3253     pcmpestri(vec, Address(str1, 0), mode);
3254     // Need to reload the string pointers if the whole vector did not match
3255     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3256 
3257     bind(CHECK_NEXT);
3258     subl(cnt2, stride);
3259     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3260     addptr(str1, 16);
3261     if (ae == StrIntrinsicNode::UL) {
3262       addptr(str2, 8);
3263     } else {
3264       addptr(str2, 16);
3265     }
3266     subl(cnt1, stride);
3267     cmpl(cnt2, stride); // Do not read beyond substring
3268     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3269     // Back-up strings to avoid reading beyond substring.
3270 
3271     if (ae == StrIntrinsicNode::UL) {
3272       lea(str2, Address(str2, cnt2, scale2, -8));
3273       lea(str1, Address(str1, cnt2, scale1, -16));
3274     } else {
3275       lea(str2, Address(str2, cnt2, scale2, -16));
3276       lea(str1, Address(str1, cnt2, scale1, -16));
3277     }
3278     subl(cnt1, cnt2);
3279     movl(cnt2, stride);
3280     addl(cnt1, stride);
3281     bind(CONT_SCAN_SUBSTR);
3282     if (ae == StrIntrinsicNode::UL) {
3283       pmovzxbw(vec, Address(str2, 0));
3284     } else {
3285       movdqu(vec, Address(str2, 0));
3286     }
3287     jmp(SCAN_SUBSTR);
3288 
3289     bind(RET_FOUND_LONG);
3290     movptr(str1, Address(rsp, wordSize));
3291   } // non constant
3292 
3293   bind(RET_FOUND);
3294   // Compute substr offset
3295   subptr(result, str1);
3296   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3297     shrl(result, 1); // index
3298   }
3299   bind(CLEANUP);
3300   pop(rsp); // restore SP
3301 
3302 } // string_indexof
3303 
3304 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3305                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3306   ShortBranchVerifier sbv(this);
3307   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3308 
3309   int stride = 8;
3310 
3311   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3312         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3313         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3314         FOUND_SEQ_CHAR, DONE_LABEL;
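       // Overall strategy (a sketch): with AVX2, scan 16 chars (32 bytes) per iteration,
       // then drop to 8-char (16-byte) SSE vectors, and finish with a scalar tail loop.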
3315 
3316   movptr(result, str1);
3317   if (UseAVX >= 2) {
3318     cmpl(cnt1, stride);
3319     jcc(Assembler::less, SCAN_TO_CHAR);
3320     cmpl(cnt1, 2*stride);
3321     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3322     movdl(vec1, ch);
3323     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3324     vpxor(vec2, vec2);
3325     movl(tmp, cnt1);
3326     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3327     andl(cnt1,0x0000000F);  //tail count (in chars)
3328 
3329     bind(SCAN_TO_16_CHAR_LOOP);
3330     vmovdqu(vec3, Address(result, 0));
3331     vpcmpeqw(vec3, vec3, vec1, 1);
3332     vptest(vec2, vec3);
3333     jcc(Assembler::carryClear, FOUND_CHAR);
3334     addptr(result, 32);
3335     subl(tmp, 2*stride);
3336     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3337     jmp(SCAN_TO_8_CHAR);
3338     bind(SCAN_TO_8_CHAR_INIT);
3339     movdl(vec1, ch);
3340     pshuflw(vec1, vec1, 0x00);
3341     pshufd(vec1, vec1, 0);
3342     pxor(vec2, vec2);
3343   }
3344   bind(SCAN_TO_8_CHAR);
3345   cmpl(cnt1, stride);
3346   jcc(Assembler::less, SCAN_TO_CHAR);
3347   if (UseAVX < 2) {
3348     movdl(vec1, ch);
3349     pshuflw(vec1, vec1, 0x00);
3350     pshufd(vec1, vec1, 0);
3351     pxor(vec2, vec2);
3352   }
3353   movl(tmp, cnt1);
3354   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3355   andl(cnt1,0x00000007);  //tail count (in chars)
3356 
3357   bind(SCAN_TO_8_CHAR_LOOP);
3358   movdqu(vec3, Address(result, 0));
3359   pcmpeqw(vec3, vec1);
3360   ptest(vec2, vec3);
3361   jcc(Assembler::carryClear, FOUND_CHAR);
3362   addptr(result, 16);
3363   subl(tmp, stride);
3364   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3365   bind(SCAN_TO_CHAR);
3366   testl(cnt1, cnt1);
3367   jcc(Assembler::zero, RET_NOT_FOUND);
3368   bind(SCAN_TO_CHAR_LOOP);
3369   load_unsigned_short(tmp, Address(result, 0));
3370   cmpl(ch, tmp);
3371   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3372   addptr(result, 2);
3373   subl(cnt1, 1);
3374   jccb(Assembler::zero, RET_NOT_FOUND);
3375   jmp(SCAN_TO_CHAR_LOOP);
3376 
3377   bind(RET_NOT_FOUND);
3378   movl(result, -1);
3379   jmpb(DONE_LABEL);
3380 
3381   bind(FOUND_CHAR);
3382   if (UseAVX >= 2) {
3383     vpmovmskb(tmp, vec3);
3384   } else {
3385     pmovmskb(tmp, vec3);
3386   }
3387   bsfl(ch, tmp);
3388   addptr(result, ch);
3389 
3390   bind(FOUND_SEQ_CHAR);
3391   subptr(result, str1);
3392   shrl(result, 1);
3393 
3394   bind(DONE_LABEL);
3395 } // string_indexof_char
3396 
3397 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3398                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3399   ShortBranchVerifier sbv(this);
3400   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3401 
3402   int stride = 16;
3403 
3404   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3405         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3406         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3407         FOUND_SEQ_CHAR, DONE_LABEL;
3408 
3409   movptr(result, str1);
3410   if (UseAVX >= 2) {
3411     cmpl(cnt1, stride);
3412     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3413     cmpl(cnt1, stride*2);
3414     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3415     movdl(vec1, ch);
3416     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3417     vpxor(vec2, vec2);
3418     movl(tmp, cnt1);
3419     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3420     andl(cnt1,0x0000001F);  //tail count (in chars)
3421 
3422     bind(SCAN_TO_32_CHAR_LOOP);
3423     vmovdqu(vec3, Address(result, 0));
3424     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3425     vptest(vec2, vec3);
3426     jcc(Assembler::carryClear, FOUND_CHAR);
3427     addptr(result, 32);
3428     subl(tmp, stride*2);
3429     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3430     jmp(SCAN_TO_16_CHAR);
3431 
3432     bind(SCAN_TO_16_CHAR_INIT);
3433     movdl(vec1, ch);
3434     pxor(vec2, vec2);
3435     pshufb(vec1, vec2);
3436   }
3437 
3438   bind(SCAN_TO_16_CHAR);
3439   cmpl(cnt1, stride);
3440   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3441   if (UseAVX < 2) {
3442     movdl(vec1, ch);
3443     pxor(vec2, vec2);
3444     pshufb(vec1, vec2);
3445   }
3446   movl(tmp, cnt1);
3447   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3448   andl(cnt1,0x0000000F);  //tail count (in bytes)
3449 
3450   bind(SCAN_TO_16_CHAR_LOOP);
3451   movdqu(vec3, Address(result, 0));
3452   pcmpeqb(vec3, vec1);
3453   ptest(vec2, vec3);
3454   jcc(Assembler::carryClear, FOUND_CHAR);
3455   addptr(result, 16);
3456   subl(tmp, stride);
3457   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3458 
3459   bind(SCAN_TO_CHAR_INIT);
3460   testl(cnt1, cnt1);
3461   jcc(Assembler::zero, RET_NOT_FOUND);
3462   bind(SCAN_TO_CHAR_LOOP);
3463   load_unsigned_byte(tmp, Address(result, 0));
3464   cmpl(ch, tmp);
3465   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3466   addptr(result, 1);
3467   subl(cnt1, 1);
3468   jccb(Assembler::zero, RET_NOT_FOUND);
3469   jmp(SCAN_TO_CHAR_LOOP);
3470 
3471   bind(RET_NOT_FOUND);
3472   movl(result, -1);
3473   jmpb(DONE_LABEL);
3474 
3475   bind(FOUND_CHAR);
3476   if (UseAVX >= 2) {
3477     vpmovmskb(tmp, vec3);
3478   } else {
3479     pmovmskb(tmp, vec3);
3480   }
3481   bsfl(ch, tmp);
3482   addptr(result, ch);
3483 
3484   bind(FOUND_SEQ_CHAR);
3485   subptr(result, str1);
3486 
3487   bind(DONE_LABEL);
3488 } // stringL_indexof_char
3489 
3490 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3491   switch (eltype) {
3492   case T_BOOLEAN: return sizeof(jboolean);
3493   case T_BYTE:  return sizeof(jbyte);
3494   case T_SHORT: return sizeof(jshort);
3495   case T_CHAR:  return sizeof(jchar);
3496   case T_INT:   return sizeof(jint);
3497   default:
3498     ShouldNotReachHere();
3499     return -1;
3500   }
3501 }
3502 
3503 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3504   switch (eltype) {
3505   // T_BOOLEAN used as surrogate for unsigned byte
3506   case T_BOOLEAN: movzbl(dst, src);   break;
3507   case T_BYTE:    movsbl(dst, src);   break;
3508   case T_SHORT:   movswl(dst, src);   break;
3509   case T_CHAR:    movzwl(dst, src);   break;
3510   case T_INT:     movl(dst, src);     break;
3511   default:
3512     ShouldNotReachHere();
3513   }
3514 }
3515 
3516 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3517   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3518 }
3519 
3520 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3521   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3522 }
3523 
3524 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3525   const int vlen = Assembler::AVX_256bit;
3526   switch (eltype) {
3527   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3528   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3529   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3530   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3531   case T_INT:
3532     // do nothing
3533     break;
3534   default:
3535     ShouldNotReachHere();
3536   }
3537 }
3538 
3539 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3540                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3541                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3542                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3543                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3544                                         BasicType eltype) {
3545   ShortBranchVerifier sbv(this);
3546   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3547   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3548   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3549 
3550   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3551         SHORT_UNROLLED_LOOP_EXIT,
3552         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3553         UNROLLED_VECTOR_LOOP_BEGIN,
3554         END;
3555   switch (eltype) {
3556   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3557   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3558   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3559   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3560   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3561   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3562   }
3563 
3564   // "Renaming" of the registers for readability of the code
3565   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3566                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3567                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3568 
3569   const int elsize = arrays_hashcode_elsize(eltype);
3570 
3571   /*
3572     if (cnt1 >= 2) {
3573       if (cnt1 >= 32) {
3574         UNROLLED VECTOR LOOP
3575       }
3576       UNROLLED SCALAR LOOP
3577     }
3578     SINGLE SCALAR
3579    */
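
       // Scalar reference for one iteration of the 32-element vector loop (a sketch;
       // the coefficient table is assumed to hold 31^32 .. 31^0, highest power first,
       // as provided by StubRoutines::x86::arrays_hashcode_powers_of_31()):
       //   result = result * 31^32 + sum_{j=0}^{31} ary1[index + j] * 31^(31 - j)
       // i.e. h = h * 31 + a[i] applied to 32 elements at once.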
3580 
3581   cmpl(cnt1, 32);
3582   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3583 
3584   // cnt1 >= 32 && generate_vectorized_loop
3585   xorl(index, index);
3586 
3587   // vresult = IntVector.zero(I256);
3588   for (int idx = 0; idx < 4; idx++) {
3589     vpxor(vresult[idx], vresult[idx]);
3590   }
3591   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3592   Register bound = tmp2;
3593   Register next = tmp3;
3594   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3595   movl(next, Address(tmp2, 0));
3596   movdl(vnext, next);
3597   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3598 
3599   // index = 0;
3600   // bound = cnt1 & ~(32 - 1);
3601   movl(bound, cnt1);
3602   andl(bound, ~(32 - 1));
3603   // for (; index < bound; index += 32) {
3604   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3605   // result *= next;
3606   imull(result, next);
3607   // Loop fission to front-load the cost of fetching from memory; OOO execution
3608   // can then hopefully do a better job of prefetching.
3609   for (int idx = 0; idx < 4; idx++) {
3610     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3611   }
3612   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3613   for (int idx = 0; idx < 4; idx++) {
3614     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3615     arrays_hashcode_elvcast(vtmp[idx], eltype);
3616     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3617   }
3618   // index += 32;
3619   addl(index, 32);
3620   // index < bound;
3621   cmpl(index, bound);
3622   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3623   // }
3624 
3625   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3626   subl(cnt1, bound);
3627   // release bound
3628 
3629   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3630   for (int idx = 0; idx < 4; idx++) {
3631     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3632     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3633     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3634   }
3635   // result += vresult.reduceLanes(ADD);
3636   for (int idx = 0; idx < 4; idx++) {
3637     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3638   }
3639 
3640   // } else if (cnt1 < 32) {
3641 
3642   bind(SHORT_UNROLLED_BEGIN);
3643   // int i = 1;
3644   movl(index, 1);
3645   cmpl(index, cnt1);
3646   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3647 
3648   // for (; i < cnt1 ; i += 2) {
3649   bind(SHORT_UNROLLED_LOOP_BEGIN);
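       // Two scalar steps fused: result = result*31*31 + ary1[i-1]*31 + ary1[i]
       // (961 == 31*31, and (x << 5) - x == 31*x below).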
3650   movl(tmp3, 961);
3651   imull(result, tmp3);
3652   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3653   movl(tmp3, tmp2);
3654   shll(tmp3, 5);
3655   subl(tmp3, tmp2);
3656   addl(result, tmp3);
3657   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3658   addl(result, tmp3);
3659   addl(index, 2);
3660   cmpl(index, cnt1);
3661   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3662 
3663   // }
3664   // if (i >= cnt1) {
3665   bind(SHORT_UNROLLED_LOOP_EXIT);
3666   jccb(Assembler::greater, END);
3667   movl(tmp2, result);
3668   shll(result, 5);
3669   subl(result, tmp2);
3670   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3671   addl(result, tmp3);
3672   // }
3673   bind(END);
3674 
3675   BLOCK_COMMENT("} // arrays_hashcode");
3676 
3677 } // arrays_hashcode
3678 
3679 // helper function for string_compare
3680 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3681                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3682                                            Address::ScaleFactor scale2, Register index, int ae) {
3683   if (ae == StrIntrinsicNode::LL) {
3684     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3685     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3686   } else if (ae == StrIntrinsicNode::UU) {
3687     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3688     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3689   } else {
3690     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3691     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3692   }
3693 }
3694 
3695 // Compare strings, used for char[] and byte[].
3696 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3697                                        Register cnt1, Register cnt2, Register result,
3698                                        XMMRegister vec1, int ae, KRegister mask) {
3699   ShortBranchVerifier sbv(this);
3700   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3701   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3702   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3703   int stride2x2 = 0x40;
3704   Address::ScaleFactor scale = Address::no_scale;
3705   Address::ScaleFactor scale1 = Address::no_scale;
3706   Address::ScaleFactor scale2 = Address::no_scale;
3707 
3708   if (ae != StrIntrinsicNode::LL) {
3709     stride2x2 = 0x20;
3710   }
3711 
3712   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3713     shrl(cnt2, 1);
3714   }
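
       // Reference sketch (ignoring encoding details) of what is computed below:
       //   int min = Math.min(cnt1, cnt2);
       //   for (int i = 0; i < min; i++) {
       //     if (str1[i] != str2[i]) return str1[i] - str2[i];
       //   }
       //   return cnt1 - cnt2;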
3715   // Compute the minimum of the string lengths and push the
3716   // difference of the string lengths onto the stack.
3717   // The minimum is selected with a conditional move.
3718   movl(result, cnt1);
3719   subl(cnt1, cnt2);
3720   push(cnt1);
3721   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3722 
3723   // Is the minimum length zero?
3724   testl(cnt2, cnt2);
3725   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3726   if (ae == StrIntrinsicNode::LL) {
3727     // Load first bytes
3728     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3729     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3730   } else if (ae == StrIntrinsicNode::UU) {
3731     // Load first characters
3732     load_unsigned_short(result, Address(str1, 0));
3733     load_unsigned_short(cnt1, Address(str2, 0));
3734   } else {
3735     load_unsigned_byte(result, Address(str1, 0));
3736     load_unsigned_short(cnt1, Address(str2, 0));
3737   }
3738   subl(result, cnt1);
3739   jcc(Assembler::notZero,  POP_LABEL);
3740 
3741   if (ae == StrIntrinsicNode::UU) {
3742     // Divide length by 2 to get number of chars
3743     shrl(cnt2, 1);
3744   }
3745   cmpl(cnt2, 1);
3746   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3747 
3748   // Check if the strings start at the same location and setup scale and stride
3749   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3750     cmpptr(str1, str2);
3751     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3752     if (ae == StrIntrinsicNode::LL) {
3753       scale = Address::times_1;
3754       stride = 16;
3755     } else {
3756       scale = Address::times_2;
3757       stride = 8;
3758     }
3759   } else {
3760     scale1 = Address::times_1;
3761     scale2 = Address::times_2;
3762     // scale not used
3763     stride = 8;
3764   }
3765 
3766   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3767     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3768     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3769     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3770     Label COMPARE_TAIL_LONG;
3771     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3772 
3773     int pcmpmask = 0x19;
3774     if (ae == StrIntrinsicNode::LL) {
3775       pcmpmask &= ~0x01;
3776     }
3777 
3778     // Setup to compare 16-chars (32-bytes) vectors,
3779     // start from first character again because it has aligned address.
3780     if (ae == StrIntrinsicNode::LL) {
3781       stride2 = 32;
3782     } else {
3783       stride2 = 16;
3784     }
3785     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3786       adr_stride = stride << scale;
3787     } else {
3788       adr_stride1 = 8;  //stride << scale1;
3789       adr_stride2 = 16; //stride << scale2;
3790     }
3791 
3792     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3793     // rax and rdx are used by pcmpestri as element counters
3794     movl(result, cnt2);
3795     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3796     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3797 
3798     // Fast path: compare the first two 8-char vectors.
3799     bind(COMPARE_16_CHARS);
3800     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3801       movdqu(vec1, Address(str1, 0));
3802     } else {
3803       pmovzxbw(vec1, Address(str1, 0));
3804     }
3805     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3806     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3807 
3808     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3809       movdqu(vec1, Address(str1, adr_stride));
3810       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3811     } else {
3812       pmovzxbw(vec1, Address(str1, adr_stride1));
3813       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3814     }
3815     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3816     addl(cnt1, stride);
3817 
3818     // Compare the characters at index in cnt1
3819     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3820     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3821     subl(result, cnt2);
3822     jmp(POP_LABEL);
3823 
3824     // Setup the registers to start vector comparison loop
3825     bind(COMPARE_WIDE_VECTORS);
3826     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3827       lea(str1, Address(str1, result, scale));
3828       lea(str2, Address(str2, result, scale));
3829     } else {
3830       lea(str1, Address(str1, result, scale1));
3831       lea(str2, Address(str2, result, scale2));
3832     }
3833     subl(result, stride2);
3834     subl(cnt2, stride2);
3835     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3836     negptr(result);
3837 
3838     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3839     bind(COMPARE_WIDE_VECTORS_LOOP);
3840 
3841 #ifdef _LP64
3842     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3843       cmpl(cnt2, stride2x2);
3844       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3845       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3846       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3847 
3848       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3849       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3850         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3851         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3852       } else {
3853         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3854         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3855       }
3856       kortestql(mask, mask);
3857       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3858       addptr(result, stride2x2);  // update since we already compared at this addr
3859       subl(cnt2, stride2x2);      // and sub the size too
3860       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3861 
3862       vpxor(vec1, vec1);
3863       jmpb(COMPARE_WIDE_TAIL);
3864     }//if (VM_Version::supports_avx512vlbw())
3865 #endif // _LP64
3866 
3867 
3868     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3869     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870       vmovdqu(vec1, Address(str1, result, scale));
3871       vpxor(vec1, Address(str2, result, scale));
3872     } else {
3873       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3874       vpxor(vec1, Address(str2, result, scale2));
3875     }
3876     vptest(vec1, vec1);
3877     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3878     addptr(result, stride2);
3879     subl(cnt2, stride2);
3880     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3881     // clean upper bits of YMM registers
3882     vpxor(vec1, vec1);
3883 
3884     // compare wide vectors tail
3885     bind(COMPARE_WIDE_TAIL);
3886     testptr(result, result);
3887     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3888 
3889     movl(result, stride2);
3890     movl(cnt2, result);
3891     negptr(result);
3892     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3893 
3894     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3895     bind(VECTOR_NOT_EQUAL);
3896     // clean upper bits of YMM registers
3897     vpxor(vec1, vec1);
3898     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3899       lea(str1, Address(str1, result, scale));
3900       lea(str2, Address(str2, result, scale));
3901     } else {
3902       lea(str1, Address(str1, result, scale1));
3903       lea(str2, Address(str2, result, scale2));
3904     }
3905     jmp(COMPARE_16_CHARS);
3906 
3907     // Compare tail chars, length between 1 and 15 chars
3908     bind(COMPARE_TAIL_LONG);
3909     movl(cnt2, result);
3910     cmpl(cnt2, stride);
3911     jcc(Assembler::less, COMPARE_SMALL_STR);
3912 
3913     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3914       movdqu(vec1, Address(str1, 0));
3915     } else {
3916       pmovzxbw(vec1, Address(str1, 0));
3917     }
3918     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3919     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3920     subptr(cnt2, stride);
3921     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3922     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3923       lea(str1, Address(str1, result, scale));
3924       lea(str2, Address(str2, result, scale));
3925     } else {
3926       lea(str1, Address(str1, result, scale1));
3927       lea(str2, Address(str2, result, scale2));
3928     }
3929     negptr(cnt2);
3930     jmpb(WHILE_HEAD_LABEL);
3931 
3932     bind(COMPARE_SMALL_STR);
3933   } else if (UseSSE42Intrinsics) {
3934     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3935     int pcmpmask = 0x19;
3936     // Setup to compare 8-char (16-byte) vectors,
3937     // start from first character again because it has aligned address.
3938     movl(result, cnt2);
3939     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3940     if (ae == StrIntrinsicNode::LL) {
3941       pcmpmask &= ~0x01;
3942     }
3943     jcc(Assembler::zero, COMPARE_TAIL);
3944     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3945       lea(str1, Address(str1, result, scale));
3946       lea(str2, Address(str2, result, scale));
3947     } else {
3948       lea(str1, Address(str1, result, scale1));
3949       lea(str2, Address(str2, result, scale2));
3950     }
3951     negptr(result);
3952 
3953     // pcmpestri
3954     //   inputs:
3955     //     vec1 - substring
3956     //     rax - negative string length (elements count)
3957     //     mem - scanned string
3958     //     rdx - string length (elements count)
3959     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3960     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3961     //   outputs:
3962     //     rcx - first mismatched element index
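         // (For reference: 0x19 selects unsigned words (bits [1:0] = 01), "equal each"
         //  aggregation (bits [3:2] = 10) and negative polarity (bits [5:4] = 01);
         //  clearing bit 0 above switches the element format to unsigned bytes.)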
3963     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3964 
3965     bind(COMPARE_WIDE_VECTORS);
3966     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3967       movdqu(vec1, Address(str1, result, scale));
3968       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3969     } else {
3970       pmovzxbw(vec1, Address(str1, result, scale1));
3971       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3972     }
3973     // After pcmpestri cnt1(rcx) contains mismatched element index
3974 
3975     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3976     addptr(result, stride);
3977     subptr(cnt2, stride);
3978     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3979 
3980     // compare wide vectors tail
3981     testptr(result, result);
3982     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3983 
3984     movl(cnt2, stride);
3985     movl(result, stride);
3986     negptr(result);
3987     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3988       movdqu(vec1, Address(str1, result, scale));
3989       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3990     } else {
3991       pmovzxbw(vec1, Address(str1, result, scale1));
3992       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3993     }
3994     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3995 
3996     // Mismatched characters in the vectors
3997     bind(VECTOR_NOT_EQUAL);
3998     addptr(cnt1, result);
3999     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4000     subl(result, cnt2);
4001     jmpb(POP_LABEL);
4002 
4003     bind(COMPARE_TAIL); // limit is zero
4004     movl(cnt2, result);
4005     // Fallthru to tail compare
4006   }
4007   // Shift str2 and str1 to the end of the arrays, negate min
4008   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4009     lea(str1, Address(str1, cnt2, scale));
4010     lea(str2, Address(str2, cnt2, scale));
4011   } else {
4012     lea(str1, Address(str1, cnt2, scale1));
4013     lea(str2, Address(str2, cnt2, scale2));
4014   }
4015   decrementl(cnt2);  // first character was compared already
4016   negptr(cnt2);
4017 
4018   // Compare the rest of the elements
4019   bind(WHILE_HEAD_LABEL);
4020   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4021   subl(result, cnt1);
4022   jccb(Assembler::notZero, POP_LABEL);
4023   increment(cnt2);
4024   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4025 
4026   // Strings are equal up to min length.  Return the length difference.
4027   bind(LENGTH_DIFF_LABEL);
4028   pop(result);
4029   if (ae == StrIntrinsicNode::UU) {
4030     // Divide diff by 2 to get number of chars
4031     sarl(result, 1);
4032   }
4033   jmpb(DONE_LABEL);
4034 
4035 #ifdef _LP64
4036   if (VM_Version::supports_avx512vlbw()) {
4037 
4038     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4039 
4040     kmovql(cnt1, mask);
4041     notq(cnt1);
4042     bsfq(cnt2, cnt1);
4043     if (ae != StrIntrinsicNode::LL) {
4044       // Divide diff by 2 to get number of chars
4045       sarl(cnt2, 1);
4046     }
4047     addq(result, cnt2);
4048     if (ae == StrIntrinsicNode::LL) {
4049       load_unsigned_byte(cnt1, Address(str2, result));
4050       load_unsigned_byte(result, Address(str1, result));
4051     } else if (ae == StrIntrinsicNode::UU) {
4052       load_unsigned_short(cnt1, Address(str2, result, scale));
4053       load_unsigned_short(result, Address(str1, result, scale));
4054     } else {
4055       load_unsigned_short(cnt1, Address(str2, result, scale2));
4056       load_unsigned_byte(result, Address(str1, result, scale1));
4057     }
4058     subl(result, cnt1);
4059     jmpb(POP_LABEL);
4060   }//if (VM_Version::supports_avx512vlbw())
4061 #endif // _LP64
4062 
4063   // Discard the stored length difference
4064   bind(POP_LABEL);
4065   pop(cnt1);
4066 
4067   // That's it
4068   bind(DONE_LABEL);
4069   if(ae == StrIntrinsicNode::UL) {
4070     negl(result);
4071   }
4072 
4073 }
4074 
4075 // Search for a non-ASCII character (negative byte value) in a byte array;
4076 // return the index of the first such character, otherwise the length
4077 // of the array segment searched.
4078 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4079 //   @IntrinsicCandidate
4080 //   public static int countPositives(byte[] ba, int off, int len) {
4081 //     for (int i = off; i < off + len; i++) {
4082 //       if (ba[i] < 0) {
4083 //         return i - off;
4084 //       }
4085 //     }
4086 //     return len;
4087 //   }
4088 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4089   Register result, Register tmp1,
4090   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4091   // rsi: byte array
4092   // rcx: len
4093   // rax: result
4094   ShortBranchVerifier sbv(this);
4095   assert_different_registers(ary1, len, result, tmp1);
4096   assert_different_registers(vec1, vec2);
4097   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4098 
4099   movl(result, len); // copy
4100   // len == 0
4101   testl(len, len);
4102   jcc(Assembler::zero, DONE);
4103 
4104   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4105     VM_Version::supports_avx512vlbw() &&
4106     VM_Version::supports_bmi2()) {
4107 
4108     Label test_64_loop, test_tail, BREAK_LOOP;
4109     movl(tmp1, len);
4110     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4111 
4112     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4113     andl(len,  0xffffffc0); // vector count (in chars)
4114     jccb(Assembler::zero, test_tail);
4115 
4116     lea(ary1, Address(ary1, len, Address::times_1));
4117     negptr(len);
4118 
4119     bind(test_64_loop);
4120     // Check whether our 64 elements of size byte contain negatives
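         // (vec2 is all zeros, so the signed compare 0 > byte flags exactly the
         //  negative bytes.)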
4121     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4122     kortestql(mask1, mask1);
4123     jcc(Assembler::notZero, BREAK_LOOP);
4124 
4125     addptr(len, 64);
4126     jccb(Assembler::notZero, test_64_loop);
4127 
4128     bind(test_tail);
4129     // bail out when there is nothing to be done
4130     testl(tmp1, -1);
4131     jcc(Assembler::zero, DONE);
4132 
4133 
4134     // check the tail for absence of negatives
4135     // ~(~0 << len) applied up to two times (for 32-bit scenario)
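         // (e.g. a tail count of 3 gives ~(~0 << 3) == 0b111, a k-mask selecting
         //  the three remaining tail bytes.)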
4136 #ifdef _LP64
4137     {
4138       Register tmp3_aliased = len;
4139       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4140       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4141       notq(tmp3_aliased);
4142       kmovql(mask2, tmp3_aliased);
4143     }
4144 #else
4145     Label k_init;
4146     jmp(k_init);
4147 
4148     // We cannot read 64 bits from a general purpose register on 32-bit, thus we
4149     // move the data required to compose 64 ones into the instruction stream.
4150     // We emit a 64-byte series of the values 0..63, which is later used as a
4151     // compare target together with the tail count contained in the tmp1 register.
4152     // The result is a k register holding tmp1 consecutive 1s, counting from the
4153     // least significant bit.
4154     address tmp = pc();
4155     emit_int64(0x0706050403020100);
4156     emit_int64(0x0F0E0D0C0B0A0908);
4157     emit_int64(0x1716151413121110);
4158     emit_int64(0x1F1E1D1C1B1A1918);
4159     emit_int64(0x2726252423222120);
4160     emit_int64(0x2F2E2D2C2B2A2928);
4161     emit_int64(0x3736353433323130);
4162     emit_int64(0x3F3E3D3C3B3A3938);
4163 
4164     bind(k_init);
4165     lea(len, InternalAddress(tmp));
4166     // create mask to test for negative byte inside a vector
4167     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4168     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4169 
4170 #endif
4171     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4172     ktestq(mask1, mask2);
4173     jcc(Assembler::zero, DONE);
4174 
    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
                     // ary1 already points to the right place
4178     jmpb(TAIL_START);
4179 
4180     bind(BREAK_LOOP);
4181     // At least one byte in the last 64 byte block was negative.
4182     // Set up to look at the last 64 bytes as if they were a tail
4183     lea(ary1, Address(ary1, len, Address::times_1));
4184     addptr(result, len);
4185     // Ignore the very last byte: if all others are positive,
4186     // it must be negative, so we can skip right to the 2+1 byte
4187     // end comparison at this point
4188     orl(result, 63);
4189     movl(len, 63);
4190     // Fallthru to tail compare
4191   } else {
4192 
4193     if (UseAVX >= 2 && UseSSE >= 2) {
4194       // With AVX2, use 32-byte vector compare
4195       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4196 
4197       // Compare 32-byte vectors
4198       testl(len, 0xffffffe0);   // vector count (in bytes)
4199       jccb(Assembler::zero, TAIL_START);
4200 
4201       andl(len, 0xffffffe0);
4202       lea(ary1, Address(ary1, len, Address::times_1));
4203       negptr(len);
4204 
      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4206       movdl(vec2, tmp1);
4207       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4208 
4209       bind(COMPARE_WIDE_VECTORS);
4210       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4211       vptest(vec1, vec2);
4212       jccb(Assembler::notZero, BREAK_LOOP);
4213       addptr(len, 32);
4214       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4215 
4216       testl(result, 0x0000001f);   // any bytes remaining?
4217       jcc(Assembler::zero, DONE);
4218 
4219       // Quick test using the already prepared vector mask
4220       movl(len, result);
4221       andl(len, 0x0000001f);
4222       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4223       vptest(vec1, vec2);
4224       jcc(Assembler::zero, DONE);
4225       // There are zeros, jump to the tail to determine exactly where
4226       jmpb(TAIL_START);
4227 
4228       bind(BREAK_LOOP);
4229       // At least one byte in the last 32-byte vector is negative.
4230       // Set up to look at the last 32 bytes as if they were a tail
4231       lea(ary1, Address(ary1, len, Address::times_1));
4232       addptr(result, len);
4233       // Ignore the very last byte: if all others are positive,
4234       // it must be negative, so we can skip right to the 2+1 byte
4235       // end comparison at this point
4236       orl(result, 31);
4237       movl(len, 31);
4238       // Fallthru to tail compare
4239     } else if (UseSSE42Intrinsics) {
4240       // With SSE4.2, use double quad vector compare
4241       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4242 
4243       // Compare 16-byte vectors
4244       testl(len, 0xfffffff0);   // vector count (in bytes)
4245       jcc(Assembler::zero, TAIL_START);
4246 
4247       andl(len, 0xfffffff0);
4248       lea(ary1, Address(ary1, len, Address::times_1));
4249       negptr(len);
4250 
4251       movl(tmp1, 0x80808080);
4252       movdl(vec2, tmp1);
4253       pshufd(vec2, vec2, 0);
4254 
4255       bind(COMPARE_WIDE_VECTORS);
4256       movdqu(vec1, Address(ary1, len, Address::times_1));
4257       ptest(vec1, vec2);
4258       jccb(Assembler::notZero, BREAK_LOOP);
4259       addptr(len, 16);
4260       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4261 
4262       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4263       jcc(Assembler::zero, DONE);
4264 
4265       // Quick test using the already prepared vector mask
4266       movl(len, result);
4267       andl(len, 0x0000000f);   // tail count (in bytes)
4268       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4269       ptest(vec1, vec2);
4270       jcc(Assembler::zero, DONE);
4271       jmpb(TAIL_START);
4272 
4273       bind(BREAK_LOOP);
4274       // At least one byte in the last 16-byte vector is negative.
4275       // Set up and look at the last 16 bytes as if they were a tail
4276       lea(ary1, Address(ary1, len, Address::times_1));
4277       addptr(result, len);
4278       // Ignore the very last byte: if all others are positive,
4279       // it must be negative, so we can skip right to the 2+1 byte
4280       // end comparison at this point
4281       orl(result, 15);
4282       movl(len, 15);
4283       // Fallthru to tail compare
4284     }
4285   }
4286 
4287   bind(TAIL_START);
4288   // Compare 4-byte vectors
4289   andl(len, 0xfffffffc); // vector count (in bytes)
4290   jccb(Assembler::zero, COMPARE_CHAR);
4291 
4292   lea(ary1, Address(ary1, len, Address::times_1));
4293   negptr(len);
4294 
4295   bind(COMPARE_VECTORS);
4296   movl(tmp1, Address(ary1, len, Address::times_1));
4297   andl(tmp1, 0x80808080);
4298   jccb(Assembler::notZero, TAIL_ADJUST);
4299   addptr(len, 4);
4300   jccb(Assembler::notZero, COMPARE_VECTORS);
4301 
4302   // Compare trailing char (final 2-3 bytes), if any
4303   bind(COMPARE_CHAR);
4304 
4305   testl(result, 0x2);   // tail  char
4306   jccb(Assembler::zero, COMPARE_BYTE);
4307   load_unsigned_short(tmp1, Address(ary1, 0));
4308   andl(tmp1, 0x00008080);
4309   jccb(Assembler::notZero, CHAR_ADJUST);
4310   lea(ary1, Address(ary1, 2));
4311 
4312   bind(COMPARE_BYTE);
4313   testl(result, 0x1);   // tail  byte
4314   jccb(Assembler::zero, DONE);
4315   load_unsigned_byte(tmp1, Address(ary1, 0));
4316   testl(tmp1, 0x00000080);
4317   jccb(Assembler::zero, DONE);
4318   subptr(result, 1);
4319   jmpb(DONE);
4320 
4321   bind(TAIL_ADJUST);
  // There is at least one negative byte in the last 4-byte block.
  // Adjust result and check the next three bytes
4324   addptr(result, len);
4325   orl(result, 3);
4326   lea(ary1, Address(ary1, len, Address::times_1));
4327   jmpb(COMPARE_CHAR);
4328 
4329   bind(CHAR_ADJUST);
4330   // We are looking at a char + optional byte tail, and found that one
4331   // of the bytes in the char is negative. Adjust the result, check the
4332   // first byte and readjust if needed.
4333   andl(result, 0xfffffffc);
4334   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4335   jccb(Assembler::notZero, DONE);
4336   addptr(result, 1);
4337 
4338   // That's it
4339   bind(DONE);
4340   if (UseAVX >= 2 && UseSSE >= 2) {
4341     // clean upper bits of YMM registers
4342     vpxor(vec1, vec1);
4343     vpxor(vec2, vec2);
4344   }
4345 }
4346 
4347 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4348 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4349                                       Register limit, Register result, Register chr,
4350                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4351                                       KRegister mask, bool expand_ary2) {
4352   // for expand_ary2, limit is the (smaller) size of the second array.
4353   ShortBranchVerifier sbv(this);
4354   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4355 
4356   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4357          "Expansion only implemented for AVX2");
4358 
4359   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4360   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4361 
4362   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4363   int scaleIncr = expand_ary2 ? 8 : 16;
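  // When expanding, ary1 holds 2-byte elements while ary2 holds bytes that are
  // zero-extended on load, so ary1 is addressed with a times_2 scale and the
  // loop counter (counted in ary2 bytes) advances in half-sized steps.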
4364 
4365   if (is_array_equ) {
4366     // Check the input args
4367     cmpoop(ary1, ary2);
4368     jcc(Assembler::equal, TRUE_LABEL);
4369 
4370     // Need additional checks for arrays_equals.
4371     testptr(ary1, ary1);
4372     jcc(Assembler::zero, FALSE_LABEL);
4373     testptr(ary2, ary2);
4374     jcc(Assembler::zero, FALSE_LABEL);
4375 
4376     // Check the lengths
4377     movl(limit, Address(ary1, length_offset));
4378     cmpl(limit, Address(ary2, length_offset));
4379     jcc(Assembler::notEqual, FALSE_LABEL);
4380   }
4381 
4382   // count == 0
4383   testl(limit, limit);
4384   jcc(Assembler::zero, TRUE_LABEL);
4385 
4386   if (is_array_equ) {
4387     // Load array address
4388     lea(ary1, Address(ary1, base_offset));
4389     lea(ary2, Address(ary2, base_offset));
4390   }
4391 
4392   if (is_array_equ && is_char) {
4393     // arrays_equals when used for char[].
    shll(limit, 1);      // convert char count to byte count (still non-zero)
4395   }
4396   movl(result, limit); // copy
4397 
4398   if (UseAVX >= 2) {
4399     // With AVX2, use 32-byte vector compare
4400     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4401 
4402     // Compare 32-byte vectors
4403     if (expand_ary2) {
4404       andl(result, 0x0000000f);  //   tail count (in bytes)
4405       andl(limit, 0xfffffff0);   // vector count (in bytes)
4406       jcc(Assembler::zero, COMPARE_TAIL);
4407     } else {
4408       andl(result, 0x0000001f);  //   tail count (in bytes)
4409       andl(limit, 0xffffffe0);   // vector count (in bytes)
4410       jcc(Assembler::zero, COMPARE_TAIL_16);
4411     }
4412 
4413     lea(ary1, Address(ary1, limit, scaleFactor));
4414     lea(ary2, Address(ary2, limit, Address::times_1));
4415     negptr(limit);
4416 
4417 #ifdef _LP64
4418     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4419       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4420 
4421       cmpl(limit, -64);
4422       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4423 
4424       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4425 
4426       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4427       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4428       kortestql(mask, mask);
4429       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4430       addptr(limit, 64);  // update since we already compared at this addr
4431       cmpl(limit, -64);
4432       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4433 
4434       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4436       //  cmpl(limit, 0);
4437       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4438       // But since we stopped at the points ary{1,2}+limit which are
4439       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4440       // (|limit| <= 32 and result < 32),
4441       // we may just compare the last 64 bytes.
4442       //
      addptr(result, -64);   // it is safe because we just came from this area
4444       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4445       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4446       kortestql(mask, mask);
4447       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4448 
4449       jmp(TRUE_LABEL);
4450 
4451       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4452 
4453     }//if (VM_Version::supports_avx512vlbw())
4454 #endif //_LP64
4455     bind(COMPARE_WIDE_VECTORS);
4456     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4457     if (expand_ary2) {
4458       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4459     } else {
4460       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4461     }
4462     vpxor(vec1, vec2);
4463 
4464     vptest(vec1, vec1);
4465     jcc(Assembler::notZero, FALSE_LABEL);
4466     addptr(limit, scaleIncr * 2);
4467     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4468 
4469     testl(result, result);
4470     jcc(Assembler::zero, TRUE_LABEL);
4471 
4472     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4473     if (expand_ary2) {
4474       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4475     } else {
4476       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4477     }
4478     vpxor(vec1, vec2);
4479 
4480     vptest(vec1, vec1);
4481     jcc(Assembler::notZero, FALSE_LABEL);
4482     jmp(TRUE_LABEL);
4483 
4484     bind(COMPARE_TAIL_16); // limit is zero
4485     movl(limit, result);
4486 
4487     // Compare 16-byte chunks
4488     andl(result, 0x0000000f);  //   tail count (in bytes)
4489     andl(limit, 0xfffffff0);   // vector count (in bytes)
4490     jcc(Assembler::zero, COMPARE_TAIL);
4491 
4492     lea(ary1, Address(ary1, limit, scaleFactor));
4493     lea(ary2, Address(ary2, limit, Address::times_1));
4494     negptr(limit);
4495 
4496     bind(COMPARE_WIDE_VECTORS_16);
4497     movdqu(vec1, Address(ary1, limit, scaleFactor));
4498     if (expand_ary2) {
4499       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4500     } else {
4501       movdqu(vec2, Address(ary2, limit, Address::times_1));
4502     }
4503     pxor(vec1, vec2);
4504 
4505     ptest(vec1, vec1);
4506     jcc(Assembler::notZero, FALSE_LABEL);
4507     addptr(limit, scaleIncr);
4508     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4509 
4510     bind(COMPARE_TAIL); // limit is zero
4511     movl(limit, result);
4512     // Fallthru to tail compare
4513   } else if (UseSSE42Intrinsics) {
4514     // With SSE4.2, use double quad vector compare
4515     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4516 
4517     // Compare 16-byte vectors
4518     andl(result, 0x0000000f);  //   tail count (in bytes)
4519     andl(limit, 0xfffffff0);   // vector count (in bytes)
4520     jcc(Assembler::zero, COMPARE_TAIL);
4521 
4522     lea(ary1, Address(ary1, limit, Address::times_1));
4523     lea(ary2, Address(ary2, limit, Address::times_1));
4524     negptr(limit);
4525 
4526     bind(COMPARE_WIDE_VECTORS);
4527     movdqu(vec1, Address(ary1, limit, Address::times_1));
4528     movdqu(vec2, Address(ary2, limit, Address::times_1));
4529     pxor(vec1, vec2);
4530 
4531     ptest(vec1, vec1);
4532     jcc(Assembler::notZero, FALSE_LABEL);
4533     addptr(limit, 16);
4534     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4535 
4536     testl(result, result);
4537     jcc(Assembler::zero, TRUE_LABEL);
4538 
4539     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4540     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4541     pxor(vec1, vec2);
4542 
4543     ptest(vec1, vec1);
4544     jccb(Assembler::notZero, FALSE_LABEL);
4545     jmpb(TRUE_LABEL);
4546 
4547     bind(COMPARE_TAIL); // limit is zero
4548     movl(limit, result);
4549     // Fallthru to tail compare
4550   }
4551 
4552   // Compare 4-byte vectors
4553   if (expand_ary2) {
4554     testl(result, result);
4555     jccb(Assembler::zero, TRUE_LABEL);
4556   } else {
4557     andl(limit, 0xfffffffc); // vector count (in bytes)
4558     jccb(Assembler::zero, COMPARE_CHAR);
4559   }
4560 
4561   lea(ary1, Address(ary1, limit, scaleFactor));
4562   lea(ary2, Address(ary2, limit, Address::times_1));
4563   negptr(limit);
4564 
4565   bind(COMPARE_VECTORS);
4566   if (expand_ary2) {
    // There is no "vector" compare of bytes against shorts, so compare element-wise
4568     movzbl(chr, Address(ary2, limit, Address::times_1));
4569     cmpw(Address(ary1, limit, Address::times_2), chr);
4570     jccb(Assembler::notEqual, FALSE_LABEL);
4571     addptr(limit, 1);
4572     jcc(Assembler::notZero, COMPARE_VECTORS);
4573     jmp(TRUE_LABEL);
4574   } else {
4575     movl(chr, Address(ary1, limit, Address::times_1));
4576     cmpl(chr, Address(ary2, limit, Address::times_1));
4577     jccb(Assembler::notEqual, FALSE_LABEL);
4578     addptr(limit, 4);
4579     jcc(Assembler::notZero, COMPARE_VECTORS);
4580   }
4581 
4582   // Compare trailing char (final 2 bytes), if any
4583   bind(COMPARE_CHAR);
4584   testl(result, 0x2);   // tail  char
4585   jccb(Assembler::zero, COMPARE_BYTE);
4586   load_unsigned_short(chr, Address(ary1, 0));
4587   load_unsigned_short(limit, Address(ary2, 0));
4588   cmpl(chr, limit);
4589   jccb(Assembler::notEqual, FALSE_LABEL);
4590 
4591   if (is_array_equ && is_char) {
4592     bind(COMPARE_BYTE);
4593   } else {
4594     lea(ary1, Address(ary1, 2));
4595     lea(ary2, Address(ary2, 2));
4596 
4597     bind(COMPARE_BYTE);
4598     testl(result, 0x1);   // tail  byte
4599     jccb(Assembler::zero, TRUE_LABEL);
4600     load_unsigned_byte(chr, Address(ary1, 0));
4601     load_unsigned_byte(limit, Address(ary2, 0));
4602     cmpl(chr, limit);
4603     jccb(Assembler::notEqual, FALSE_LABEL);
4604   }
4605   bind(TRUE_LABEL);
4606   movl(result, 1);   // return true
4607   jmpb(DONE);
4608 
4609   bind(FALSE_LABEL);
4610   xorl(result, result); // return false
4611 
4612   // That's it
4613   bind(DONE);
4614   if (UseAVX >= 2) {
4615     // clean upper bits of YMM registers
4616     vpxor(vec1, vec1);
4617     vpxor(vec2, vec2);
4618   }
4619 }
4620 
4621 #ifdef _LP64
4622 
4623 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4624 #define __ masm.
4625   Register dst = stub.data<0>();
4626   XMMRegister src = stub.data<1>();
4627   address target = stub.data<2>();
4628   __ bind(stub.entry());
4629   __ subptr(rsp, 8);
4630   __ movdbl(Address(rsp), src);
4631   __ call(RuntimeAddress(target));
4632   __ pop(dst);
4633   __ jmp(stub.continuation());
4634 #undef __
4635 }
4636 
4637 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4638   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4639   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4640 
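  // The truncating conversions below return the x86 "integer indefinite" value
  // (0x80000000 for 32-bit results, 0x8000000000000000 for 64-bit results) when
  // the input is NaN or out of range; comparing against that value routes such
  // inputs to the fix-up stub, which produces the Java-mandated result.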
4641   address slowpath_target;
4642   if (dst_bt == T_INT) {
4643     if (src_bt == T_FLOAT) {
4644       cvttss2sil(dst, src);
4645       cmpl(dst, 0x80000000);
4646       slowpath_target = StubRoutines::x86::f2i_fixup();
4647     } else {
4648       cvttsd2sil(dst, src);
4649       cmpl(dst, 0x80000000);
4650       slowpath_target = StubRoutines::x86::d2i_fixup();
4651     }
4652   } else {
4653     if (src_bt == T_FLOAT) {
4654       cvttss2siq(dst, src);
4655       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4656       slowpath_target = StubRoutines::x86::f2l_fixup();
4657     } else {
4658       cvttsd2siq(dst, src);
4659       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4660       slowpath_target = StubRoutines::x86::d2l_fixup();
4661     }
4662   }
4663 
4664   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4665   jcc(Assembler::equal, stub->entry());
4666   bind(stub->continuation());
4667 }
4668 
4669 #endif // _LP64
4670 
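// The evmasked_op helpers below dispatch C2 ideal vector opcodes to their AVX-512
// masked (predicated) forms. The merge flag selects merge-masking, where lanes with
// a zero mask bit keep their previous destination value, rather than zero-masking.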
4671 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4672                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4673   switch(ideal_opc) {
4674     case Op_LShiftVS:
4675       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4676     case Op_LShiftVI:
4677       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4678     case Op_LShiftVL:
4679       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4680     case Op_RShiftVS:
4681       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4682     case Op_RShiftVI:
4683       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4684     case Op_RShiftVL:
4685       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4686     case Op_URShiftVS:
4687       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4688     case Op_URShiftVI:
4689       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4690     case Op_URShiftVL:
4691       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4692     case Op_RotateRightV:
4693       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4694     case Op_RotateLeftV:
4695       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4696     default:
4697       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4698       break;
4699   }
4700 }
4701 
4702 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4703                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4704   if (is_unsigned) {
4705     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4706   } else {
4707     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4708   }
4709 }
4710 
4711 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4712                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4713   switch (elem_bt) {
4714     case T_BYTE:
4715       if (ideal_opc == Op_SaturatingAddV) {
4716         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4717       } else {
4718         assert(ideal_opc == Op_SaturatingSubV, "");
4719         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4720       }
4721       break;
4722     case T_SHORT:
4723       if (ideal_opc == Op_SaturatingAddV) {
4724         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4725       } else {
4726         assert(ideal_opc == Op_SaturatingSubV, "");
4727         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4728       }
4729       break;
4730     default:
4731       fatal("Unsupported type %s", type2name(elem_bt));
4732       break;
4733   }
4734 }
4735 
4736 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4737                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4738   switch (elem_bt) {
4739     case T_BYTE:
4740       if (ideal_opc == Op_SaturatingAddV) {
4741         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4742       } else {
4743         assert(ideal_opc == Op_SaturatingSubV, "");
4744         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4745       }
4746       break;
4747     case T_SHORT:
4748       if (ideal_opc == Op_SaturatingAddV) {
4749         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4750       } else {
4751         assert(ideal_opc == Op_SaturatingSubV, "");
4752         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4753       }
4754       break;
4755     default:
4756       fatal("Unsupported type %s", type2name(elem_bt));
4757       break;
4758   }
4759 }
4760 
4761 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4762                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4763   if (is_unsigned) {
4764     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4765   } else {
4766     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4767   }
4768 }
4769 
4770 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4771                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4772   switch (elem_bt) {
4773     case T_BYTE:
4774       if (ideal_opc == Op_SaturatingAddV) {
4775         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4776       } else {
4777         assert(ideal_opc == Op_SaturatingSubV, "");
4778         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4779       }
4780       break;
4781     case T_SHORT:
4782       if (ideal_opc == Op_SaturatingAddV) {
4783         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4784       } else {
4785         assert(ideal_opc == Op_SaturatingSubV, "");
4786         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4787       }
4788       break;
4789     default:
4790       fatal("Unsupported type %s", type2name(elem_bt));
4791       break;
4792   }
4793 }
4794 
4795 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4796                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4797   switch (elem_bt) {
4798     case T_BYTE:
4799       if (ideal_opc == Op_SaturatingAddV) {
4800         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4801       } else {
4802         assert(ideal_opc == Op_SaturatingSubV, "");
4803         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4804       }
4805       break;
4806     case T_SHORT:
4807       if (ideal_opc == Op_SaturatingAddV) {
4808         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4809       } else {
4810         assert(ideal_opc == Op_SaturatingSubV, "");
4811         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4812       }
4813       break;
4814     default:
4815       fatal("Unsupported type %s", type2name(elem_bt));
4816       break;
4817   }
4818 }
4819 
4820 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4821                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4822                                     bool is_varshift) {
4823   switch (ideal_opc) {
4824     case Op_AddVB:
4825       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4826     case Op_AddVS:
4827       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4828     case Op_AddVI:
4829       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4830     case Op_AddVL:
4831       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4832     case Op_AddVF:
4833       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4834     case Op_AddVD:
4835       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4836     case Op_SubVB:
4837       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4838     case Op_SubVS:
4839       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_SubVI:
4841       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_SubVL:
4843       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_SubVF:
4845       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_SubVD:
4847       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_MulVS:
4849       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_MulVI:
4851       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_MulVL:
4853       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_MulVF:
4855       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_MulVD:
4857       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_DivVF:
4859       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_DivVD:
4861       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_SqrtVF:
4863       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_SqrtVD:
4865       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_AbsVB:
4867       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4868     case Op_AbsVS:
4869       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4870     case Op_AbsVI:
4871       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4872     case Op_AbsVL:
4873       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4874     case Op_FmaVF:
4875       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_FmaVD:
4877       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_VectorRearrange:
4879       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4880     case Op_LShiftVS:
4881       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4882     case Op_LShiftVI:
4883       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4884     case Op_LShiftVL:
4885       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4886     case Op_RShiftVS:
4887       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4888     case Op_RShiftVI:
4889       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4890     case Op_RShiftVL:
4891       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4892     case Op_URShiftVS:
4893       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4894     case Op_URShiftVI:
4895       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4896     case Op_URShiftVL:
4897       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4898     case Op_RotateLeftV:
4899       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_RotateRightV:
4901       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4902     case Op_MaxV:
4903       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4904     case Op_MinV:
4905       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4906     case Op_UMinV:
4907       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4908     case Op_UMaxV:
4909       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_XorV:
4911       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_OrV:
4913       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_AndV:
4915       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4916     default:
4917       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4918       break;
4919   }
4920 }
4921 
4922 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4923                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4924   switch (ideal_opc) {
4925     case Op_AddVB:
4926       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4927     case Op_AddVS:
4928       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4929     case Op_AddVI:
4930       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4931     case Op_AddVL:
4932       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4933     case Op_AddVF:
4934       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4935     case Op_AddVD:
4936       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4937     case Op_SubVB:
4938       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4939     case Op_SubVS:
4940       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4941     case Op_SubVI:
4942       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4943     case Op_SubVL:
4944       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4945     case Op_SubVF:
4946       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4947     case Op_SubVD:
4948       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4949     case Op_MulVS:
4950       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4951     case Op_MulVI:
4952       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4953     case Op_MulVL:
4954       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4955     case Op_MulVF:
4956       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4957     case Op_MulVD:
4958       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4959     case Op_DivVF:
4960       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4961     case Op_DivVD:
4962       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4963     case Op_FmaVF:
4964       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4965     case Op_FmaVD:
4966       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4967     case Op_MaxV:
4968       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4969     case Op_MinV:
4970       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4971     case Op_UMaxV:
4972       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4973     case Op_UMinV:
4974       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4975     case Op_XorV:
4976       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4977     case Op_OrV:
4978       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4979     case Op_AndV:
4980       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4981     default:
4982       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4983       break;
4984   }
4985 }
4986 
4987 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4988                                   KRegister src1, KRegister src2) {
4989   BasicType etype = T_ILLEGAL;
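  // The mask length selects the width of the k-register logical instruction
  // (byte forms for masks of up to 8 lanes, then word/dword/qword forms),
  // encoded here via the element type passed to kand/kor/kxor.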
4990   switch(mask_len) {
4991     case 2:
4992     case 4:
4993     case 8:  etype = T_BYTE; break;
4994     case 16: etype = T_SHORT; break;
4995     case 32: etype = T_INT; break;
4996     case 64: etype = T_LONG; break;
4997     default: fatal("Unsupported type"); break;
4998   }
4999   assert(etype != T_ILLEGAL, "");
5000   switch(ideal_opc) {
5001     case Op_AndVMask:
5002       kand(etype, dst, src1, src2); break;
5003     case Op_OrVMask:
5004       kor(etype, dst, src1, src2); break;
5005     case Op_XorVMask:
5006       kxor(etype, dst, src1, src2); break;
5007     default:
5008       fatal("Unsupported masked operation"); break;
5009   }
5010 }
5011 
5012 /*
 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5014  * If src is NaN, the result is 0.
5015  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5016  * the result is equal to the value of Integer.MIN_VALUE.
5017  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5018  * the result is equal to the value of Integer.MAX_VALUE.
5019  */
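// For reference, this matches Java's scalar narrowing semantics, e.g.
// (int) Float.NaN == 0, (int) Float.NEGATIVE_INFINITY == Integer.MIN_VALUE and
// (int) Float.POSITIVE_INFINITY == Integer.MAX_VALUE.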
5020 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5021                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5022                                                                    Register rscratch, AddressLiteral float_sign_flip,
5023                                                                    int vec_enc) {
5024   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5025   Label done;
5026   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5027   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5028   vptest(xtmp2, xtmp2, vec_enc);
5029   jccb(Assembler::equal, done);
5030 
5031   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5032   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5033 
5034   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5035   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5036   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5037 
  // Recompute the mask for the remaining special values.
5039   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5040   // Extract SRC values corresponding to TRUE mask lanes.
5041   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of mask lanes corresponding to positive
  // special values is set.
5044   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5045 
5046   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5047   bind(done);
5048 }
5049 
5050 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5051                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5052                                                                     Register rscratch, AddressLiteral float_sign_flip,
5053                                                                     int vec_enc) {
5054   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5055   Label done;
5056   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5057   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5058   kortestwl(ktmp1, ktmp1);
5059   jccb(Assembler::equal, done);
5060 
5061   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5062   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5063   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5064 
5065   kxorwl(ktmp1, ktmp1, ktmp2);
5066   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5067   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5068   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5069   bind(done);
5070 }
5071 
5072 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5073                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5074                                                                      Register rscratch, AddressLiteral double_sign_flip,
5075                                                                      int vec_enc) {
5076   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5077 
5078   Label done;
5079   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5080   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5081   kortestwl(ktmp1, ktmp1);
5082   jccb(Assembler::equal, done);
5083 
5084   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5085   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5086   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5087 
5088   kxorwl(ktmp1, ktmp1, ktmp2);
5089   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5090   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5091   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5092   bind(done);
5093 }
5094 
5095 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5096                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5097                                                                      Register rscratch, AddressLiteral float_sign_flip,
5098                                                                      int vec_enc) {
5099   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5100   Label done;
5101   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5102   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5103   kortestwl(ktmp1, ktmp1);
5104   jccb(Assembler::equal, done);
5105 
5106   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5107   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5108   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5109 
5110   kxorwl(ktmp1, ktmp1, ktmp2);
5111   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5112   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5113   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5114   bind(done);
5115 }
5116 
5117 /*
 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5119  * If src is NaN, the result is 0.
5120  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5121  * the result is equal to the value of Long.MIN_VALUE.
5122  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5123  * the result is equal to the value of Long.MAX_VALUE.
5124  */
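// For reference, this matches Java's scalar narrowing semantics, e.g.
// (long) Double.NaN == 0L, (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE and
// (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE.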
5125 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5126                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5127                                                                       Register rscratch, AddressLiteral double_sign_flip,
5128                                                                       int vec_enc) {
5129   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5130 
5131   Label done;
5132   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5133   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5134   kortestwl(ktmp1, ktmp1);
5135   jccb(Assembler::equal, done);
5136 
5137   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5138   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5139   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5140 
5141   kxorwl(ktmp1, ktmp1, ktmp2);
5142   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5143   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5144   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5145   bind(done);
5146 }
5147 
5148 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5149                                                              XMMRegister xtmp, int index, int vec_enc) {
5150    assert(vec_enc < Assembler::AVX_512bit, "");
5151    if (vec_enc == Assembler::AVX_256bit) {
5152      vextractf128_high(xtmp, src);
5153      vshufps(dst, src, xtmp, index, vec_enc);
5154    } else {
5155      vshufps(dst, src, zero, index, vec_enc);
5156    }
5157 }
5158 
5159 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5160                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5161                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5162   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5163 
5164   Label done;
5165   // Compare the destination lanes with float_sign_flip
5166   // value to get mask for all special values.
5167   movdqu(xtmp1, float_sign_flip, rscratch);
5168   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5169   ptest(xtmp2, xtmp2);
5170   jccb(Assembler::equal, done);
5171 
5172   // Flip float_sign_flip to get max integer value.
5173   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5174   pxor(xtmp1, xtmp4);
5175 
  // Set destination lanes corresponding to unordered source lanes to zero.
5177   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5178   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5179 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5181   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5182   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5183 
  // Recompute the mask for the remaining special values.
5185   pxor(xtmp2, xtmp3);
5186   // Extract mask corresponding to non-negative source lanes.
5187   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5188 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5190   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5191   pand(xtmp3, xtmp2);
5192 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
5195   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5196   bind(done);
5197 }
5198 
5199 
5200 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5201                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5202   switch(to_elem_bt) {
5203     case T_SHORT:
5204       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5205       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5206       vpackusdw(dst, dst, zero, vec_enc);
5207       if (vec_enc == Assembler::AVX_256bit) {
5208         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5209       }
5210       break;
5211     case  T_BYTE:
5212       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5213       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5214       vpackusdw(dst, dst, zero, vec_enc);
5215       if (vec_enc == Assembler::AVX_256bit) {
5216         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5217       }
5218       vpackuswb(dst, dst, zero, vec_enc);
5219       break;
5220     default: assert(false, "%s", type2name(to_elem_bt));
5221   }
5222 }
5223 
/*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if no lane of the result vector contains the value 0x80000000,
 *    which would signify that the source value may be one of the special floating
 *    point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero where the source is NaN.
 * d) Replace 0x80000000 with MaxInt where the source lane holds a positive value.
 */
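// Note: 0x80000000 is the x86 "integer indefinite" value that vcvttps2dq/vcvttpd2dq
// produce for NaN and out-of-range inputs, so its presence in a result lane is what
// flags that lane for the fix-up in steps c) and d) above.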
5233 
5234 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5235                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5236                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5237   int to_elem_sz = type2aelembytes(to_elem_bt);
5238   assert(to_elem_sz <= 4, "");
5239   vcvttps2dq(dst, src, vec_enc);
5240   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5241   if (to_elem_sz < 4) {
5242     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5243     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5244   }
5245 }
5246 
5247 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5248                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5249                                             Register rscratch, int vec_enc) {
5250   int to_elem_sz = type2aelembytes(to_elem_bt);
5251   assert(to_elem_sz <= 4, "");
5252   vcvttps2dq(dst, src, vec_enc);
5253   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5254   switch(to_elem_bt) {
5255     case T_INT:
5256       break;
5257     case T_SHORT:
5258       evpmovdw(dst, dst, vec_enc);
5259       break;
5260     case T_BYTE:
5261       evpmovdb(dst, dst, vec_enc);
5262       break;
5263     default: assert(false, "%s", type2name(to_elem_bt));
5264   }
5265 }
5266 
5267 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5268                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5269                                             Register rscratch, int vec_enc) {
5270   evcvttps2qq(dst, src, vec_enc);
5271   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5272 }
5273 
5274 // Handling for downcasting from double to integer or sub-word types on AVX2.
5275 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5276                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5277                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5278   int to_elem_sz = type2aelembytes(to_elem_bt);
5279   assert(to_elem_sz < 8, "");
5280   vcvttpd2dq(dst, src, vec_enc);
5281   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5282                                               float_sign_flip, vec_enc);
5283   if (to_elem_sz < 4) {
5284     // xtmp4 holds all zero lanes.
5285     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5286   }
5287 }
5288 
5289 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5290                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5291                                             KRegister ktmp2, AddressLiteral sign_flip,
5292                                             Register rscratch, int vec_enc) {
5293   if (VM_Version::supports_avx512dq()) {
5294     evcvttpd2qq(dst, src, vec_enc);
5295     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5296     switch(to_elem_bt) {
5297       case T_LONG:
5298         break;
5299       case T_INT:
5300         evpmovsqd(dst, dst, vec_enc);
5301         break;
5302       case T_SHORT:
5303         evpmovsqd(dst, dst, vec_enc);
5304         evpmovdw(dst, dst, vec_enc);
5305         break;
5306       case T_BYTE:
5307         evpmovsqd(dst, dst, vec_enc);
5308         evpmovdb(dst, dst, vec_enc);
5309         break;
5310       default: assert(false, "%s", type2name(to_elem_bt));
5311     }
5312   } else {
5313     assert(type2aelembytes(to_elem_bt) <= 4, "");
5314     vcvttpd2dq(dst, src, vec_enc);
5315     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5316     switch(to_elem_bt) {
5317       case T_INT:
5318         break;
5319       case T_SHORT:
5320         evpmovdw(dst, dst, vec_enc);
5321         break;
5322       case T_BYTE:
5323         evpmovdb(dst, dst, vec_enc);
5324         break;
5325       default: assert(false, "%s", type2name(to_elem_bt));
5326     }
5327   }
5328 }
5329 
5330 #ifdef _LP64
5331 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5332                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5333                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
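  // For reference, this follows Math.round's floor(x + 0.5) definition (ties round
  // towards positive infinity), e.g. Math.round(2.5) == 3L and Math.round(-2.5) == -2L.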
5336   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5337 
5338   mov64(tmp, julong_cast(0.5L));
5339   evpbroadcastq(xtmp1, tmp, vec_enc);
5340   vaddpd(xtmp1, src , xtmp1, vec_enc);
5341   evcvtpd2qq(dst, xtmp1, vec_enc);
5342   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5344 
5345   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5346 }
5347 
5348 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5349                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5350                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5353   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5354 
5355   movl(tmp, jint_cast(0.5));
5356   movq(xtmp1, tmp);
5357   vbroadcastss(xtmp1, xtmp1, vec_enc);
5358   vaddps(xtmp1, src , xtmp1, vec_enc);
5359   vcvtps2dq(dst, xtmp1, vec_enc);
5360   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5361                                               float_sign_flip, vec_enc);
5362 
5363   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5364 }
5365 
5366 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5367                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5368                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5371   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5372 
5373   movl(tmp, jint_cast(0.5));
5374   movq(xtmp1, tmp);
5375   vbroadcastss(xtmp1, xtmp1, vec_enc);
5376   vaddps(xtmp1, src , xtmp1, vec_enc);
5377   vcvtps2dq(dst, xtmp1, vec_enc);
5378   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5379 
5380   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5381 }
5382 #endif // _LP64
5383 
5384 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5385                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5386   switch (from_elem_bt) {
5387     case T_BYTE:
5388       switch (to_elem_bt) {
5389         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5390         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5391         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5392         default: ShouldNotReachHere();
5393       }
5394       break;
5395     case T_SHORT:
5396       switch (to_elem_bt) {
5397         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5398         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5399         default: ShouldNotReachHere();
5400       }
5401       break;
5402     case T_INT:
5403       assert(to_elem_bt == T_LONG, "");
5404       vpmovzxdq(dst, src, vlen_enc);
5405       break;
5406     default:
5407       ShouldNotReachHere();
5408   }
5409 }
5410 
5411 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5412                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5413   switch (from_elem_bt) {
5414     case T_BYTE:
5415       switch (to_elem_bt) {
5416         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5417         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5418         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5419         default: ShouldNotReachHere();
5420       }
5421       break;
5422     case T_SHORT:
5423       switch (to_elem_bt) {
5424         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5425         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5426         default: ShouldNotReachHere();
5427       }
5428       break;
5429     case T_INT:
5430       assert(to_elem_bt == T_LONG, "");
5431       vpmovsxdq(dst, src, vlen_enc);
5432       break;
5433     default:
5434       ShouldNotReachHere();
5435   }
5436 }
5437 
5438 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5439                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5440   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5441   assert(vlen_enc != AVX_512bit, "");
5442 
5443   int dst_bt_size = type2aelembytes(dst_bt);
5444   int src_bt_size = type2aelembytes(src_bt);
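  // Mask lanes are runs of identical 0x00 or 0xFF bytes, so byte-granularity
  // sign extension scales each lane up correctly, and signed saturating packs
  // (which map 0 -> 0 and -1 -> -1) scale lanes down. The vpermq shuffles gather
  // the valid low quadwords, because 256-bit packs operate within 128-bit lanes.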
5445   if (dst_bt_size > src_bt_size) {
5446     switch (dst_bt_size / src_bt_size) {
5447       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5448       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5449       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5450       default: ShouldNotReachHere();
5451     }
5452   } else {
5453     assert(dst_bt_size < src_bt_size, "");
5454     switch (src_bt_size / dst_bt_size) {
5455       case 2: {
5456         if (vlen_enc == AVX_128bit) {
5457           vpacksswb(dst, src, src, vlen_enc);
5458         } else {
5459           vpacksswb(dst, src, src, vlen_enc);
5460           vpermq(dst, dst, 0x08, vlen_enc);
5461         }
5462         break;
5463       }
5464       case 4: {
5465         if (vlen_enc == AVX_128bit) {
5466           vpackssdw(dst, src, src, vlen_enc);
5467           vpacksswb(dst, dst, dst, vlen_enc);
5468         } else {
5469           vpackssdw(dst, src, src, vlen_enc);
5470           vpermq(dst, dst, 0x08, vlen_enc);
5471           vpacksswb(dst, dst, dst, AVX_128bit);
5472         }
5473         break;
5474       }
5475       case 8: {
5476         if (vlen_enc == AVX_128bit) {
5477           vpshufd(dst, src, 0x08, vlen_enc);
5478           vpackssdw(dst, dst, dst, vlen_enc);
5479           vpacksswb(dst, dst, dst, vlen_enc);
5480         } else {
5481           vpshufd(dst, src, 0x08, vlen_enc);
5482           vpermq(dst, dst, 0x08, vlen_enc);
5483           vpackssdw(dst, dst, dst, AVX_128bit);
5484           vpacksswb(dst, dst, dst, AVX_128bit);
5485         }
5486         break;
5487       }
5488       default: ShouldNotReachHere();
5489     }
5490   }
5491 }
5492 
5493 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5494                                    bool merge, BasicType bt, int vlen_enc) {
5495   if (bt == T_INT) {
5496     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5497   } else {
5498     assert(bt == T_LONG, "");
5499     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5500   }
5501 }
5502 
5503 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5504                                    bool merge, BasicType bt, int vlen_enc) {
5505   if (bt == T_INT) {
5506     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5507   } else {
5508     assert(bt == T_LONG, "");
5509     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5510   }
5511 }
5512 
5513 #ifdef _LP64
5514 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5515                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5516                                                int vec_enc) {
5517   int index = 0;
5518   int vindex = 0;
5519   mov64(rtmp1, 0x0101010101010101L);
5520   pdepq(rtmp1, src, rtmp1);
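       // pdepq deposits the 8 low mask bits of src into bit 0 of each byte of rtmp1, turning a bit
       // mask into a byte-per-lane mask. Illustrative example: src = 0b10110010 gives
       // rtmp1 = 0x0100010100000100, i.e. one 0x00/0x01 byte per mask bit.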
5521   if (mask_len > 8) {
5522     movq(rtmp2, src);
5523     vpxor(xtmp, xtmp, xtmp, vec_enc);
5524     movq(xtmp, rtmp1);
5525   }
5526   movq(dst, rtmp1);
5527 
5528   mask_len -= 8;
5529   while (mask_len > 0) {
5530     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5531     index++;
5532     if ((index % 2) == 0) {
5533       pxor(xtmp, xtmp);
5534     }
5535     mov64(rtmp1, 0x0101010101010101L);
5536     shrq(rtmp2, 8);
5537     pdepq(rtmp1, rtmp2, rtmp1);
5538     pinsrq(xtmp, rtmp1, index % 2);
5539     vindex = index / 2;
5540     if (vindex) {
5541       // Write the entire 16 byte vector when both 64 bit
5542       // lanes are updated, to save redundant instructions.
5543       if (index % 2) {
5544         vinsertf128(dst, dst, xtmp, vindex);
5545       }
5546     } else {
5547       vmovdqu(dst, xtmp);
5548     }
5549     mask_len -= 8;
5550   }
5551 }
5552 
5553 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5554   switch(opc) {
5555     case Op_VectorMaskTrueCount:
5556       popcntq(dst, tmp);
5557       break;
5558     case Op_VectorMaskLastTrue:
5559       if (VM_Version::supports_lzcnt()) {
5560         lzcntq(tmp, tmp);
5561         movl(dst, 63);
5562         subl(dst, tmp);
5563       } else {
5564         movl(dst, -1);
5565         bsrq(tmp, tmp);
5566         cmov32(Assembler::notZero, dst, tmp);
5567       }
5568       break;
5569     case Op_VectorMaskFirstTrue:
5570       if (VM_Version::supports_bmi1()) {
5571         if (masklen < 32) {
5572           orl(tmp, 1 << masklen);
5573           tzcntl(dst, tmp);
5574         } else if (masklen == 32) {
5575           tzcntl(dst, tmp);
5576         } else {
5577           assert(masklen == 64, "");
5578           tzcntq(dst, tmp);
5579         }
5580       } else {
5581         if (masklen < 32) {
5582           orl(tmp, 1 << masklen);
5583           bsfl(dst, tmp);
5584         } else {
5585           assert(masklen == 32 || masklen == 64, "");
5586           movl(dst, masklen);
5587           if (masklen == 32)  {
5588             bsfl(tmp, tmp);
5589           } else {
5590             bsfq(tmp, tmp);
5591           }
5592           cmov32(Assembler::notZero, dst, tmp);
5593         }
5594       }
5595       break;
5596     case Op_VectorMaskToLong:
5597       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5598       break;
5599     default: assert(false, "Unhandled mask operation");
5600   }
5601 }
5602 
5603 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5604                                               int masklen, int masksize, int vec_enc) {
5605   assert(VM_Version::supports_popcnt(), "");
5606 
5607   if(VM_Version::supports_avx512bw()) {
5608     kmovql(tmp, mask);
5609   } else {
5610     assert(masklen <= 16, "");
5611     kmovwl(tmp, mask);
5612   }
5613 
5614   // A mask generated from partial vector comparison/replicate/mask manipulation
5615   // operations needs to be clipped.
5616   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5617     andq(tmp, (1 << masklen) - 1);
5618   }
5619 
5620   vector_mask_operation_helper(opc, dst, tmp, masklen);
5621 }
5622 
5623 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5624                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5625   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5626          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5627   assert(VM_Version::supports_popcnt(), "");
5628 
5629   bool need_clip = false;
5630   switch(bt) {
5631     case T_BOOLEAN:
5632       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5633       vpxor(xtmp, xtmp, xtmp, vec_enc);
5634       vpsubb(xtmp, xtmp, mask, vec_enc);
5635       vpmovmskb(tmp, xtmp, vec_enc);
5636       need_clip = masklen < 16;
5637       break;
5638     case T_BYTE:
5639       vpmovmskb(tmp, mask, vec_enc);
5640       need_clip = masklen < 16;
5641       break;
5642     case T_SHORT:
5643       vpacksswb(xtmp, mask, mask, vec_enc);
5644       if (masklen >= 16) {
5645         vpermpd(xtmp, xtmp, 8, vec_enc);
5646       }
5647       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5648       need_clip = masklen < 16;
5649       break;
5650     case T_INT:
5651     case T_FLOAT:
5652       vmovmskps(tmp, mask, vec_enc);
5653       need_clip = masklen < 4;
5654       break;
5655     case T_LONG:
5656     case T_DOUBLE:
5657       vmovmskpd(tmp, mask, vec_enc);
5658       need_clip = masklen < 2;
5659       break;
5660     default: assert(false, "Unhandled type, %s", type2name(bt));
5661   }
5662 
5663   // A mask generated from partial vector comparison/replicate/mask manipulation
5664   // operations needs to be clipped.
5665   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5666     // need_clip implies masklen < 32
5667     andq(tmp, (1 << masklen) - 1);
5668   }
5669 
5670   vector_mask_operation_helper(opc, dst, tmp, masklen);
5671 }
5672 
5673 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5674                                              Register rtmp2, int mask_len) {
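       // Net effect: the set bits of src (clipped to mask_len) are compressed into a contiguous
       // low-order run, i.e. dst gets exactly POPCOUNT(mask_len low bits of src) bits set,
       // starting at bit 0.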
5675   kmov(rtmp1, src);
5676   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5677   mov64(rtmp2, -1L);
5678   pextq(rtmp2, rtmp2, rtmp1);
5679   kmov(dst, rtmp2);
5680 }
5681 
5682 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5683                                                     XMMRegister mask, Register rtmp, Register rscratch,
5684                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5685                                                     int vec_enc) {
5686   assert(type2aelembytes(bt) >= 4, "");
5687   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5688   address compress_perm_table = nullptr;
5689   address expand_perm_table = nullptr;
5690   if (type2aelembytes(bt) == 8) {
5691     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5692     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5693     vmovmskpd(rtmp, mask, vec_enc);
5694   } else {
5695     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5696     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5697     vmovmskps(rtmp, mask, vec_enc);
5698   }
5699   shlq(rtmp, 5); // for 32 byte permute row.
5700   if (opcode == Op_CompressV) {
5701     lea(rscratch, ExternalAddress(compress_perm_table));
5702   } else {
5703     lea(rscratch, ExternalAddress(expand_perm_table));
5704   }
5705   addptr(rtmp, rscratch);
5706   vmovdqu(permv, Address(rtmp));
5707   vpermps(dst, permv, src, Assembler::AVX_256bit);
5708   vpxor(xtmp, xtmp, xtmp, vec_enc);
5709   // Blend the result with a zero vector using the permute mask. Each column entry
5710   // in a permute table row contains either a valid permute index or a -1 (default)
5711   // value, so the same row can also serve as a blending mask after
5712   // compressing/expanding the source vector lanes.
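       // Illustrative example (assuming the stub tables follow the layout described above):
       // for an 8 x 32-bit compress with mask 0b00000101 the table row would have the form
       // [0, 2, -1, -1, -1, -1, -1, -1]; lanes 0 and 2 of src are packed to the front and the
       // -1 entries (sign bit set) select the zero vector in the blend below.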
5713   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5714 }
5715 
5716 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5717                                                bool merge, BasicType bt, int vec_enc) {
5718   if (opcode == Op_CompressV) {
5719     switch(bt) {
5720     case T_BYTE:
5721       evpcompressb(dst, mask, src, merge, vec_enc);
5722       break;
5723     case T_CHAR:
5724     case T_SHORT:
5725       evpcompressw(dst, mask, src, merge, vec_enc);
5726       break;
5727     case T_INT:
5728       evpcompressd(dst, mask, src, merge, vec_enc);
5729       break;
5730     case T_FLOAT:
5731       evcompressps(dst, mask, src, merge, vec_enc);
5732       break;
5733     case T_LONG:
5734       evpcompressq(dst, mask, src, merge, vec_enc);
5735       break;
5736     case T_DOUBLE:
5737       evcompresspd(dst, mask, src, merge, vec_enc);
5738       break;
5739     default:
5740       fatal("Unsupported type %s", type2name(bt));
5741       break;
5742     }
5743   } else {
5744     assert(opcode == Op_ExpandV, "");
5745     switch(bt) {
5746     case T_BYTE:
5747       evpexpandb(dst, mask, src, merge, vec_enc);
5748       break;
5749     case T_CHAR:
5750     case T_SHORT:
5751       evpexpandw(dst, mask, src, merge, vec_enc);
5752       break;
5753     case T_INT:
5754       evpexpandd(dst, mask, src, merge, vec_enc);
5755       break;
5756     case T_FLOAT:
5757       evexpandps(dst, mask, src, merge, vec_enc);
5758       break;
5759     case T_LONG:
5760       evpexpandq(dst, mask, src, merge, vec_enc);
5761       break;
5762     case T_DOUBLE:
5763       evexpandpd(dst, mask, src, merge, vec_enc);
5764       break;
5765     default:
5766       fatal("Unsupported type %s", type2name(bt));
5767       break;
5768     }
5769   }
5770 }
5771 #endif
5772 
5773 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5774                                            KRegister ktmp1, int vec_enc) {
5775   if (opcode == Op_SignumVD) {
5776     vsubpd(dst, zero, one, vec_enc);
5777     // if src < 0 ? -1 : 1
5778     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5779     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5780     // if src == NaN, -0.0 or 0.0 return src.
5781     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5782     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5783   } else {
5784     assert(opcode == Op_SignumVF, "");
5785     vsubps(dst, zero, one, vec_enc);
5786     // if src < 0 ? -1 : 1
5787     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5788     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5789     // if src == NaN, -0.0 or 0.0 return src.
5790     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5791     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5792   }
5793 }
5794 
5795 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5796                                           XMMRegister xtmp1, int vec_enc) {
5797   if (opcode == Op_SignumVD) {
5798     vsubpd(dst, zero, one, vec_enc);
5799     // if src < 0 ? -1 : 1
5800     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5801     // if src == NaN, -0.0 or 0.0 return src.
5802     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5803     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5804   } else {
5805     assert(opcode == Op_SignumVF, "");
5806     vsubps(dst, zero, one, vec_enc);
5807     // if src < 0 ? -1 : 1
5808     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5809     // if src == NaN, -0.0 or 0.0 return src.
5810     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5811     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5812   }
5813 }
5814 
5815 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5816   if (VM_Version::supports_avx512bw()) {
5817     if (mask_len > 32) {
5818       kmovql(dst, src);
5819     } else {
5820       kmovdl(dst, src);
5821       if (mask_len != 32) {
5822         kshiftrdl(dst, dst, 32 - mask_len);
5823       }
5824     }
5825   } else {
5826     assert(mask_len <= 16, "");
5827     kmovwl(dst, src);
5828     if (mask_len != 16) {
5829       kshiftrwl(dst, dst, 16 - mask_len);
5830     }
5831   }
5832 }
5833 
5834 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5835   int lane_size = type2aelembytes(bt);
5836   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5837   if ((is_LP64 || lane_size < 8) &&
5838       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5839        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5840     movptr(rtmp, imm32);
5841     switch(lane_size) {
5842       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5843       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5844       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5845       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5846       default: fatal("Unsupported lane size %d", lane_size);
5847       break;
5848     }
5849   } else {
5850     movptr(rtmp, imm32);
5851     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5852     switch(lane_size) {
5853       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5854       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5855       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5856       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5857       default: fatal("Unsupported lane size %d", lane_size);
5858       break;
5859     }
5860   }
5861 }
5862 
5863 //
5864 // Following is lookup table based popcount computation algorithm:-
5865 //       Index   Bit set count
5866 //     [ 0000 ->   0,
5867 //       0001 ->   1,
5868 //       0010 ->   1,
5869 //       0011 ->   2,
5870 //       0100 ->   1,
5871 //       0101 ->   2,
5872 //       0110 ->   2,
5873 //       0111 ->   3,
5874 //       1000 ->   1,
5875 //       1001 ->   2,
5876 //       1010 ->   2,
5877 //       1011 ->   3,
5878 //       1100 ->   2,
5879 //       1101 ->   3,
     //       1110 ->   3,
5880 //       1111 ->   4 ]
5881 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5882 //     shuffle indices for lookup table access.
5883 //  b. Right shift each byte of vector lane by 4 positions.
5884 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5885 //     shuffle indices for lookup table access.
5886 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5887 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5888 //     count of all the bytes of a quadword.
5889 //  f. Perform step e. for upper 128bit vector lane.
5890 //  g. Pack the bitset count of quadwords back to double word.
5891 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
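     //
     // A scalar sketch of steps a-d (illustrative only, not part of the emitted code; the
     // names POPC_LUT and popcount_byte are hypothetical):
     //   static const uint8_t POPC_LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
     //   int popcount_byte(uint8_t b) {
     //     return POPC_LUT[b & 0x0F] + POPC_LUT[b >> 4];
     //   }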
5892 
5893 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5894                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5895   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5896   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5897   vpsrlw(dst, src, 4, vec_enc);
5898   vpand(dst, dst, xtmp1, vec_enc);
5899   vpand(xtmp1, src, xtmp1, vec_enc);
5900   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5901   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5902   vpshufb(dst, xtmp2, dst, vec_enc);
5903   vpaddb(dst, dst, xtmp1, vec_enc);
5904 }
5905 
5906 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5907                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5908   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5909   // Following code is as per steps e,f,g and h of above algorithm.
5910   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5911   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5912   vpsadbw(dst, dst, xtmp2, vec_enc);
5913   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5914   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5915   vpackuswb(dst, xtmp1, dst, vec_enc);
5916 }
5917 
5918 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5919                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5920   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5921   // Add the popcount of upper and lower bytes of word.
5922   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5923   vpsrlw(dst, xtmp1, 8, vec_enc);
5924   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5925   vpaddw(dst, dst, xtmp1, vec_enc);
5926 }
5927 
5928 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5929                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5930   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5931   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5932   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5933 }
5934 
5935 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5936                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5937   switch(bt) {
5938     case T_LONG:
5939       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5940       break;
5941     case T_INT:
5942       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5943       break;
5944     case T_CHAR:
5945     case T_SHORT:
5946       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5947       break;
5948     case T_BYTE:
5949     case T_BOOLEAN:
5950       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5951       break;
5952     default:
5953       fatal("Unsupported type %s", type2name(bt));
5954       break;
5955   }
5956 }
5957 
5958 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5959                                                       KRegister mask, bool merge, int vec_enc) {
5960   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5961   switch(bt) {
5962     case T_LONG:
5963       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5964       evpopcntq(dst, mask, src, merge, vec_enc);
5965       break;
5966     case T_INT:
5967       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5968       evpopcntd(dst, mask, src, merge, vec_enc);
5969       break;
5970     case T_CHAR:
5971     case T_SHORT:
5972       assert(VM_Version::supports_avx512_bitalg(), "");
5973       evpopcntw(dst, mask, src, merge, vec_enc);
5974       break;
5975     case T_BYTE:
5976     case T_BOOLEAN:
5977       assert(VM_Version::supports_avx512_bitalg(), "");
5978       evpopcntb(dst, mask, src, merge, vec_enc);
5979       break;
5980     default:
5981       fatal("Unsupported type %s", type2name(bt));
5982       break;
5983   }
5984 }
5985 
5986 #ifndef _LP64
5987 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5988   assert(VM_Version::supports_avx512bw(), "");
5989   kmovdl(tmp, src);
5990   kunpckdql(dst, tmp, tmp);
5991 }
5992 #endif
5993 
5994 // Bit reversal algorithm first reverses the bits of each byte followed by
5995 // a byte level reversal for multi-byte primitive types (short/int/long).
5996 // Algorithm performs a lookup table access to get reverse bit sequence
5997 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5998 // is obtained by swapping the reverse bit sequences of upper and lower
5999 // nibble of a byte.
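     //
     // A scalar sketch of the per-byte step (illustrative only, not part of the emitted code;
     // the names REV_LUT and reverse_byte are hypothetical):
     //   static const uint8_t REV_LUT[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
     //                                       0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
     //   uint8_t reverse_byte(uint8_t b) {
     //     return (uint8_t)((REV_LUT[b & 0x0F] << 4) | REV_LUT[b >> 4]);
     //   }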
6000 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6001                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6002   if (VM_Version::supports_avx512vlbw()) {
6003 
6004     // Get the reverse bit sequence of lower nibble of each byte.
6005     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6006     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6007     evpandq(dst, xtmp2, src, vec_enc);
6008     vpshufb(dst, xtmp1, dst, vec_enc);
6009     vpsllq(dst, dst, 4, vec_enc);
6010 
6011     // Get the reverse bit sequence of upper nibble of each byte.
6012     vpandn(xtmp2, xtmp2, src, vec_enc);
6013     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6014     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6015 
6016     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6017     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6018     evporq(xtmp2, dst, xtmp2, vec_enc);
6019     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6020 
6021   } else if(vec_enc == Assembler::AVX_512bit) {
6022     // Shift based bit reversal.
6023     assert(bt == T_LONG || bt == T_INT, "");
6024 
6025     // Swap lower and upper nibble of each byte.
6026     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6027 
6028     // Swap two least and most significant bits of each nibble.
6029     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6030 
6031     // Swap adjacent pair of bits.
6032     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6033     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6034 
6035     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6036     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6037   } else {
6038     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6039     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6040 
6041     // Get the reverse bit sequence of lower nibble of each byte.
6042     vpand(dst, xtmp2, src, vec_enc);
6043     vpshufb(dst, xtmp1, dst, vec_enc);
6044     vpsllq(dst, dst, 4, vec_enc);
6045 
6046     // Get the reverse bit sequence of upper nibble of each byte.
6047     vpandn(xtmp2, xtmp2, src, vec_enc);
6048     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6049     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6050 
6051     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6052     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6053     vpor(xtmp2, dst, xtmp2, vec_enc);
6054     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6055   }
6056 }
6057 
6058 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6059                                                 XMMRegister xtmp, Register rscratch) {
6060   assert(VM_Version::supports_gfni(), "");
6061   assert(rscratch != noreg || always_reachable(mask), "missing");
6062 
6063   // Galois field instruction based bit reversal, as per the following algorithm:
6064   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6065   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6066   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6067   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6068 }
6069 
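     // Swaps adjacent groups of nbits bits selected by bitmask. Per 64 bit lane this computes
     //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
     // where bitmask is the 32 bit pattern broadcast across the vector; this is the building
     // block used by the shift based bit reversal above.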
6070 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6071                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6072   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6073   evpandq(dst, xtmp1, src, vec_enc);
6074   vpsllq(dst, dst, nbits, vec_enc);
6075   vpandn(xtmp1, xtmp1, src, vec_enc);
6076   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6077   evporq(dst, dst, xtmp1, vec_enc);
6078 }
6079 
6080 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6081                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6082   // Shift based bit reversal.
6083   assert(VM_Version::supports_evex(), "");
6084   switch(bt) {
6085     case T_LONG:
6086       // Swap upper and lower double word of each quad word.
6087       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6088       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6089       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6090       break;
6091     case T_INT:
6092       // Swap upper and lower word of each double word.
6093       evprord(xtmp1, k0, src, 16, true, vec_enc);
6094       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6095       break;
6096     case T_CHAR:
6097     case T_SHORT:
6098       // Swap upper and lower byte of each word.
6099       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6100       break;
6101     case T_BYTE:
6102       evmovdquq(dst, k0, src, true, vec_enc);
6103       break;
6104     default:
6105       fatal("Unsupported type %s", type2name(bt));
6106       break;
6107   }
6108 }
6109 
6110 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6111   if (bt == T_BYTE) {
6112     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6113       evmovdquq(dst, k0, src, true, vec_enc);
6114     } else {
6115       vmovdqu(dst, src);
6116     }
6117     return;
6118   }
6119   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6120   // pre-computed shuffle indices.
6121   switch(bt) {
6122     case T_LONG:
6123       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6124       break;
6125     case T_INT:
6126       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6127       break;
6128     case T_CHAR:
6129     case T_SHORT:
6130       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6131       break;
6132     default:
6133       fatal("Unsupported type %s", type2name(bt));
6134       break;
6135   }
6136   vpshufb(dst, src, dst, vec_enc);
6137 }
6138 
6139 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6140                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6141                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6142   assert(is_integral_type(bt), "");
6143   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6144   assert(VM_Version::supports_avx512cd(), "");
6145   switch(bt) {
6146     case T_LONG:
6147       evplzcntq(dst, ktmp, src, merge, vec_enc);
6148       break;
6149     case T_INT:
6150       evplzcntd(dst, ktmp, src, merge, vec_enc);
6151       break;
6152     case T_SHORT:
6153       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6154       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6155       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6156       vpunpckhwd(dst, xtmp1, src, vec_enc);
6157       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6158       vpackusdw(dst, xtmp2, dst, vec_enc);
6159       break;
6160     case T_BYTE:
6161       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6162       // accessing the lookup table.
6163       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6164       // accessing the lookup table.
6165       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6166       assert(VM_Version::supports_avx512bw(), "");
6167       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6168       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6169       vpand(xtmp2, dst, src, vec_enc);
6170       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6171       vpsrlw(xtmp3, src, 4, vec_enc);
6172       vpand(xtmp3, dst, xtmp3, vec_enc);
6173       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6174       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6175       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6176       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6177       break;
6178     default:
6179       fatal("Unsupported type %s", type2name(bt));
6180       break;
6181   }
6182 }
6183 
6184 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6185                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6186   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6187   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6188   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6189   // accessing the lookup table.
6190   vpand(dst, xtmp2, src, vec_enc);
6191   vpshufb(dst, xtmp1, dst, vec_enc);
6192   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6193   // accessing the lookup table.
6194   vpsrlw(xtmp3, src, 4, vec_enc);
6195   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6196   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6197   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6198   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6199   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6200   vpaddb(dst, dst, xtmp2, vec_enc);
6201   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6202 }
6203 
6204 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6205                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6206   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6207   // Add zero counts of lower byte and upper byte of a word if
6208   // upper byte holds a zero value.
6209   vpsrlw(xtmp3, src, 8, vec_enc);
6210   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6211   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6212   vpsllw(xtmp2, dst, 8, vec_enc);
6213   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6214   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6215   vpsrlw(dst, dst, 8, vec_enc);
6216 }
6217 
6218 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6219                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6220   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
6221   // the biased exponent can be used to compute the leading zero count as per the
6222   // following formula:
6223   // LZCNT = 31 - (biased_exp - 127)
6224   // Special handling has been introduced for zero, MAX_INT and negative source values.
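       // Worked example: src = 16 converts to 16.0f, whose biased exponent is 131,
       // so LZCNT = 31 - (131 - 127) = 27, matching the 27 leading zeros of 0x00000010.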
6225 
6226   // Broadcast 0xFF
6227   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6228   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6229 
6230   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6231   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6232   // contributes to the leading number of zeros.
6233   vpsrld(xtmp2, src, 1, vec_enc);
6234   vpandn(xtmp3, xtmp2, src, vec_enc);
6235 
6236   // Extract biased exponent.
6237   vcvtdq2ps(dst, xtmp3, vec_enc);
6238   vpsrld(dst, dst, 23, vec_enc);
6239   vpand(dst, dst, xtmp1, vec_enc);
6240 
6241   // Broadcast 127.
6242   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6243   // Exponent = biased_exp - 127
6244   vpsubd(dst, dst, xtmp1, vec_enc);
6245 
6246   // Exponent_plus_one = Exponent + 1
6247   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6248   vpaddd(dst, dst, xtmp3, vec_enc);
6249 
6250   // Replace -ve exponent with zero, exponent is -ve when src
6251   // lane contains a zero value.
6252   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6253   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6254 
6255   // Rematerialize broadcast 32.
6256   vpslld(xtmp1, xtmp3, 5, vec_enc);
6257   // Exponent is 32 if corresponding source lane contains max_int value.
6258   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6259   // LZCNT = 32 - exponent_plus_one
6260   vpsubd(dst, xtmp1, dst, vec_enc);
6261 
6262   // Replace LZCNT with a value 1 if corresponding source lane
6263   // contains max_int value.
6264   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6265 
6266   // Replace the count with 0 if the source lane value is negative (LZCNT of a negative value is 0).
6267   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6268   vblendvps(dst, dst, xtmp2, src, vec_enc);
6269 }
6270 
6271 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6272                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6273   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6274   // Add zero counts of lower word and upper word of a double word if
6275   // upper word holds a zero value.
6276   vpsrld(xtmp3, src, 16, vec_enc);
6277   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6278   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6279   vpslld(xtmp2, dst, 16, vec_enc);
6280   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6281   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6282   vpsrld(dst, dst, 16, vec_enc);
6283   // Add zero counts of lower doubleword and upper doubleword of a
6284   // quadword if upper doubleword holds a zero value.
6285   vpsrlq(xtmp3, src, 32, vec_enc);
6286   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6287   vpsllq(xtmp2, dst, 32, vec_enc);
6288   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6289   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6290   vpsrlq(dst, dst, 32, vec_enc);
6291 }
6292 
6293 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6294                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6295                                                        Register rtmp, int vec_enc) {
6296   assert(is_integral_type(bt), "unexpected type");
6297   assert(vec_enc < Assembler::AVX_512bit, "");
6298   switch(bt) {
6299     case T_LONG:
6300       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6301       break;
6302     case T_INT:
6303       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6304       break;
6305     case T_SHORT:
6306       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6307       break;
6308     case T_BYTE:
6309       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6310       break;
6311     default:
6312       fatal("Unsupported type %s", type2name(bt));
6313       break;
6314   }
6315 }
6316 
6317 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6318   switch(bt) {
6319     case T_BYTE:
6320       vpsubb(dst, src1, src2, vec_enc);
6321       break;
6322     case T_SHORT:
6323       vpsubw(dst, src1, src2, vec_enc);
6324       break;
6325     case T_INT:
6326       vpsubd(dst, src1, src2, vec_enc);
6327       break;
6328     case T_LONG:
6329       vpsubq(dst, src1, src2, vec_enc);
6330       break;
6331     default:
6332       fatal("Unsupported type %s", type2name(bt));
6333       break;
6334   }
6335 }
6336 
6337 // Trailing zero count computation is based on the leading zero count operation as per
6338 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
6339 // a direct vector instruction to compute the leading zero count.
6340 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
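     // Worked example (32 bit lanes): x = 8 gives (x - 1) & ~x = 7, CLZ(7) = 29,
     // so CTZ = 32 - 29 = 3, the number of trailing zeros of 8.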
6341 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6342                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6343                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6344   assert(is_integral_type(bt), "");
6345   // xtmp = -1
6346   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6347   // xtmp = xtmp + src
6348   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6349   // xtmp = xtmp & ~src
6350   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6351   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6352   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6353   vpsub(bt, dst, xtmp4, dst, vec_enc);
6354 }
6355 
6356 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
6357 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
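     // Worked example (32 bit lanes): x = 8 gives x | -x = 0xFFFFFFF8, POPC = 29,
     // so CTZ = 32 - 29 = 3.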
6358 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6359                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6360   assert(is_integral_type(bt), "");
6361   // xtmp = 0
6362   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6363   // xtmp = 0 - src
6364   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6365   // xtmp = xtmp | src
6366   vpor(xtmp3, xtmp3, src, vec_enc);
6367   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6368   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6369   vpsub(bt, dst, xtmp1, dst, vec_enc);
6370 }
6371 
6372 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6373   Label done;
6374   Label neg_divisor_fastpath;
6375   cmpl(divisor, 0);
6376   jccb(Assembler::less, neg_divisor_fastpath);
6377   xorl(rdx, rdx);
6378   divl(divisor);
6379   jmpb(done);
6380   bind(neg_divisor_fastpath);
6381   // Fastpath for divisor < 0:
6382   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6383   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
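       // A divisor that is negative as a signed int is >= 2^31 when treated as unsigned, so the
       // unsigned quotient can only be 0 or 1; it is 1 exactly when dividend >=u divisor, which
       // the branch-free sign-bit computation below evaluates.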
6384   movl(rdx, rax);
6385   subl(rdx, divisor);
6386   if (VM_Version::supports_bmi1()) {
6387     andnl(rax, rdx, rax);
6388   } else {
6389     notl(rdx);
6390     andl(rax, rdx);
6391   }
6392   shrl(rax, 31);
6393   bind(done);
6394 }
6395 
6396 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6397   Label done;
6398   Label neg_divisor_fastpath;
6399   cmpl(divisor, 0);
6400   jccb(Assembler::less, neg_divisor_fastpath);
6401   xorl(rdx, rdx);
6402   divl(divisor);
6403   jmpb(done);
6404   bind(neg_divisor_fastpath);
6405   // Fastpath when divisor < 0:
6406   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6407   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6408   movl(rdx, rax);
6409   subl(rax, divisor);
6410   if (VM_Version::supports_bmi1()) {
6411     andnl(rax, rax, rdx);
6412   } else {
6413     notl(rax);
6414     andl(rax, rdx);
6415   }
6416   sarl(rax, 31);
6417   andl(rax, divisor);
6418   subl(rdx, rax);
6419   bind(done);
6420 }
6421 
6422 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6423   Label done;
6424   Label neg_divisor_fastpath;
6425 
6426   cmpl(divisor, 0);
6427   jccb(Assembler::less, neg_divisor_fastpath);
6428   xorl(rdx, rdx);
6429   divl(divisor);
6430   jmpb(done);
6431   bind(neg_divisor_fastpath);
6432   // Fastpath for divisor < 0:
6433   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6434   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6435   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6436   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6437   movl(rdx, rax);
6438   subl(rax, divisor);
6439   if (VM_Version::supports_bmi1()) {
6440     andnl(rax, rax, rdx);
6441   } else {
6442     notl(rax);
6443     andl(rax, rdx);
6444   }
6445   movl(tmp, rax);
6446   shrl(rax, 31); // quotient
6447   sarl(tmp, 31);
6448   andl(tmp, divisor);
6449   subl(rdx, tmp); // remainder
6450   bind(done);
6451 }
6452 
6453 #ifdef _LP64
6454 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6455                                  XMMRegister xtmp2, Register rtmp) {
6456   if(VM_Version::supports_gfni()) {
6457     // Galois field instruction based bit reversal, as per the following algorithm:
6458     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6459     mov64(rtmp, 0x8040201008040201L);
6460     movq(xtmp1, src);
6461     movq(xtmp2, rtmp);
6462     gf2p8affineqb(xtmp1, xtmp2, 0);
6463     movq(dst, xtmp1);
6464   } else {
6465     // Swap even and odd numbered bits.
6466     movl(rtmp, src);
6467     andl(rtmp, 0x55555555);
6468     shll(rtmp, 1);
6469     movl(dst, src);
6470     andl(dst, 0xAAAAAAAA);
6471     shrl(dst, 1);
6472     orl(dst, rtmp);
6473 
6474     // Swap LSB and MSB 2 bits of each nibble.
6475     movl(rtmp, dst);
6476     andl(rtmp, 0x33333333);
6477     shll(rtmp, 2);
6478     andl(dst, 0xCCCCCCCC);
6479     shrl(dst, 2);
6480     orl(dst, rtmp);
6481 
6482     // Swap LSB and MSB 4 bits of each byte.
6483     movl(rtmp, dst);
6484     andl(rtmp, 0x0F0F0F0F);
6485     shll(rtmp, 4);
6486     andl(dst, 0xF0F0F0F0);
6487     shrl(dst, 4);
6488     orl(dst, rtmp);
6489   }
6490   bswapl(dst);
6491 }
6492 
6493 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6494                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6495   if(VM_Version::supports_gfni()) {
6496     // Galois field instruction based bit reversal, as per the following algorithm:
6497     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6498     mov64(rtmp1, 0x8040201008040201L);
6499     movq(xtmp1, src);
6500     movq(xtmp2, rtmp1);
6501     gf2p8affineqb(xtmp1, xtmp2, 0);
6502     movq(dst, xtmp1);
6503   } else {
6504     // Swap even and odd numbered bits.
6505     movq(rtmp1, src);
6506     mov64(rtmp2, 0x5555555555555555L);
6507     andq(rtmp1, rtmp2);
6508     shlq(rtmp1, 1);
6509     movq(dst, src);
6510     notq(rtmp2);
6511     andq(dst, rtmp2);
6512     shrq(dst, 1);
6513     orq(dst, rtmp1);
6514 
6515     // Swap LSB and MSB 2 bits of each nibble.
6516     movq(rtmp1, dst);
6517     mov64(rtmp2, 0x3333333333333333L);
6518     andq(rtmp1, rtmp2);
6519     shlq(rtmp1, 2);
6520     notq(rtmp2);
6521     andq(dst, rtmp2);
6522     shrq(dst, 2);
6523     orq(dst, rtmp1);
6524 
6525     // Swap LSB and MSB 4 bits of each byte.
6526     movq(rtmp1, dst);
6527     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6528     andq(rtmp1, rtmp2);
6529     shlq(rtmp1, 4);
6530     notq(rtmp2);
6531     andq(dst, rtmp2);
6532     shrq(dst, 4);
6533     orq(dst, rtmp1);
6534   }
6535   bswapq(dst);
6536 }
6537 
6538 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6539   Label done;
6540   Label neg_divisor_fastpath;
6541   cmpq(divisor, 0);
6542   jccb(Assembler::less, neg_divisor_fastpath);
6543   xorl(rdx, rdx);
6544   divq(divisor);
6545   jmpb(done);
6546   bind(neg_divisor_fastpath);
6547   // Fastpath for divisor < 0:
6548   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6549   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6550   movq(rdx, rax);
6551   subq(rdx, divisor);
6552   if (VM_Version::supports_bmi1()) {
6553     andnq(rax, rdx, rax);
6554   } else {
6555     notq(rdx);
6556     andq(rax, rdx);
6557   }
6558   shrq(rax, 63);
6559   bind(done);
6560 }
6561 
6562 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6563   Label done;
6564   Label neg_divisor_fastpath;
6565   cmpq(divisor, 0);
6566   jccb(Assembler::less, neg_divisor_fastpath);
6567   xorq(rdx, rdx);
6568   divq(divisor);
6569   jmp(done);
6570   bind(neg_divisor_fastpath);
6571   // Fastpath when divisor < 0:
6572   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6573   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6574   movq(rdx, rax);
6575   subq(rax, divisor);
6576   if (VM_Version::supports_bmi1()) {
6577     andnq(rax, rax, rdx);
6578   } else {
6579     notq(rax);
6580     andq(rax, rdx);
6581   }
6582   sarq(rax, 63);
6583   andq(rax, divisor);
6584   subq(rdx, rax);
6585   bind(done);
6586 }
6587 
6588 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6589   Label done;
6590   Label neg_divisor_fastpath;
6591   cmpq(divisor, 0);
6592   jccb(Assembler::less, neg_divisor_fastpath);
6593   xorq(rdx, rdx);
6594   divq(divisor);
6595   jmp(done);
6596   bind(neg_divisor_fastpath);
6597   // Fastpath for divisor < 0:
6598   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6599   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6600   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6601   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6602   movq(rdx, rax);
6603   subq(rax, divisor);
6604   if (VM_Version::supports_bmi1()) {
6605     andnq(rax, rax, rdx);
6606   } else {
6607     notq(rax);
6608     andq(rax, rdx);
6609   }
6610   movq(tmp, rax);
6611   shrq(rax, 63); // quotient
6612   sarq(tmp, 63);
6613   andq(tmp, divisor);
6614   subq(rdx, tmp); // remainder
6615   bind(done);
6616 }
6617 #endif
6618 
6619 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6620                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6621                                         int vlen_enc) {
6622   assert(VM_Version::supports_avx512bw(), "");
6623   // Byte shuffles are in-lane operations and the index within a 128 bit lane is taken
6624   // from the lower 4 bits of each shuffle byte, thus all shuffle indices are effectively
6625   // normalized to the range 0-15. Indices that differ only by a multiple of 16
6626   // (e.g. 3, 19, 35, 51) therefore select the same relative byte position; the 128 bit
6627   // source lane they refer to is selected separately below by range checks on the
6628   // full index value.
6629   movl(rtmp, 16);
6630   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6631 
6632   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6633   // Broadcast the first 128 bit lane across the entire vector, shuffle it using the
6634   // original shuffle indices and move the shuffled bytes corresponding to a true
6635   // mask into the destination vector.
6636   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6637   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6638   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6639 
6640   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6641   // and broadcasting second 128 bit lane.
6642   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6643   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6644   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6645   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6646   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6647 
6648   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6649   // and broadcasting third 128 bit lane.
6650   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6651   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6652   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6653   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6654   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6655 
6656   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6657   // and broadcasting the fourth 128 bit lane.
6658   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6659   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6660   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6661   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6662   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6663 }
6664 
6665 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6666                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6667   if (vlen_enc == AVX_128bit) {
6668     vpermilps(dst, src, shuffle, vlen_enc);
6669   } else if (bt == T_INT) {
6670     vpermd(dst, shuffle, src, vlen_enc);
6671   } else {
6672     assert(bt == T_FLOAT, "");
6673     vpermps(dst, shuffle, src, vlen_enc);
6674   }
6675 }
6676 
6677 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6678   switch(opcode) {
6679     case Op_AddHF: vaddsh(dst, src1, src2); break;
6680     case Op_SubHF: vsubsh(dst, src1, src2); break;
6681     case Op_MulHF: vmulsh(dst, src1, src2); break;
6682     case Op_DivHF: vdivsh(dst, src1, src2); break;
6683     case Op_MaxHF: vmaxsh(dst, src1, src2); break;
6684     case Op_MinHF: vminsh(dst, src1, src2); break;
6685     default: assert(false, "%s", NodeClassNames[opcode]); break;
6686   }
6687 }
6688 
6689 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6690   switch(elem_bt) {
6691     case T_BYTE:
6692       if (ideal_opc == Op_SaturatingAddV) {
6693         vpaddsb(dst, src1, src2, vlen_enc);
6694       } else {
6695         assert(ideal_opc == Op_SaturatingSubV, "");
6696         vpsubsb(dst, src1, src2, vlen_enc);
6697       }
6698       break;
6699     case T_SHORT:
6700       if (ideal_opc == Op_SaturatingAddV) {
6701         vpaddsw(dst, src1, src2, vlen_enc);
6702       } else {
6703         assert(ideal_opc == Op_SaturatingSubV, "");
6704         vpsubsw(dst, src1, src2, vlen_enc);
6705       }
6706       break;
6707     default:
6708       fatal("Unsupported type %s", type2name(elem_bt));
6709       break;
6710   }
6711 }
6712 
6713 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6714   switch(elem_bt) {
6715     case T_BYTE:
6716       if (ideal_opc == Op_SaturatingAddV) {
6717         vpaddusb(dst, src1, src2, vlen_enc);
6718       } else {
6719         assert(ideal_opc == Op_SaturatingSubV, "");
6720         vpsubusb(dst, src1, src2, vlen_enc);
6721       }
6722       break;
6723     case T_SHORT:
6724       if (ideal_opc == Op_SaturatingAddV) {
6725         vpaddusw(dst, src1, src2, vlen_enc);
6726       } else {
6727         assert(ideal_opc == Op_SaturatingSubV, "");
6728         vpsubusw(dst, src1, src2, vlen_enc);
6729       }
6730       break;
6731     default:
6732       fatal("Unsupported type %s", type2name(elem_bt));
6733       break;
6734   }
6735 }
6736 
6737 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6738                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6739   // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6740   // overflow_mask = Inp1 <u Inp2
6741   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6742   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6743   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6744 }
6745 
6746 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6747                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6748   // Emulate unsigned comparison using signed comparison
6749   // Mask = Inp1 <u Inp2  =>  (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
6750   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6751   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6752   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6753 
6754   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6755 
6756   // Res = INP1 - INP2 (non-commutative and non-associative)
6757   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6758   // Res = Mask ? Zero : Res
6759   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6760   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6761 }
6762 
6763 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6764                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6765   // The unsigned value range comprises only non-negative numbers, thus only an upper bound saturation exists.
6766   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6767   // Res = Signed Add INP1, INP2
6768   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6769   // T1 = SRC1 | SRC2
6770   vpor(xtmp1, src1, src2, vlen_enc);
6771   // Max_Unsigned = -1
6772   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6773   // Unsigned compare:  Mask = Res <u T1
6774   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6775   // res  = Mask ? Max_Unsigned : Res
6776   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6777 }
6778 
6779 //
6780 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the
6781 // saturating unsigned addition operation.
6782 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6783 //
6784 // We empirically determined its semantic equivalence to the following reduced expression
6785 //    overflow_mask =  (a + b) <u (a | b)
6786 //
6787 // and also verified it through the Alive2 solver.
6788 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6789 //
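     // Worked example (32 bit lanes): a = 0xFFFFFFFF, b = 1 gives a + b = 0 and a | b = 0xFFFFFFFF,
     // so 0 <u 0xFFFFFFFF flags the overflow; a = 1, b = 2 gives 3 <u 3 == false, no overflow.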
6790 
6791 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6792                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6793   // Res = Signed Add INP1, INP2
6794   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6795   // Compute T1 = INP1 | INP2
6796   vpor(xtmp3, src1, src2, vlen_enc);
6797   // xtmp2 = Minimum signed value; xtmp1 is left holding all ones (unsigned MAX, used as the saturation value in the final blend).
6798   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6799   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6800   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6801   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6802   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6803   // Compute overflow detection mask = Res<s> <s T1
6804   if (elem_bt == T_INT) {
6805     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6806   } else {
6807     assert(elem_bt == T_LONG, "");
6808     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6809   }
6810   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); // Res = overflow_mask ? Max_Unsigned (xtmp1 holds all ones) : Res
6811 }
6812 
6813 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6814                                       int vlen_enc, bool xtmp2_hold_M1) {
6815   if (VM_Version::supports_avx512dq()) {
6816     evpmovq2m(ktmp, src, vlen_enc);
6817   } else {
6818     assert(VM_Version::supports_evex(), "");
6819     if (!xtmp2_hold_M1) {
6820       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6821     }
6822     evpsraq(xtmp1, src, 63, vlen_enc);
6823     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6824   }
6825 }
6826 
6827 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6828                                       int vlen_enc, bool xtmp2_hold_M1) {
6829   if (VM_Version::supports_avx512dq()) {
6830     evpmovd2m(ktmp, src, vlen_enc);
6831   } else {
6832     assert(VM_Version::supports_evex(), "");
6833     if (!xtmp2_hold_M1) {
6834       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6835     }
6836     vpsrad(xtmp1, src, 31, vlen_enc);
6837     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6838   }
6839 }
6840 
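     // What the two emulations above compute per lane, as a scalar sketch (illustrative only;
     // assumes arithmetic right shift on a two's complement int32_t/int64_t):
     //
     //   bool msb_to_mask_bit(int32_t x) {
     //     int32_t sign = x >> 31;   // vpsrad/evpsraq: 0 for non-negative lanes, -1 for negative lanes
     //     return sign == -1;        // evpcmpeqd/evpcmpeqq against the all-ones vector sets the k-bit
     //   }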
6841 
6842 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6843   if (elem_bt == T_LONG) {
6844     if (VM_Version::supports_evex()) {
6845       evpsraq(dst, src, 63, vlen_enc);
6846     } else {
6847       vpsrad(dst, src, 31, vlen_enc);
6848       vpshufd(dst, dst, 0xF5, vlen_enc);
6849     }
6850   } else {
6851     assert(elem_bt == T_INT, "");
6852     vpsrad(dst, src, 31, vlen_enc);
6853   }
6854 }
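
     // The AVX T_LONG path above builds a 64-bit sign mask without evpsraq: vpsrad turns every dword
     // into 0/-1 based on its own sign, and vpshufd with control 0xF5 (dwords {1,1,3,3} per 128-bit
     // lane) copies each qword's high-dword result into both halves. Scalar sketch (illustrative only):
     //
     //   int64_t sign_extend_bits_q(int64_t x) {
     //     int32_t hi_sign = (int32_t)(x >> 32) >> 31;                              // vpsrad: 0 or -1
     //     uint64_t both = ((uint64_t)(uint32_t)hi_sign << 32) | (uint32_t)hi_sign; // vpshufd 0xF5
     //     return (int64_t)both;                                                    // 0 or -1 per qword
     //   }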
6855 
6856 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6857   if (compute_allones) {
6858     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6859       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6860     } else {
6861       vpcmpeqq(allones, allones, allones, vlen_enc);
6862     }
6863   }
6864   if (elem_bt == T_LONG) {
6865     vpsrlq(dst, allones, 1, vlen_enc);
6866   } else {
6867     assert(elem_bt == T_INT, "");
6868     vpsrld(dst, allones, 1, vlen_enc);
6869   }
6870 }
6871 
6872 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6873   if (compute_allones) {
6874     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6875       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6876     } else {
6877       vpcmpeqq(allones, allones, allones, vlen_enc);
6878     }
6879   }
6880   if (elem_bt == T_LONG) {
6881     vpsllq(dst, allones, 63, vlen_enc);
6882   } else {
6883     assert(elem_bt == T_INT, "");
6884     vpslld(dst, allones, 31, vlen_enc);
6885   }
6886 }
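
     // Both helpers above derive their constant from an all-ones vector, as in this scalar sketch
     // (illustrative only; two's complement assumed, 32-bit case shown):
     //
     //   uint32_t all_ones = ~0u;                        // vpternlogd(0xff) / vpcmpeqq with itself
     //   int32_t  max_i    = (int32_t)(all_ones >> 1);   // 0x7FFFFFFF, vpsrld/vpsrlq by 1
     //   int32_t  min_i    = (int32_t)(all_ones << 31);  // 0x80000000, vpslld/vpsllq by 31/63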
6887 
6888 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6889                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6890   switch(elem_bt) {
6891     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6892     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6893     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6894     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6895     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6896   }
6897 }
6898 
6899 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6900   switch(elem_bt) {
6901     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6902     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6903     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6904     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6905     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6906   }
6907 }
6908 
6909 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6910                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6911   if (elem_bt == T_LONG) {
6912     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6913   } else {
6914     assert(elem_bt == T_INT, "");
6915     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6916   }
6917 }
6918 
6919 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6920                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6921                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6922   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6923   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6924   // Overflow detection is based on Hacker's Delight, section 2-13.
6925   if (ideal_opc == Op_SaturatingAddV) {
6926     // res = src1 + src2
6927     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6928     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6929     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6930     vpxor(xtmp1, dst, src1, vlen_enc);
6931     vpxor(xtmp2, dst, src2, vlen_enc);
6932     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6933   } else {
6934     assert(ideal_opc == Op_SaturatingSubV, "");
6935     // res = src1 - src2
6936     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6937     // Overflow occurs when the inputs have opposite signs and
6938     // the result's sign differs from the first input's sign.
6939     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
6940     vpxor(xtmp1, src1, src2, vlen_enc);
6941     vpxor(xtmp2, dst, src1, vlen_enc);
6942     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6943   }
6944 
6945   // Compute overflow detection mask.
6946   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6947   // Note: xtmp1 holds -1 in all its lanes after the above call when the AVX512DQ-less emulation path is taken.
6948 
6949   // Compute mask based on first input polarity.
6950   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6951 
6952   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6953   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6954 
6955   // Compose a vector of saturating (MAX/MIN) values: lanes corresponding to
6956   // set bits in the first input polarity mask hold the MIN value.
6957   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6958   // Blend destination lanes with saturated values using overflow detection mask.
6959   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6960 }
6961 
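     // The signed saturating flow above, as a scalar sketch for 32-bit lanes (illustrative only;
     // wrap-around arithmetic and two's complement assumed; INT_MIN/INT_MAX are the
     // vpgenmin_value/vpgenmax_value constants):
     //
     //   int32_t sat_addsub_i32(int32_t a, int32_t b, bool is_sub) {
     //     int32_t res = is_sub ? (int32_t)((uint32_t)a - (uint32_t)b)
     //                          : (int32_t)((uint32_t)a + (uint32_t)b);   // wrapping add/sub
     //     int32_t ovf = is_sub ? ((a ^ b) & (res ^ a))    // inputs differ in sign, result flips from a
     //                          : ((res ^ a) & (res ^ b)); // inputs share a sign, result flips from it
     //     if (ovf < 0) {                                  // sign bit is the overflow mask
     //       return (a < 0) ? INT_MIN : INT_MAX;           // saturate toward the first input's polarity
     //     }
     //     return res;
     //   }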
6962 
6963 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6964                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6965                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6966   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6967   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6968   // Overflow detection is based on Hacker's Delight, section 2-13.
6969   if (ideal_opc == Op_SaturatingAddV) {
6970     // res = src1 + src2
6971     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6972     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6973     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6974     vpxor(xtmp1, dst, src1, vlen_enc);
6975     vpxor(xtmp2, dst, src2, vlen_enc);
6976     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6977   } else {
6978     assert(ideal_opc == Op_SaturatingSubV, "");
6979     // res = src1 - src2
6980     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6981     // Overflow occurs when the inputs have opposite signs and
6982     // the result's sign differs from the first input's sign.
6983     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1;
6984     vpxor(xtmp1, src1, src2, vlen_enc);
6985     vpxor(xtmp2, dst, src1, vlen_enc);
6986     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6987   }
6988 
6989   // Sign-extend to compute overflow detection mask.
6990   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6991 
6992   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6993   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6994   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6995 
6996   // Compose saturating min/max vector using first input polarity mask.
6997   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6998   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6999 
7000   // Blend result with saturating vector using overflow detection mask.
7001   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7002 }
7003 
7004 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7005   switch(elem_bt) {
7006     case T_BYTE:
7007       if (ideal_opc == Op_SaturatingAddV) {
7008         vpaddsb(dst, src1, src2, vlen_enc);
7009       } else {
7010         assert(ideal_opc == Op_SaturatingSubV, "");
7011         vpsubsb(dst, src1, src2, vlen_enc);
7012       }
7013       break;
7014     case T_SHORT:
7015       if (ideal_opc == Op_SaturatingAddV) {
7016         vpaddsw(dst, src1, src2, vlen_enc);
7017       } else {
7018         assert(ideal_opc == Op_SaturatingSubV, "");
7019         vpsubsw(dst, src1, src2, vlen_enc);
7020       }
7021       break;
7022     default:
7023       fatal("Unsupported type %s", type2name(elem_bt));
7024       break;
7025   }
7026 }
7027 
7028 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7029   switch(elem_bt) {
7030     case T_BYTE:
7031       if (ideal_opc == Op_SaturatingAddV) {
7032         vpaddusb(dst, src1, src2, vlen_enc);
7033       } else {
7034         assert(ideal_opc == Op_SaturatingSubV, "");
7035         vpsubusb(dst, src1, src2, vlen_enc);
7036       }
7037       break;
7038     case T_SHORT:
7039       if (ideal_opc == Op_SaturatingAddV) {
7040         vpaddusw(dst, src1, src2, vlen_enc);
7041       } else {
7042         assert(ideal_opc == Op_SaturatingSubV, "");
7043         vpsubusw(dst, src1, src2, vlen_enc);
7044       }
7045       break;
7046     default:
7047       fatal("Unsupported type %s", type2name(elem_bt));
7048       break;
7049   }
7050 }
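
     // The hardware instructions dispatched above clamp per lane; scalar sketch for the byte forms
     // (illustrative only; the short forms clamp to [-32768, 32767] and [0, 65535] analogously):
     //
     //   int8_t  sat_add_i8(int8_t a, int8_t b)   { int s = a + b;                          // vpaddsb
     //                                              return (int8_t)(s > 127 ? 127 : (s < -128 ? -128 : s)); }
     //   uint8_t sat_add_u8(uint8_t a, uint8_t b) { int s = a + b;                          // vpaddusb
     //                                              return (uint8_t)(s > 255 ? 255 : s); }
     //   uint8_t sat_sub_u8(uint8_t a, uint8_t b) { int s = a - b;                          // vpsubusb
     //                                              return (uint8_t)(s < 0 ? 0 : s); }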
7051 
7052 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7053                                                      XMMRegister src2, int vlen_enc) {
7054   switch(elem_bt) {
7055     case T_BYTE:
7056       evpermi2b(dst, src1, src2, vlen_enc);
7057       break;
7058     case T_SHORT:
7059       evpermi2w(dst, src1, src2, vlen_enc);
7060       break;
7061     case T_INT:
7062       evpermi2d(dst, src1, src2, vlen_enc);
7063       break;
7064     case T_LONG:
7065       evpermi2q(dst, src1, src2, vlen_enc);
7066       break;
7067     case T_FLOAT:
7068       evpermi2ps(dst, src1, src2, vlen_enc);
7069       break;
7070     case T_DOUBLE:
7071       evpermi2pd(dst, src1, src2, vlen_enc);
7072       break;
7073     default:
7074       fatal("Unsupported type %s", type2name(elem_bt));
7075       break;
7076   }
7077 }
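
     // Lane-wise semantics of the VPERMI2* forms used above: dst supplies the indices and is
     // overwritten with elements gathered from the concatenation of src1 and src2. Scalar sketch
     // for 32-bit elements (illustrative only; lane_cnt is assumed to be a power of two):
     //
     //   void select_from_two(int32_t* idx_and_dst, const int32_t* src1, const int32_t* src2, int lane_cnt) {
     //     for (int i = 0; i < lane_cnt; i++) {
     //       int sel = idx_and_dst[i] & (2 * lane_cnt - 1);                        // only low index bits are used
     //       idx_and_dst[i] = (sel < lane_cnt) ? src1[sel] : src2[sel - lane_cnt]; // high bit picks the table
     //     }
     //   }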
7078 
7079 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7080   if (is_unsigned) {
7081     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7082   } else {
7083     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7084   }
7085 }
7086 
7087 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7088   if (is_unsigned) {
7089     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7090   } else {
7091     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7092   }
7093 }