1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  53   if (C->clinit_barrier_on_entry()) {
  54     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  55     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  56 
  57     Label L_skip_barrier;
  58     Register klass = rscratch1;
  59 
  60     mov_metadata(klass, C->method()->holder()->constant_encoding());
  61     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  62 
  63     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  64 
  65     bind(L_skip_barrier);
  66   }
  67 
  68   int framesize = C->output()->frame_size_in_bytes();
  69   int bangsize = C->output()->bang_size_in_bytes();
  70   bool fp_mode_24b = false;
  71   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  72 
  73   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  74   // NativeJump::patch_verified_entry will be able to patch out the entry
  75   // code safely. The push to verify stack depth is ok at 5 bytes,
  76   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  77   // stack bang then we must use the 6 byte frame allocation even if
  78   // we have no frame. :-(
  79   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  80 
  81   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  82   // Remove word for return addr
  83   framesize -= wordSize;
  84   stack_bang_size -= wordSize;
  85 
  86   // Calls to C2R adapters often do not accept exceptional returns.
  87  // We require that their callers bang for them.  But be careful, because
  88   // some VM calls (such as call site linkage) can use several kilobytes of
  89   // stack.  But the stack safety zone should account for that.
  90   // See bugs 4446381, 4468289, 4497237.
  91   if (stack_bang_size > 0) {
  92     generate_stack_overflow_check(stack_bang_size);
  93 
  94     // We always push rbp so that on return to the interpreter rbp will be
  95     // restored correctly and we can correct the stack.
  96     push(rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       mov(rbp, rsp);
 100     }
 101     // Remove word for ebp
 102     framesize -= wordSize;
 103 
 104     // Create frame
 105     if (framesize) {
 106       subptr(rsp, framesize);
 107     }
 108   } else {
 109     // Create frame (force generation of a 4 byte immediate value)
 110     subptr_imm32(rsp, framesize);
 111 
 112     // Save RBP register now.
 113     framesize -= wordSize;
 114     movptr(Address(rsp, framesize), rbp);
 115     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 116     if (PreserveFramePointer) {
 117       movptr(rbp, rsp);
 118       if (framesize > 0) {
 119         addptr(rbp, framesize);
 120       }
 121     }
 122   }
 123 
 124   if (C->needs_stack_repair()) {
 125     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 126     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 127     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 128   }
 129 
 130   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 131     framesize -= wordSize;
 132     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 133   }
 134 
 135 #ifndef _LP64
 136   // If method sets FPU control word do it now
 137   if (fp_mode_24b) {
 138     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 139   }
 140   if (UseSSE >= 2 && VerifyFPU) {
 141     verify_FPU(0, "FPU stack must be clean on entry");
 142   }
 143 #endif
 144 
 145 #ifdef ASSERT
 146   if (VerifyStackAtCalls) {
 147     Label L;
 148     push(rax);
 149     mov(rax, rsp);
 150     andptr(rax, StackAlignmentInBytes-1);
 151     cmpptr(rax, StackAlignmentInBytes-wordSize);
 152     pop(rax);
 153     jcc(Assembler::equal, L);
 154     STOP("Stack is not properly aligned!");
 155     bind(L);
 156   }
 157 #endif
 158 }
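// Illustrative summary (not emitted code) of the frame that verified_entry()
// lays out above, assuming the stack-bang path was taken:
//
//    [ return address             ]   <- pushed by the caller
//    [ saved rbp                  ]   <- rbp == caller SP here if PreserveFramePointer
//    [ sp_inc / 0xbadb100d cookie ]   <- optional: stack repair / VerifyStackAtCalls
//    [ spills and outgoing args   ]
//    [ ...                        ]   <- rsp after subptr(rsp, framesize)
//
// In the no-stack-bang path rbp is not pushed but stored at the top of the
// allocated frame instead; the resulting layout is the same.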
 159 
 160 void C2_MacroAssembler::entry_barrier() {
 161   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 162 #ifdef _LP64
 163   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 164     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 165     Label dummy_slow_path;
 166     Label dummy_continuation;
 167     Label* slow_path = &dummy_slow_path;
 168     Label* continuation = &dummy_continuation;
 169     if (!Compile::current()->output()->in_scratch_emit_size()) {
 170       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 171       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 172       Compile::current()->output()->add_stub(stub);
 173       slow_path = &stub->entry();
 174       continuation = &stub->continuation();
 175     }
 176     bs->nmethod_entry_barrier(this, slow_path, continuation);
 177   }
 178 #else
 179   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 180   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 181 #endif
 182 }
 183 
 184 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 185   switch (vlen_in_bytes) {
 186     case  4: // fall-through
 187     case  8: // fall-through
 188     case 16: return Assembler::AVX_128bit;
 189     case 32: return Assembler::AVX_256bit;
 190     case 64: return Assembler::AVX_512bit;
 191 
 192     default: {
 193       ShouldNotReachHere();
 194       return Assembler::AVX_NoVec;
 195     }
 196   }
 197 }
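// Note: 4- and 8-byte vectors are emitted with the 128-bit encoding above,
// since AVX has no narrower vector encoding; the unused upper lanes are
// simply ignored by the operation's logical vector length.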
 198 
 199 // fast_lock and fast_unlock used by C2
 200 
 201 // Because the transitions from emitted code to the runtime
 202 // monitorenter/exit helper stubs are so slow it's critical that
 203 // we inline both the stack-locking fast path and the inflated fast path.
 204 //
 205 // See also: cmpFastLock and cmpFastUnlock.
 206 //
 207 // What follows is a specialized inline transliteration of the code
 208 // in enter() and exit(). If we're concerned about I$ bloat another
 209 // option would be to emit TrySlowEnter and TrySlowExit methods
 210 // at startup-time.  These methods would accept arguments as
 211 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 212 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 213 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 214 // In practice, however, the # of lock sites is bounded and is usually small.
 215 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 216 // if the processor uses simple bimodal branch predictors keyed by EIP,
 217 // since the helper routines would be called from multiple synchronization
 218 // sites.
 219 //
 220 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 221 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 222 // to those specialized methods.  That'd give us a mostly platform-independent
 223 // implementation that the JITs could optimize and inline at their pleasure.
 224 // Done correctly, the only time we'd need to cross to native code would be
 225 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 226 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 227 // (b) provide explicit barriers or fence operations.
 228 //
 229 // TODO:
 230 //
 231 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 232 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 233 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 234 //    the lock operators would typically be faster than reifying Self.
 235 //
 236 // *  Ideally I'd define the primitives as:
 237 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 238 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 239 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 240 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 241 //    Furthermore the register assignments are overconstrained, possibly resulting in
 242 //    sub-optimal code near the synchronization site.
 243 //
 244 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 245 //    Alternately, use a better sp-proximity test.
 246 //
 247 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 248 //    Either one is sufficient to uniquely identify a thread.
 249 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 250 //
 251 // *  Intrinsify notify() and notifyAll() for the common cases where the
 252 //    object is locked by the calling thread but the waitlist is empty.
 253 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 254 //
 255 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 256 //    But beware of excessive branch density on AMD Opterons.
 257 //
 258 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 259 //    or failure of the fast path.  If the fast path fails then we pass
 260 //    control to the slow path, typically in C.  In fast_lock and
 261 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 262 //    will emit a conditional branch immediately after the node.
 263 //    So we have branches to branches and lots of ICC.ZF games.
 264 //    Instead, it might be better to have C2 pass a "FailureLabel"
 265 //    into fast_lock and fast_unlock.  In the case of success, control
 266 //    will drop through the node.  ICC.ZF is undefined at exit.
 267 //    In the case of failure, the node will branch directly to the
 268 //    FailureLabel.
 269 
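// Rough sketch of the LM_LEGACY stack-locking fast path emitted by fast_lock()
// below (illustrative pseudocode only; LM_MONITOR and the inflated case are
// handled separately in the emitted code):
//
//   mark = obj->mark();                                    // [FETCH]
//   if (mark & monitor_value) goto inflated;
//   box->displaced_header = mark | unlocked_value;         // anticipate CAS success
//   if (CAS(&obj->mark, mark | unlocked_value, box)) goto count;   // ZF = 1
//   // CAS failed: recursive stack-lock if the mark is an SP within one page of rsp.
//   tmp = observed_mark - rsp;
//   tmp &= page_mask;                 // approximate; ZF = 1 iff recursive
//   box->displaced_header = tmp;      // 0 marks a recursive stack-lock
//   // fall through to DONE with ZF indicating success/failure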
 270 
 271 // obj: object to lock
 272 // box: on-stack box address (displaced header location) - KILLED
 273 // rax: tmp -- KILLED
 274 // scr: tmp -- KILLED
 275 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 276                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 277                                  Metadata* method_data) {
 278   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 279   // Ensure the register assignments are disjoint
 280   assert(tmpReg == rax, "");
 281   assert(cx1Reg == noreg, "");
 282   assert(cx2Reg == noreg, "");
 283   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 284 
 285   // Possible cases that we'll encounter in fast_lock
 286   // ------------------------------------------------
 287   // * Inflated
 288   //    -- unlocked
 289   //    -- Locked
 290   //       = by self
 291   //       = by other
 292   // * neutral
 293   // * stack-locked
 294   //    -- by self
 295   //       = sp-proximity test hits
 296   //       = sp-proximity test generates false-negative
 297   //    -- by other
 298   //
 299 
 300   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 301 
 302   if (DiagnoseSyncOnValueBasedClasses != 0) {
 303     load_klass(tmpReg, objReg, scrReg);
 304     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 305     jcc(Assembler::notZero, DONE_LABEL);
 306   }
 307 
 308   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 309   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 310   jcc(Assembler::notZero, IsInflated);
 311 
 312   if (LockingMode == LM_MONITOR) {
 313     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 314     testptr(objReg, objReg);
 315   } else {
 316     assert(LockingMode == LM_LEGACY, "must be");
 317     // Attempt stack-locking ...
 318     orptr (tmpReg, markWord::unlocked_value);
 319     if (EnableValhalla) {
 320       // Mask inline_type bit such that we go to the slow path if object is an inline type
 321       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 322     }
 323     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 324     lock();
 325     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 326     jcc(Assembler::equal, COUNT);           // Success
 327 
 328     // Recursive locking.
 329     // The object is stack-locked: markword contains stack pointer to BasicLock.
 330     // Locked by current thread if difference with current SP is less than one page.
 331     subptr(tmpReg, rsp);
 332     // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 333     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 334     movptr(Address(boxReg, 0), tmpReg);
 335   }
 336   jmp(DONE_LABEL);
 337 
 338   bind(IsInflated);
 339   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 340 
 341 #ifndef _LP64
 342   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 343   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 344 #else
 345   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 346   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 347   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 348 
 349   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 350   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 351   movq(scrReg, tmpReg);
 352   xorq(tmpReg, tmpReg);
 353   lock();
 354   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 355 
 356   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 357   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 358 
 359   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 360   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 361   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 362   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 363 #endif // _LP64
 364   bind(DONE_LABEL);
 365 
 366   // ZFlag == 1 count in fast path
 367   // ZFlag == 0 count in slow path
 368   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 369 
 370   bind(COUNT);
 371   if (LockingMode == LM_LEGACY) {
 372 #ifdef _LP64
 373     // Count monitors in fast path
 374     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 375 #endif
 376   }
 377   xorl(tmpReg, tmpReg); // Set ZF == 1
 378 
 379   bind(NO_COUNT);
 380 
 381   // At NO_COUNT the icc ZFlag is set as follows ...
 382   // fast_unlock uses the same protocol.
 383   // ZFlag == 1 -> Success
 384   // ZFlag == 0 -> Failure - force control through the slow path
 385 }
 386 
 387 // obj: object to unlock
 388 // box: box address (displaced header location), killed.  Must be EAX.
 389 // tmp: killed, cannot be obj nor box.
 390 //
 391 // Some commentary on balanced locking:
 392 //
 393 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 394 // Methods that don't have provably balanced locking are forced to run in the
 395 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 396 // The interpreter provides two properties:
 397 // I1:  At return-time the interpreter automatically and quietly unlocks any
 398 //      objects acquired by the current activation (frame).  Recall that the
 399 //      interpreter maintains an on-stack list of locks currently held by
 400 //      a frame.
 401 // I2:  If a method attempts to unlock an object that is not held by the
 402 //      frame, the interpreter throws IMSX.
 403 //
 404 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 405 // B() doesn't have provably balanced locking so it runs in the interpreter.
 406 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 407 // is still locked by A().
 408 //
 409 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 410 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 411 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 412 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 413 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 414 // could reasonably *avoid* checking owner in fast_unlock().
 415 // In the interest of performance we elide the m->Owner==Self check in unlock.
 416 // A perfectly viable alternative is to elide the owner check except when
 417 // Xcheck:jni is enabled.
 418 
 419 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 420   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 421   assert(boxReg == rax, "");
 422   assert_different_registers(objReg, boxReg, tmpReg);
 423 
 424   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 425 
 426   if (LockingMode == LM_LEGACY) {
 427     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 428     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 429   }
 430   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 431   if (LockingMode != LM_MONITOR) {
 432     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 433     jcc(Assembler::zero, Stacked);
 434   }
 435 
 436   // It's inflated.
 437 
 438 #ifndef _LP64
 439   // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 440   orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
 441   jmpb(DONE_LABEL);
 442 #else
 443   // Despite our balanced locking property we still check that m->_owner == Self
 444   // as java routines or native JNI code called by this thread might
 445   // have released the lock.
 446   // Refer to the comments in synchronizer.cpp for how we might encode extra
 447   // state in _succ so we can avoid fetching EntryList|cxq.
 448   //
 449   // If there's no contention try a 1-0 exit.  That is, exit without
 450   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 451   // we detect and recover from the race that the 1-0 exit admits.
 452   //
 453   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 454   // before it STs null into _owner, releasing the lock.  Updates
 455   // to data protected by the critical section must be visible before
 456   // we drop the lock (and thus before any other thread could acquire
 457   // the lock and observe the fields protected by the lock).
 458   // IA32's memory-model is TSO, so STs are ordered with respect to
 459   // each other and there's no need for an explicit barrier (fence).
 460   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 461   Label LSuccess, LNotRecursive;
 462 
 463   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 464   jccb(Assembler::equal, LNotRecursive);
 465 
 466   // Recursive inflated unlock
 467   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 468   jmpb(LSuccess);
 469 
 470   bind(LNotRecursive);
 471 
 472   // Set owner to null.
 473   // Release to satisfy the JMM
 474   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 475   // We need a full fence after clearing owner to avoid stranding.
 476   // StoreLoad achieves this.
 477   membar(StoreLoad);
 478 
 479   // Check if the entry lists are empty (EntryList first - by convention).
 480   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 481   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 482   jccb(Assembler::zero, LSuccess);    // If so we are done.
 483 
 484   // Check if there is a successor.
 485   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 486   jccb(Assembler::notZero, LSuccess); // If so we are done.
 487 
 488   // Save the monitor pointer in the current thread, so we can try to
 489   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 490   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 491   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 492 
 493   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 494   jmpb  (DONE_LABEL);
 495 
 496   bind  (LSuccess);
 497   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 498   jmpb  (DONE_LABEL);
 499 #endif  // _LP64
 500 
 501   if (LockingMode == LM_LEGACY) {
 502     bind  (Stacked);
 503     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 504     lock();
 505     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 506     // Intentional fall-thru into DONE_LABEL
 507   }
 508 
 509   bind(DONE_LABEL);
 510 
 511   // ZFlag == 1 count in fast path
 512   // ZFlag == 0 count in slow path
 513   jccb(Assembler::notZero, NO_COUNT);
 514 
 515   bind(COUNT);
 516 
 517   if (LockingMode == LM_LEGACY) {
 518     // Count monitors in fast path
 519 #ifdef _LP64
 520     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 521 #endif
 522   }
 523 
 524   xorl(tmpReg, tmpReg); // Set ZF == 1
 525 
 526   bind(NO_COUNT);
 527 }
 528 
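// Lightweight locking (LM_LIGHTWEIGHT) overview, summarizing the code below:
// instead of a displaced header in the BasicLock, each thread keeps a small
// per-thread lock-stack of the oops it has fast-locked.  fast_lock_lightweight
// succeeds by either finding obj already on top of the lock-stack (recursive)
// or by CASing the mark's lock bits 0b01 -> 0b00 and pushing obj, provided the
// mark is not a monitor (0b10) and the lock-stack is not full.  For an inflated
// monitor the ObjectMonitor is located either directly from the mark word or,
// with UseObjectMonitorTable, via the per-thread OMCache, and its owner field
// is CASed from null to this thread's monitor owner id.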
 529 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 530                                               Register t, Register thread) {
 531   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 532   assert(rax_reg == rax, "Used for CAS");
 533   assert_different_registers(obj, box, rax_reg, t, thread);
 534 
 535   // Handle inflated monitor.
 536   Label inflated;
 537   // Finish fast lock successfully. ZF value is irrelevant.
 538   Label locked;
 539   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 540   Label slow_path;
 541 
 542   if (UseObjectMonitorTable) {
 543     // Clear cache in case fast locking succeeds.
 544     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 545   }
 546 
 547   if (DiagnoseSyncOnValueBasedClasses != 0) {
 548     load_klass(rax_reg, obj, t);
 549     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 550     jcc(Assembler::notZero, slow_path);
 551   }
 552 
 553   const Register mark = t;
 554 
 555   { // Lightweight Lock
 556 
 557     Label push;
 558 
 559     const Register top = UseObjectMonitorTable ? rax_reg : box;
 560 
 561     // Load the mark.
 562     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 563 
 564     // Prefetch top.
 565     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 566 
 567     // Check for monitor (0b10).
 568     testptr(mark, markWord::monitor_value);
 569     jcc(Assembler::notZero, inflated);
 570 
 571     // Check if lock-stack is full.
 572     cmpl(top, LockStack::end_offset() - 1);
 573     jcc(Assembler::greater, slow_path);
 574 
 575     // Check if recursive.
 576     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 577     jccb(Assembler::equal, push);
 578 
 579     // Try to lock. Transition lock bits 0b01 => 0b00
 580     movptr(rax_reg, mark);
 581     orptr(rax_reg, markWord::unlocked_value);
 582     andptr(mark, ~(int32_t)markWord::unlocked_value);
 583     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 584     jcc(Assembler::notEqual, slow_path);
 585 
 586     if (UseObjectMonitorTable) {
 587       // Need to reload top, clobbered by CAS.
 588       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 589     }
 590     bind(push);
 591     // After successful lock, push object on lock-stack.
 592     movptr(Address(thread, top), obj);
 593     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 594     jmpb(locked);
 595   }
 596 
 597   { // Handle inflated monitor.
 598     bind(inflated);
 599 
 600 #ifndef _LP64
 601     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 602     orl(box, 1);  // set ICC.ZF=0 to indicate failure
 603     jmpb(slow_path);
 604 #else
 605     const Register monitor = t;
 606 
 607     if (!UseObjectMonitorTable) {
 608       assert(mark == monitor, "should be the same here");
 609     } else {
 610       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 611       // Fetch ObjectMonitor* from the cache or take the slow-path.
 612       Label monitor_found;
 613 
 614       // Load cache address
 615       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 616 
 617       const int num_unrolled = 2;
 618       for (int i = 0; i < num_unrolled; i++) {
 619         cmpptr(obj, Address(t));
 620         jccb(Assembler::equal, monitor_found);
 621         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 622       }
 623 
 624       Label loop;
 625 
 626       // Search for obj in cache.
 627       bind(loop);
 628 
 629       // Check for match.
 630       cmpptr(obj, Address(t));
 631       jccb(Assembler::equal, monitor_found);
 632 
 633       // Search until null encountered, guaranteed _null_sentinel at end.
 634       cmpptr(Address(t), 1);
 635       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 636       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 637       jmpb(loop);
 638 
 639       // Cache hit.
 640       bind(monitor_found);
 641       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 642     }
 643     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 644     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 645     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 646 
 647     Label monitor_locked;
 648     // Lock the monitor.
 649 
 650     if (UseObjectMonitorTable) {
 651       // Cache the monitor for unlock before trashing box. On failure to acquire
 652       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 653       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 654     }
 655 
 656     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 657     xorptr(rax_reg, rax_reg);
 658     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 659     lock(); cmpxchgptr(box, owner_address);
 660     jccb(Assembler::equal, monitor_locked);
 661 
 662     // Check if recursive.
 663     cmpptr(box, rax_reg);
 664     jccb(Assembler::notEqual, slow_path);
 665 
 666     // Recursive.
 667     increment(recursions_address);
 668 
 669     bind(monitor_locked);
 670 #endif  // _LP64
 671   }
 672 
 673   bind(locked);
 674   // Set ZF = 1
 675   xorl(rax_reg, rax_reg);
 676 
 677 #ifdef ASSERT
 678   // Check that locked label is reached with ZF set.
 679   Label zf_correct;
 680   Label zf_bad_zero;
 681   jcc(Assembler::zero, zf_correct);
 682   jmp(zf_bad_zero);
 683 #endif
 684 
 685   bind(slow_path);
 686 #ifdef ASSERT
 687   // Check that slow_path label is reached with ZF not set.
 688   jcc(Assembler::notZero, zf_correct);
 689   stop("Fast Lock ZF != 0");
 690   bind(zf_bad_zero);
 691   stop("Fast Lock ZF != 1");
 692   bind(zf_correct);
 693 #endif
 694   // C2 uses the value of ZF to determine the continuation.
 695 }
 696 
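// fast_unlock_lightweight overview, summarizing the code below: if obj is on
// top of the thread's lock-stack it is popped; if obj is still the new top
// the unlock was recursive and we are done, otherwise the mark's lock bits
// are CASed back 0b00 -> 0b01.  For an inflated monitor: a recursive unlock
// just decrements _recursions; otherwise _owner is released, a StoreLoad
// fence is issued, and if waiters may exist and no successor is set the
// monitor is recorded in the thread's unlocked_inflated_monitor field and the
// slow path is taken to avoid stranding.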
 697 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 698   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 699   assert(reg_rax == rax, "Used for CAS");
 700   assert_different_registers(obj, reg_rax, t);
 701 
 702   // Handle inflated monitor.
 703   Label inflated, inflated_check_lock_stack;
 704   // Finish fast unlock successfully.  MUST jump with ZF == 1
 705   Label unlocked, slow_path;
 706 
 707   const Register mark = t;
 708   const Register monitor = t;
 709   const Register top = UseObjectMonitorTable ? t : reg_rax;
 710   const Register box = reg_rax;
 711 
 712   Label dummy;
 713   C2FastUnlockLightweightStub* stub = nullptr;
 714 
 715   if (!Compile::current()->output()->in_scratch_emit_size()) {
 716     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 717     Compile::current()->output()->add_stub(stub);
 718   }
 719 
 720   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 721 
 722   { // Lightweight Unlock
 723 
 724     // Load top.
 725     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 726 
 727     if (!UseObjectMonitorTable) {
 728       // Prefetch mark.
 729       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 730     }
 731 
 732     // Check if obj is top of lock-stack.
 733     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 734     // Top of lock stack was not obj. Must be monitor.
 735     jcc(Assembler::notEqual, inflated_check_lock_stack);
 736 
 737     // Pop lock-stack.
 738     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 739     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 740 
 741     // Check if recursive.
 742     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 743     jcc(Assembler::equal, unlocked);
 744 
 745     // We elide the monitor check, let the CAS fail instead.
 746 
 747     if (UseObjectMonitorTable) {
 748       // Load mark.
 749       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 750     }
 751 
 752     // Try to unlock. Transition lock bits 0b00 => 0b01
 753     movptr(reg_rax, mark);
 754     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 755     orptr(mark, markWord::unlocked_value);
 756     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 757     jcc(Assembler::notEqual, push_and_slow_path);
 758     jmp(unlocked);
 759   }
 760 
 761 
 762   { // Handle inflated monitor.
 763     bind(inflated_check_lock_stack);
 764 #ifdef ASSERT
 765     Label check_done;
 766     subl(top, oopSize);
 767     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 768     jcc(Assembler::below, check_done);
 769     cmpptr(obj, Address(thread, top));
 770     jccb(Assembler::notEqual, inflated_check_lock_stack);
 771     stop("Fast Unlock lock on stack");
 772     bind(check_done);
 773     if (UseObjectMonitorTable) {
 774       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 775     }
 776     testptr(mark, markWord::monitor_value);
 777     jccb(Assembler::notZero, inflated);
 778     stop("Fast Unlock not monitor");
 779 #endif
 780 
 781     bind(inflated);
 782 
 783 #ifndef _LP64
 784     // Just take slow path to avoid dealing with 64 bit atomic instructions here.
 785     orl(t, 1);  // set ICC.ZF=0 to indicate failure
 786     jmpb(slow_path);
 787 #else
 788     if (!UseObjectMonitorTable) {
 789       assert(mark == monitor, "should be the same here");
 790     } else {
 791       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 792       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 793       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 794       cmpptr(monitor, alignof(ObjectMonitor*));
 795       jcc(Assembler::below, slow_path);
 796     }
 797     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 798     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 799     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
 800     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 801     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
 802     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 803 
 804     Label recursive;
 805 
 806     // Check if recursive.
 807     cmpptr(recursions_address, 0);
 808     jccb(Assembler::notZero, recursive);
 809 
 810     // Set owner to null.
 811     // Release to satisfy the JMM
 812     movptr(owner_address, NULL_WORD);
 813     // We need a full fence after clearing owner to avoid stranding.
 814     // StoreLoad achieves this.
 815     membar(StoreLoad);
 816 
 817     // Check if the entry lists are empty (EntryList first - by convention).
 818     movptr(reg_rax, EntryList_address);
 819     orptr(reg_rax, cxq_address);
 820     jccb(Assembler::zero, unlocked);    // If so we are done.
 821 
 822     // Check if there is a successor.
 823     cmpptr(succ_address, NULL_WORD);
 824     jccb(Assembler::notZero, unlocked); // If so we are done.
 825 
 826     // Save the monitor pointer in the current thread, so we can try to
 827     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 828     if (!UseObjectMonitorTable) {
 829       andptr(monitor, ~(int32_t)markWord::monitor_value);
 830     }
 831     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 832 
 833     orl(t, 1); // Fast Unlock ZF = 0
 834     jmpb(slow_path);
 835 
 836     // Recursive unlock.
 837     bind(recursive);
 838     decrement(recursions_address);
 839 #endif  // _LP64
 840   }
 841 
 842   bind(unlocked);
 843   xorl(t, t); // Fast Unlock ZF = 1
 844 
 845 #ifdef ASSERT
 846   // Check that unlocked label is reached with ZF set.
 847   Label zf_correct;
 848   Label zf_bad_zero;
 849   jcc(Assembler::zero, zf_correct);
 850   jmp(zf_bad_zero);
 851 #endif
 852 
 853   bind(slow_path);
 854   if (stub != nullptr) {
 855     bind(stub->slow_path_continuation());
 856   }
 857 #ifdef ASSERT
 858   // Check that stub->continuation() label is reached with ZF not set.
 859   jcc(Assembler::notZero, zf_correct);
 860   stop("Fast Unlock ZF != 0");
 861   bind(zf_bad_zero);
 862   stop("Fast Unlock ZF != 1");
 863   bind(zf_correct);
 864 #endif
 865   // C2 uses the value of ZF to determine the continuation.
 866 }
 867 
 868 //-------------------------------------------------------------------------------------------
 869 // Generic instructions support for use in .ad files C2 code generation
 870 
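// Floating point absolute value and negation are pure sign-bit manipulations:
// the Abs forms AND with a sign-mask constant (conceptually all bits set
// except the sign bit) and the Neg forms XOR with a sign-flip constant (only
// the sign bit set); both constants come from StubRoutines::x86.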
 871 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 872   if (dst != src) {
 873     movdqu(dst, src);
 874   }
 875   if (opcode == Op_AbsVD) {
 876     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 877   } else {
 878     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 879     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 880   }
 881 }
 882 
 883 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 884   if (opcode == Op_AbsVD) {
 885     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 886   } else {
 887     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 888     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 889   }
 890 }
 891 
 892 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 893   if (dst != src) {
 894     movdqu(dst, src);
 895   }
 896   if (opcode == Op_AbsVF) {
 897     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 898   } else {
 899     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 900     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 901   }
 902 }
 903 
 904 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 905   if (opcode == Op_AbsVF) {
 906     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 907   } else {
 908     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 909     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 910   }
 911 }
 912 
 913 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 914   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 915   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 916 
 917   if (opcode == Op_MinV) {
 918     if (elem_bt == T_BYTE) {
 919       pminsb(dst, src);
 920     } else if (elem_bt == T_SHORT) {
 921       pminsw(dst, src);
 922     } else if (elem_bt == T_INT) {
 923       pminsd(dst, src);
 924     } else {
 925       assert(elem_bt == T_LONG, "required");
 926       assert(tmp == xmm0, "required");
 927       assert_different_registers(dst, src, tmp);
 928       movdqu(xmm0, dst);
 929       pcmpgtq(xmm0, src);
 930       blendvpd(dst, src);  // xmm0 as mask
 931     }
 932   } else { // opcode == Op_MaxV
 933     if (elem_bt == T_BYTE) {
 934       pmaxsb(dst, src);
 935     } else if (elem_bt == T_SHORT) {
 936       pmaxsw(dst, src);
 937     } else if (elem_bt == T_INT) {
 938       pmaxsd(dst, src);
 939     } else {
 940       assert(elem_bt == T_LONG, "required");
 941       assert(tmp == xmm0, "required");
 942       assert_different_registers(dst, src, tmp);
 943       movdqu(xmm0, src);
 944       pcmpgtq(xmm0, dst);
 945       blendvpd(dst, src);  // xmm0 as mask
 946     }
 947   }
 948 }
 949 
 950 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 951                                   XMMRegister src1, Address src2, int vlen_enc) {
 952   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 953   if (opcode == Op_UMinV) {
 954     switch(elem_bt) {
 955       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 956       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 957       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 958       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 959       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 960     }
 961   } else {
 962     assert(opcode == Op_UMaxV, "required");
 963     switch(elem_bt) {
 964       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 965       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 966       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 967       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 968       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 969     }
 970   }
 971 }
 972 
 973 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 974   // For optimality, leverage a full vector width of 512 bits
 975   // for operations over smaller vector sizes on AVX512 targets.
 976   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 977     if (opcode == Op_UMaxV) {
 978       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 979     } else {
 980       assert(opcode == Op_UMinV, "required");
 981       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 982     }
 983   } else {
 984     // T1 = -1
 985     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 986     // T1 = -1 << 63
 987     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 988     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 989     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 990     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 991     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 992     // Mask = T2 > T1
 993     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 994     if (opcode == Op_UMaxV) {
 995       // Res = Mask ? Src2 : Src1
 996       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 997     } else {
 998       // Res = Mask ? Src1 : Src2
 999       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
1000     }
1001   }
1002 }
1003 
1004 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
1005                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
1006   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
1007   if (opcode == Op_UMinV) {
1008     switch(elem_bt) {
1009       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
1010       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
1011       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
1012       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
1013       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1014     }
1015   } else {
1016     assert(opcode == Op_UMaxV, "required");
1017     switch(elem_bt) {
1018       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
1019       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
1020       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
1021       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
1022       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
1023     }
1024   }
1025 }
1026 
1027 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1028                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1029                                  int vlen_enc) {
1030   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1031 
1032   if (opcode == Op_MinV) {
1033     if (elem_bt == T_BYTE) {
1034       vpminsb(dst, src1, src2, vlen_enc);
1035     } else if (elem_bt == T_SHORT) {
1036       vpminsw(dst, src1, src2, vlen_enc);
1037     } else if (elem_bt == T_INT) {
1038       vpminsd(dst, src1, src2, vlen_enc);
1039     } else {
1040       assert(elem_bt == T_LONG, "required");
1041       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1042         vpminsq(dst, src1, src2, vlen_enc);
1043       } else {
1044         assert_different_registers(dst, src1, src2);
1045         vpcmpgtq(dst, src1, src2, vlen_enc);
1046         vblendvpd(dst, src1, src2, dst, vlen_enc);
1047       }
1048     }
1049   } else { // opcode == Op_MaxV
1050     if (elem_bt == T_BYTE) {
1051       vpmaxsb(dst, src1, src2, vlen_enc);
1052     } else if (elem_bt == T_SHORT) {
1053       vpmaxsw(dst, src1, src2, vlen_enc);
1054     } else if (elem_bt == T_INT) {
1055       vpmaxsd(dst, src1, src2, vlen_enc);
1056     } else {
1057       assert(elem_bt == T_LONG, "required");
1058       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1059         vpmaxsq(dst, src1, src2, vlen_enc);
1060       } else {
1061         assert_different_registers(dst, src1, src2);
1062         vpcmpgtq(dst, src1, src2, vlen_enc);
1063         vblendvpd(dst, src2, src1, dst, vlen_enc);
1064       }
1065     }
1066   }
1067 }
1068 
1069 // Float/Double min max
1070 
1071 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1072                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1073                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1074                                    int vlen_enc) {
1075   assert(UseAVX > 0, "required");
1076   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1077          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1078   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1079   assert_different_registers(a, tmp, atmp, btmp);
1080   assert_different_registers(b, tmp, atmp, btmp);
1081 
1082   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1083   bool is_double_word = is_double_word_type(elem_bt);
1084 
1085   /* Note on 'non-obvious' assembly sequence:
1086    *
1087    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1088    * and Java on how they handle floats:
1089    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1090    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1091    *
1092    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1093    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1094    *                (only useful when signs differ, noop otherwise)
1095    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1096    *
1097    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1098    *   btmp = (b < +0.0) ? a : b
1099    *   atmp = (b < +0.0) ? b : a
1100    *   Tmp  = Max_Float(atmp , btmp)
1101    *   Res  = (atmp == NaN) ? atmp : Tmp
1102    */
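  /* For reference, the analogous (illustrative) sketch for min[FD], derived
   * from the mask selection below where the mask is 'a':
   *   btmp = (a < +0.0) ? a : b
   *   atmp = (a < +0.0) ? b : a
   *   Tmp  = Min_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */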
1103 
1104   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1105   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1106   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1107   XMMRegister mask;
1108 
1109   if (!is_double_word && is_min) {
1110     mask = a;
1111     vblend = &MacroAssembler::vblendvps;
1112     vmaxmin = &MacroAssembler::vminps;
1113     vcmp = &MacroAssembler::vcmpps;
1114   } else if (!is_double_word && !is_min) {
1115     mask = b;
1116     vblend = &MacroAssembler::vblendvps;
1117     vmaxmin = &MacroAssembler::vmaxps;
1118     vcmp = &MacroAssembler::vcmpps;
1119   } else if (is_double_word && is_min) {
1120     mask = a;
1121     vblend = &MacroAssembler::vblendvpd;
1122     vmaxmin = &MacroAssembler::vminpd;
1123     vcmp = &MacroAssembler::vcmppd;
1124   } else {
1125     assert(is_double_word && !is_min, "sanity");
1126     mask = b;
1127     vblend = &MacroAssembler::vblendvpd;
1128     vmaxmin = &MacroAssembler::vmaxpd;
1129     vcmp = &MacroAssembler::vcmppd;
1130   }
1131 
1132   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1133   XMMRegister maxmin, scratch;
1134   if (dst == btmp) {
1135     maxmin = btmp;
1136     scratch = tmp;
1137   } else {
1138     maxmin = tmp;
1139     scratch = btmp;
1140   }
1141 
1142   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1143   if (precompute_mask && !is_double_word) {
1144     vpsrad(tmp, mask, 32, vlen_enc);
1145     mask = tmp;
1146   } else if (precompute_mask && is_double_word) {
1147     vpxor(tmp, tmp, tmp, vlen_enc);
1148     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1149     mask = tmp;
1150   }
1151 
1152   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1153   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1154   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1155   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1156   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1157 }
1158 
1159 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1160                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1161                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1162                                     int vlen_enc) {
1163   assert(UseAVX > 2, "required");
1164   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1165          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1166   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1167   assert_different_registers(dst, a, atmp, btmp);
1168   assert_different_registers(dst, b, atmp, btmp);
1169 
1170   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1171   bool is_double_word = is_double_word_type(elem_bt);
1172   bool merge = true;
1173 
1174   if (!is_double_word && is_min) {
1175     evpmovd2m(ktmp, a, vlen_enc);
1176     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1177     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1178     vminps(dst, atmp, btmp, vlen_enc);
1179     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1180     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1181   } else if (!is_double_word && !is_min) {
1182     evpmovd2m(ktmp, b, vlen_enc);
1183     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1184     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1185     vmaxps(dst, atmp, btmp, vlen_enc);
1186     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1187     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1188   } else if (is_double_word && is_min) {
1189     evpmovq2m(ktmp, a, vlen_enc);
1190     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1191     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1192     vminpd(dst, atmp, btmp, vlen_enc);
1193     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1194     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1195   } else {
1196     assert(is_double_word && !is_min, "sanity");
1197     evpmovq2m(ktmp, b, vlen_enc);
1198     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1199     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1200     vmaxpd(dst, atmp, btmp, vlen_enc);
1201     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1202     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1203   }
1204 }
1205 
1206 // Float/Double signum
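// Computes signum in place: dst holds both the argument and the result, while
// 'zero' and 'one' are preloaded with 0.0 and 1.0 of the matching type.
// +0.0, -0.0 and NaN are returned unchanged; positive inputs yield 1.0 and
// negative inputs yield -1.0 (1.0 with its sign bit flipped).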
1207 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1208   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1209 
1210   Label DONE_LABEL;
1211 
1212   if (opcode == Op_SignumF) {
1213     assert(UseSSE > 0, "required");
1214     ucomiss(dst, zero);
1215     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1216     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1217     movflt(dst, one);
1218     jcc(Assembler::above, DONE_LABEL);
1219     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1220   } else if (opcode == Op_SignumD) {
1221     assert(UseSSE > 1, "required");
1222     ucomisd(dst, zero);
1223     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1224     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1225     movdbl(dst, one);
1226     jcc(Assembler::above, DONE_LABEL);
1227     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1228   }
1229 
1230   bind(DONE_LABEL);
1231 }
1232 
1233 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1234   if (sign) {
1235     pmovsxbw(dst, src);
1236   } else {
1237     pmovzxbw(dst, src);
1238   }
1239 }
1240 
1241 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1242   if (sign) {
1243     vpmovsxbw(dst, src, vector_len);
1244   } else {
1245     vpmovzxbw(dst, src, vector_len);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1250   if (sign) {
1251     vpmovsxbd(dst, src, vector_len);
1252   } else {
1253     vpmovzxbd(dst, src, vector_len);
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1258   if (sign) {
1259     vpmovsxwd(dst, src, vector_len);
1260   } else {
1261     vpmovzxwd(dst, src, vector_len);
1262   }
1263 }
1264 
1265 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1266                                      int shift, int vector_len) {
1267   if (opcode == Op_RotateLeftV) {
1268     if (etype == T_INT) {
1269       evprold(dst, src, shift, vector_len);
1270     } else {
1271       assert(etype == T_LONG, "expected type T_LONG");
1272       evprolq(dst, src, shift, vector_len);
1273     }
1274   } else {
1275     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1276     if (etype == T_INT) {
1277       evprord(dst, src, shift, vector_len);
1278     } else {
1279       assert(etype == T_LONG, "expected type T_LONG");
1280       evprorq(dst, src, shift, vector_len);
1281     }
1282   }
1283 }
1284 
1285 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1286                                      XMMRegister shift, int vector_len) {
1287   if (opcode == Op_RotateLeftV) {
1288     if (etype == T_INT) {
1289       evprolvd(dst, src, shift, vector_len);
1290     } else {
1291       assert(etype == T_LONG, "expected type T_LONG");
1292       evprolvq(dst, src, shift, vector_len);
1293     }
1294   } else {
1295     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1296     if (etype == T_INT) {
1297       evprorvd(dst, src, shift, vector_len);
1298     } else {
1299       assert(etype == T_LONG, "expected type T_LONG");
1300       evprorvq(dst, src, shift, vector_len);
1301     }
1302   }
1303 }
1304 
1305 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1306   if (opcode == Op_RShiftVI) {
1307     psrad(dst, shift);
1308   } else if (opcode == Op_LShiftVI) {
1309     pslld(dst, shift);
1310   } else {
1311     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1312     psrld(dst, shift);
1313   }
1314 }
1315 
1316 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1317   switch (opcode) {
1318     case Op_RShiftVI:  psrad(dst, shift); break;
1319     case Op_LShiftVI:  pslld(dst, shift); break;
1320     case Op_URShiftVI: psrld(dst, shift); break;
1321 
1322     default: assert(false, "%s", NodeClassNames[opcode]);
1323   }
1324 }
1325 
1326 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1327   if (opcode == Op_RShiftVI) {
1328     vpsrad(dst, nds, shift, vector_len);
1329   } else if (opcode == Op_LShiftVI) {
1330     vpslld(dst, nds, shift, vector_len);
1331   } else {
1332     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1333     vpsrld(dst, nds, shift, vector_len);
1334   }
1335 }
1336 
1337 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1338   switch (opcode) {
1339     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1340     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1341     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1342 
1343     default: assert(false, "%s", NodeClassNames[opcode]);
1344   }
1345 }
1346 
1347 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1348   switch (opcode) {
1349     case Op_RShiftVB:  // fall-through
1350     case Op_RShiftVS:  psraw(dst, shift); break;
1351 
1352     case Op_LShiftVB:  // fall-through
1353     case Op_LShiftVS:  psllw(dst, shift);   break;
1354 
1355     case Op_URShiftVS: // fall-through
1356     case Op_URShiftVB: psrlw(dst, shift);  break;
1357 
1358     default: assert(false, "%s", NodeClassNames[opcode]);
1359   }
1360 }
1361 
1362 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1363   switch (opcode) {
1364     case Op_RShiftVB:  // fall-through
1365     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1366 
1367     case Op_LShiftVB:  // fall-through
1368     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1369 
1370     case Op_URShiftVS: // fall-through
1371     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1372 
1373     default: assert(false, "%s", NodeClassNames[opcode]);
1374   }
1375 }
1376 
1377 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1378   switch (opcode) {
1379     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1380     case Op_LShiftVL:  psllq(dst, shift); break;
1381     case Op_URShiftVL: psrlq(dst, shift); break;
1382 
1383     default: assert(false, "%s", NodeClassNames[opcode]);
1384   }
1385 }
1386 
1387 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1388   if (opcode == Op_RShiftVL) {
1389     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1390   } else if (opcode == Op_LShiftVL) {
1391     psllq(dst, shift);
1392   } else {
1393     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1394     psrlq(dst, shift);
1395   }
1396 }
1397 
1398 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1399   switch (opcode) {
1400     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1401     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1402     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1403 
1404     default: assert(false, "%s", NodeClassNames[opcode]);
1405   }
1406 }
1407 
1408 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1409   if (opcode == Op_RShiftVL) {
1410     evpsraq(dst, nds, shift, vector_len);
1411   } else if (opcode == Op_LShiftVL) {
1412     vpsllq(dst, nds, shift, vector_len);
1413   } else {
1414     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1415     vpsrlq(dst, nds, shift, vector_len);
1416   }
1417 }
1418 
1419 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1420   switch (opcode) {
1421     case Op_RShiftVB:  // fall-through
1422     case Op_RShiftVS:  // fall-through
1423     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1424 
1425     case Op_LShiftVB:  // fall-through
1426     case Op_LShiftVS:  // fall-through
1427     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1428 
1429     case Op_URShiftVB: // fall-through
1430     case Op_URShiftVS: // fall-through
1431     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1432 
1433     default: assert(false, "%s", NodeClassNames[opcode]);
1434   }
1435 }
1436 
1437 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1438   switch (opcode) {
1439     case Op_RShiftVB:  // fall-through
1440     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1441 
1442     case Op_LShiftVB:  // fall-through
1443     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1444 
1445     case Op_URShiftVB: // fall-through
1446     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1447 
1448     default: assert(false, "%s", NodeClassNames[opcode]);
1449   }
1450 }
1451 
1452 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1453   assert(UseAVX >= 2, "required");
1454   switch (opcode) {
1455     case Op_RShiftVL: {
1456       if (UseAVX > 2) {
1457         assert(tmp == xnoreg, "not used");
1458         if (!VM_Version::supports_avx512vl()) {
1459           vlen_enc = Assembler::AVX_512bit;
1460         }
1461         evpsravq(dst, src, shift, vlen_enc);
1462       } else {
1463         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1464         vpsrlvq(dst, src, shift, vlen_enc);
1465         vpsrlvq(tmp, tmp, shift, vlen_enc);
1466         vpxor(dst, dst, tmp, vlen_enc);
1467         vpsubq(dst, dst, tmp, vlen_enc);
1468       }
1469       break;
1470     }
1471     case Op_LShiftVL: {
1472       assert(tmp == xnoreg, "not used");
1473       vpsllvq(dst, src, shift, vlen_enc);
1474       break;
1475     }
1476     case Op_URShiftVL: {
1477       assert(tmp == xnoreg, "not used");
1478       vpsrlvq(dst, src, shift, vlen_enc);
1479       break;
1480     }
1481     default: assert(false, "%s", NodeClassNames[opcode]);
1482   }
1483 }
1484 
1485 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1486 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1487   assert(opcode == Op_LShiftVB ||
1488          opcode == Op_RShiftVB ||
1489          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1490   bool sign = (opcode != Op_URShiftVB);
1491   assert(vector_len == 0, "required");
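  // Widen the byte elements and their shift counts to dwords, do the variable
  // shift at dword width, mask the results back into byte range, and pack the
  // two dword halves down to 8 word lanes.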
1492   vextendbd(sign, dst, src, 1);
1493   vpmovzxbd(vtmp, shift, 1);
1494   varshiftd(opcode, dst, dst, vtmp, 1);
1495   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1496   vextracti128_high(vtmp, dst);
1497   vpackusdw(dst, dst, vtmp, 0);
1498 }
1499 
1500 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1501 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1502   assert(opcode == Op_LShiftVB ||
1503          opcode == Op_RShiftVB ||
1504          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1505   bool sign = (opcode != Op_URShiftVB);
1506   int ext_vector_len = vector_len + 1;
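  // Widen the byte elements and their shift counts to words in a vector of
  // twice the length, do the variable shift at word width, mask back into byte
  // range, and pack the halves down to bytes (fixing the lane order with
  // vpermq for the wider cases).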
1507   vextendbw(sign, dst, src, ext_vector_len);
1508   vpmovzxbw(vtmp, shift, ext_vector_len);
1509   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1510   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1511   if (vector_len == 0) {
1512     vextracti128_high(vtmp, dst);
1513     vpackuswb(dst, dst, vtmp, vector_len);
1514   } else {
1515     vextracti64x4_high(vtmp, dst);
1516     vpackuswb(dst, dst, vtmp, vector_len);
1517     vpermq(dst, dst, 0xD8, vector_len);
1518   }
1519 }
1520 
1521 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1522   switch(typ) {
1523     case T_BYTE:
1524       pinsrb(dst, val, idx);
1525       break;
1526     case T_SHORT:
1527       pinsrw(dst, val, idx);
1528       break;
1529     case T_INT:
1530       pinsrd(dst, val, idx);
1531       break;
1532     case T_LONG:
1533       pinsrq(dst, val, idx);
1534       break;
1535     default:
1536       assert(false,"Should not reach here.");
1537       break;
1538   }
1539 }
1540 
1541 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1542   switch(typ) {
1543     case T_BYTE:
1544       vpinsrb(dst, src, val, idx);
1545       break;
1546     case T_SHORT:
1547       vpinsrw(dst, src, val, idx);
1548       break;
1549     case T_INT:
1550       vpinsrd(dst, src, val, idx);
1551       break;
1552     case T_LONG:
1553       vpinsrq(dst, src, val, idx);
1554       break;
1555     default:
1556       assert(false,"Should not reach here.");
1557       break;
1558   }
1559 }
1560 
1561 #ifdef _LP64
1562 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1563                                                 XMMRegister dst, Register base,
1564                                                 Register idx_base,
1565                                                 Register offset, Register mask,
1566                                                 Register mask_idx, Register rtmp,
1567                                                 int vlen_enc) {
1568   vpxor(dst, dst, dst, vlen_enc);
1569   if (elem_bt == T_SHORT) {
1570     for (int i = 0; i < 4; i++) {
1571       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1572       Label skip_load;
1573       btq(mask, mask_idx);
1574       jccb(Assembler::carryClear, skip_load);
1575       movl(rtmp, Address(idx_base, i * 4));
1576       if (offset != noreg) {
1577         addl(rtmp, offset);
1578       }
1579       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1580       bind(skip_load);
1581       incq(mask_idx);
1582     }
1583   } else {
1584     assert(elem_bt == T_BYTE, "");
1585     for (int i = 0; i < 8; i++) {
1586       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1587       Label skip_load;
1588       btq(mask, mask_idx);
1589       jccb(Assembler::carryClear, skip_load);
1590       movl(rtmp, Address(idx_base, i * 4));
1591       if (offset != noreg) {
1592         addl(rtmp, offset);
1593       }
1594       pinsrb(dst, Address(base, rtmp), i);
1595       bind(skip_load);
1596       incq(mask_idx);
1597     }
1598   }
1599 }
1600 #endif // _LP64
1601 
1602 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1603                                          Register base, Register idx_base,
1604                                          Register offset, Register rtmp,
1605                                          int vlen_enc) {
1606   vpxor(dst, dst, dst, vlen_enc);
1607   if (elem_bt == T_SHORT) {
1608     for (int i = 0; i < 4; i++) {
1609       // dst[i] = src[offset + idx_base[i]]
1610       movl(rtmp, Address(idx_base, i * 4));
1611       if (offset != noreg) {
1612         addl(rtmp, offset);
1613       }
1614       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1615     }
1616   } else {
1617     assert(elem_bt == T_BYTE, "");
1618     for (int i = 0; i < 8; i++) {
1619       // dst[i] = src[offset + idx_base[i]]
1620       movl(rtmp, Address(idx_base, i * 4));
1621       if (offset != noreg) {
1622         addl(rtmp, offset);
1623       }
1624       pinsrb(dst, Address(base, rtmp), i);
1625     }
1626   }
1627 }
1628 
1629 /*
1630  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1631  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1632  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1633  * permutation that places the slice into the appropriate vector lane
1634  * locations in the destination vector. The following pseudo code describes
1635  * the algorithm in detail:
1636  *
1637  * DST_VEC = ZERO_VEC
1638  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1639  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1640  * FOREACH_ITER:
1641  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1642  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1643  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1644  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1645  *
1646  * With each iteration, the doubleword permute indices (0, 1) corresponding
1647  * to the gathered quadword are right shifted by two lane positions.
1648  *
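 * For example (illustrative), with T_BYTE elements and a 32-byte vector:
 * PERM_INDEX starts as {0, 1, 2, ..., 7}, so the first gathered quadword is
 * left in doubleword lanes 0-1. After subtracting TWO_VEC the indices become
 * {-2, -1, 0, 1, ...}; vpermd reads them modulo the lane count, so the next
 * quadword lands in lanes 2-3, then 4-5, then 6-7, and the OR accumulates
 * all four slices into DST_VEC.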
1649  */
1650 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1651                                         Register base, Register idx_base,
1652                                         Register offset, Register mask,
1653                                         XMMRegister xtmp1, XMMRegister xtmp2,
1654                                         XMMRegister temp_dst, Register rtmp,
1655                                         Register mask_idx, Register length,
1656                                         int vector_len, int vlen_enc) {
1657   Label GATHER8_LOOP;
1658   assert(is_subword_type(elem_ty), "");
1659   movl(length, vector_len);
1660   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1661   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1662   vallones(xtmp2, vlen_enc);
1663   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1664   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1665   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1666 
1667   bind(GATHER8_LOOP);
1668     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1669     if (mask == noreg) {
1670       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1671     } else {
1672       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1673     }
1674     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1675     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1676     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1677     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1678     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1679     vpor(dst, dst, temp_dst, vlen_enc);
1680     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1681     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1682     jcc(Assembler::notEqual, GATHER8_LOOP);
1683 }
1684 
1685 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1686   switch(typ) {
1687     case T_INT:
1688       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1689       break;
1690     case T_FLOAT:
1691       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1692       break;
1693     case T_LONG:
1694       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1695       break;
1696     case T_DOUBLE:
1697       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1698       break;
1699     default:
1700       assert(false,"Should not reach here.");
1701       break;
1702   }
1703 }
1704 
1705 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1706   switch(typ) {
1707     case T_INT:
1708       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1709       break;
1710     case T_FLOAT:
1711       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1712       break;
1713     case T_LONG:
1714       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1715       break;
1716     case T_DOUBLE:
1717       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1718       break;
1719     default:
1720       assert(false,"Should not reach here.");
1721       break;
1722   }
1723 }
1724 
1725 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1726   switch(typ) {
1727     case T_INT:
1728       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1729       break;
1730     case T_FLOAT:
1731       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1732       break;
1733     case T_LONG:
1734       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1735       break;
1736     case T_DOUBLE:
1737       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1738       break;
1739     default:
1740       assert(false,"Should not reach here.");
1741       break;
1742   }
1743 }
1744 
1745 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
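  // A boolean vector element is 0 or 1; computing 0 - x per byte turns it into
  // 0x00 or 0xFF, and the sign-extending moves below widen that mask to the
  // requested element size.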
1746   if (vlen_in_bytes <= 16) {
1747     pxor (dst, dst);
1748     psubb(dst, src);
1749     switch (elem_bt) {
1750       case T_BYTE:   /* nothing to do */ break;
1751       case T_SHORT:  pmovsxbw(dst, dst); break;
1752       case T_INT:    pmovsxbd(dst, dst); break;
1753       case T_FLOAT:  pmovsxbd(dst, dst); break;
1754       case T_LONG:   pmovsxbq(dst, dst); break;
1755       case T_DOUBLE: pmovsxbq(dst, dst); break;
1756 
1757       default: assert(false, "%s", type2name(elem_bt));
1758     }
1759   } else {
1760     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1761     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1762 
1763     vpxor (dst, dst, dst, vlen_enc);
1764     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1765 
1766     switch (elem_bt) {
1767       case T_BYTE:   /* nothing to do */            break;
1768       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1769       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1770       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1771       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1772       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1773 
1774       default: assert(false, "%s", type2name(elem_bt));
1775     }
1776   }
1777 }
1778 
1779 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1780   if (novlbwdq) {
1781     vpmovsxbd(xtmp, src, vlen_enc);
1782     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1783             Assembler::eq, true, vlen_enc, noreg);
1784   } else {
1785     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1786     vpsubb(xtmp, xtmp, src, vlen_enc);
1787     evpmovb2m(dst, xtmp, vlen_enc);
1788   }
1789 }
1790 
1791 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1792   if (is_integral_type(bt)) {
1793     switch (vlen_in_bytes) {
1794       case 4:  movdl(dst, src);   break;
1795       case 8:  movq(dst, src);    break;
1796       case 16: movdqu(dst, src);  break;
1797       case 32: vmovdqu(dst, src); break;
1798       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1799       default: ShouldNotReachHere();
1800     }
1801   } else {
1802     switch (vlen_in_bytes) {
1803       case 4:  movflt(dst, src); break;
1804       case 8:  movdbl(dst, src); break;
1805       case 16: movups(dst, src); break;
1806       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1807       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1808       default: ShouldNotReachHere();
1809     }
1810   }
1811 }
1812 
1813 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1814   assert(rscratch != noreg || always_reachable(src), "missing");
1815 
1816   if (reachable(src)) {
1817     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1818   } else {
1819     lea(rscratch, src);
1820     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1821   }
1822 }
1823 
1824 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1825   int vlen_enc = vector_length_encoding(vlen);
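  // Pick the broadcast form the CPU supports: AVX2 integer broadcasts where
  // available, AVX vbroadcastsd/vbroadcastss/vmovddup otherwise, movddup on
  // SSE3-only hardware, and a plain vector load as the last resort.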
1826   if (VM_Version::supports_avx()) {
1827     if (bt == T_LONG) {
1828       if (VM_Version::supports_avx2()) {
1829         vpbroadcastq(dst, src, vlen_enc);
1830       } else {
1831         vmovddup(dst, src, vlen_enc);
1832       }
1833     } else if (bt == T_DOUBLE) {
1834       if (vlen_enc != Assembler::AVX_128bit) {
1835         vbroadcastsd(dst, src, vlen_enc, noreg);
1836       } else {
1837         vmovddup(dst, src, vlen_enc);
1838       }
1839     } else {
1840       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1841         vpbroadcastd(dst, src, vlen_enc);
1842       } else {
1843         vbroadcastss(dst, src, vlen_enc);
1844       }
1845     }
1846   } else if (VM_Version::supports_sse3()) {
1847     movddup(dst, src);
1848   } else {
1849     load_vector(bt, dst, src, vlen);
1850   }
1851 }
1852 
1853 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1854   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1855   int offset = exact_log2(type2aelembytes(bt)) << 6;
1856   if (is_floating_point_type(bt)) {
1857     offset += 128;
1858   }
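  // For example: T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
  // T_FLOAT -> 128 + 128 = 256, T_DOUBLE -> 192 + 128 = 320.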
1859   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1860   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1861 }
1862 
1863 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1864 
1865 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1866   int vector_len = Assembler::AVX_128bit;
1867 
1868   switch (opcode) {
1869     case Op_AndReductionV:  pand(dst, src); break;
1870     case Op_OrReductionV:   por (dst, src); break;
1871     case Op_XorReductionV:  pxor(dst, src); break;
1872     case Op_MinReductionV:
1873       switch (typ) {
1874         case T_BYTE:        pminsb(dst, src); break;
1875         case T_SHORT:       pminsw(dst, src); break;
1876         case T_INT:         pminsd(dst, src); break;
1877         case T_LONG:        assert(UseAVX > 2, "required");
1878                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1879         default:            assert(false, "wrong type");
1880       }
1881       break;
1882     case Op_MaxReductionV:
1883       switch (typ) {
1884         case T_BYTE:        pmaxsb(dst, src); break;
1885         case T_SHORT:       pmaxsw(dst, src); break;
1886         case T_INT:         pmaxsd(dst, src); break;
1887         case T_LONG:        assert(UseAVX > 2, "required");
1888                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1889         default:            assert(false, "wrong type");
1890       }
1891       break;
1892     case Op_AddReductionVF: addss(dst, src); break;
1893     case Op_AddReductionVD: addsd(dst, src); break;
1894     case Op_AddReductionVI:
1895       switch (typ) {
1896         case T_BYTE:        paddb(dst, src); break;
1897         case T_SHORT:       paddw(dst, src); break;
1898         case T_INT:         paddd(dst, src); break;
1899         default:            assert(false, "wrong type");
1900       }
1901       break;
1902     case Op_AddReductionVL: paddq(dst, src); break;
1903     case Op_MulReductionVF: mulss(dst, src); break;
1904     case Op_MulReductionVD: mulsd(dst, src); break;
1905     case Op_MulReductionVI:
1906       switch (typ) {
1907         case T_SHORT:       pmullw(dst, src); break;
1908         case T_INT:         pmulld(dst, src); break;
1909         default:            assert(false, "wrong type");
1910       }
1911       break;
1912     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1913                             evpmullq(dst, dst, src, vector_len); break;
1914     default:                assert(false, "wrong opcode");
1915   }
1916 }
1917 
1918 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1919   switch (opcode) {
1920     case Op_AddReductionVF: addps(dst, src); break;
1921     case Op_AddReductionVD: addpd(dst, src); break;
1922     case Op_MulReductionVF: mulps(dst, src); break;
1923     case Op_MulReductionVD: mulpd(dst, src); break;
1924     default:                assert(false, "%s", NodeClassNames[opcode]);
1925   }
1926 }
1927 
1928 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1929   int vector_len = Assembler::AVX_256bit;
1930 
1931   switch (opcode) {
1932     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1933     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1934     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1935     case Op_MinReductionV:
1936       switch (typ) {
1937         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1938         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1939         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1940         case T_LONG:        assert(UseAVX > 2, "required");
1941                             vpminsq(dst, src1, src2, vector_len); break;
1942         default:            assert(false, "wrong type");
1943       }
1944       break;
1945     case Op_MaxReductionV:
1946       switch (typ) {
1947         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1948         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1949         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1950         case T_LONG:        assert(UseAVX > 2, "required");
1951                             vpmaxsq(dst, src1, src2, vector_len); break;
1952         default:            assert(false, "wrong type");
1953       }
1954       break;
1955     case Op_AddReductionVI:
1956       switch (typ) {
1957         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1958         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1959         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1960         default:            assert(false, "wrong type");
1961       }
1962       break;
1963     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1964     case Op_MulReductionVI:
1965       switch (typ) {
1966         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1967         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1968         default:            assert(false, "wrong type");
1969       }
1970       break;
1971     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1972     default:                assert(false, "wrong opcode");
1973   }
1974 }
1975 
1976 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1977   int vector_len = Assembler::AVX_256bit;
1978 
1979   switch (opcode) {
1980     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1981     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1982     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1983     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1984     default:                assert(false, "%s", NodeClassNames[opcode]);
1985   }
1986 }
1987 
1988 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1989                                   XMMRegister dst, XMMRegister src,
1990                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1991   switch (opcode) {
1992     case Op_AddReductionVF:
1993     case Op_MulReductionVF:
1994       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1995       break;
1996 
1997     case Op_AddReductionVD:
1998     case Op_MulReductionVD:
1999       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2000       break;
2001 
2002     default: assert(false, "wrong opcode");
2003   }
2004 }
2005 
2006 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
2007                                             XMMRegister dst, XMMRegister src,
2008                                             XMMRegister vtmp1, XMMRegister vtmp2) {
2009   switch (opcode) {
2010     case Op_AddReductionVF:
2011     case Op_MulReductionVF:
2012       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2013       break;
2014 
2015     case Op_AddReductionVD:
2016     case Op_MulReductionVD:
2017       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2018       break;
2019 
2020     default: assert(false, "%s", NodeClassNames[opcode]);
2021   }
2022 }
2023 
2024 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2025                              Register dst, Register src1, XMMRegister src2,
2026                              XMMRegister vtmp1, XMMRegister vtmp2) {
2027   switch (vlen) {
2028     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2029     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2030     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2031     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2032 
2033     default: assert(false, "wrong vector length");
2034   }
2035 }
2036 
2037 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2038                              Register dst, Register src1, XMMRegister src2,
2039                              XMMRegister vtmp1, XMMRegister vtmp2) {
2040   switch (vlen) {
2041     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2042     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2043     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2044     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2045 
2046     default: assert(false, "wrong vector length");
2047   }
2048 }
2049 
2050 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2051                              Register dst, Register src1, XMMRegister src2,
2052                              XMMRegister vtmp1, XMMRegister vtmp2) {
2053   switch (vlen) {
2054     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2055     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2056     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2057     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2058 
2059     default: assert(false, "wrong vector length");
2060   }
2061 }
2062 
2063 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2064                              Register dst, Register src1, XMMRegister src2,
2065                              XMMRegister vtmp1, XMMRegister vtmp2) {
2066   switch (vlen) {
2067     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2068     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2069     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2070     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2071 
2072     default: assert(false, "wrong vector length");
2073   }
2074 }
2075 
2076 #ifdef _LP64
2077 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2078                              Register dst, Register src1, XMMRegister src2,
2079                              XMMRegister vtmp1, XMMRegister vtmp2) {
2080   switch (vlen) {
2081     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2082     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2083     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2084 
2085     default: assert(false, "wrong vector length");
2086   }
2087 }
2088 #endif // _LP64
2089 
2090 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   switch (vlen) {
2092     case 2:
2093       assert(vtmp2 == xnoreg, "");
2094       reduce2F(opcode, dst, src, vtmp1);
2095       break;
2096     case 4:
2097       assert(vtmp2 == xnoreg, "");
2098       reduce4F(opcode, dst, src, vtmp1);
2099       break;
2100     case 8:
2101       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2102       break;
2103     case 16:
2104       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2105       break;
2106     default: assert(false, "wrong vector length");
2107   }
2108 }
2109 
2110 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   switch (vlen) {
2112     case 2:
2113       assert(vtmp2 == xnoreg, "");
2114       reduce2D(opcode, dst, src, vtmp1);
2115       break;
2116     case 4:
2117       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2118       break;
2119     case 8:
2120       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2121       break;
2122     default: assert(false, "wrong vector length");
2123   }
2124 }
2125 
2126 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2127   switch (vlen) {
2128     case 2:
2129       assert(vtmp1 == xnoreg, "");
2130       assert(vtmp2 == xnoreg, "");
2131       unorderedReduce2F(opcode, dst, src);
2132       break;
2133     case 4:
2134       assert(vtmp2 == xnoreg, "");
2135       unorderedReduce4F(opcode, dst, src, vtmp1);
2136       break;
2137     case 8:
2138       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2139       break;
2140     case 16:
2141       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2142       break;
2143     default: assert(false, "wrong vector length");
2144   }
2145 }
2146 
2147 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2148   switch (vlen) {
2149     case 2:
2150       assert(vtmp1 == xnoreg, "");
2151       assert(vtmp2 == xnoreg, "");
2152       unorderedReduce2D(opcode, dst, src);
2153       break;
2154     case 4:
2155       assert(vtmp2 == xnoreg, "");
2156       unorderedReduce4D(opcode, dst, src, vtmp1);
2157       break;
2158     case 8:
2159       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2160       break;
2161     default: assert(false, "wrong vector length");
2162   }
2163 }
2164 
2165 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   if (opcode == Op_AddReductionVI) {
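    // phaddd adds horizontally adjacent dwords, so a single pass folds the two
    // int lanes into lane 0.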
2167     if (vtmp1 != src2) {
2168       movdqu(vtmp1, src2);
2169     }
2170     phaddd(vtmp1, vtmp1);
2171   } else {
2172     pshufd(vtmp1, src2, 0x1);
2173     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2174   }
2175   movdl(vtmp2, src1);
2176   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2177   movdl(dst, vtmp1);
2178 }
2179 
2180 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2181   if (opcode == Op_AddReductionVI) {
2182     if (vtmp1 != src2) {
2183       movdqu(vtmp1, src2);
2184     }
2185     phaddd(vtmp1, src2);
2186     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2187   } else {
2188     pshufd(vtmp2, src2, 0xE);
2189     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2190     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2191   }
2192 }
2193 
2194 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2195   if (opcode == Op_AddReductionVI) {
2196     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2197     vextracti128_high(vtmp2, vtmp1);
2198     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2199     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2200   } else {
2201     vextracti128_high(vtmp1, src2);
2202     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2203     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2204   }
2205 }
2206 
2207 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2208   vextracti64x4_high(vtmp2, src2);
2209   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2210   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2211 }
2212 
2213 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214   pshufd(vtmp2, src2, 0x1);
2215   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2216   movdqu(vtmp1, vtmp2);
2217   psrldq(vtmp1, 2);
2218   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2219   movdqu(vtmp2, vtmp1);
2220   psrldq(vtmp2, 1);
2221   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2222   movdl(vtmp2, src1);
2223   pmovsxbd(vtmp1, vtmp1);
2224   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2225   pextrb(dst, vtmp1, 0x0);
2226   movsbl(dst, dst);
2227 }
2228 
2229 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2230   pshufd(vtmp1, src2, 0xE);
2231   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2232   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2233 }
2234 
2235 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2236   vextracti128_high(vtmp2, src2);
2237   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2238   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2239 }
2240 
2241 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2242   vextracti64x4_high(vtmp1, src2);
2243   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2244   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2245 }
2246 
2247 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2248   pmovsxbw(vtmp2, src2);
2249   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2250 }
2251 
2252 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2253   if (UseAVX > 1) {
2254     int vector_len = Assembler::AVX_256bit;
2255     vpmovsxbw(vtmp1, src2, vector_len);
2256     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2257   } else {
2258     pmovsxbw(vtmp2, src2);
2259     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2260     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes down before widening
2261     pmovsxbw(vtmp2, vtmp2);
2262     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2263   }
2264 }
2265 
2266 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2267   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2268     int vector_len = Assembler::AVX_512bit;
2269     vpmovsxbw(vtmp1, src2, vector_len);
2270     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2271   } else {
2272     assert(UseAVX >= 2, "required");
2273     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2274     vextracti128_high(vtmp2, src2);
2275     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2276   }
2277 }
2278 
2279 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2280   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2281   vextracti64x4_high(vtmp2, src2);
2282   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2283 }
2284 
2285 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2286   if (opcode == Op_AddReductionVI) {
2287     if (vtmp1 != src2) {
2288       movdqu(vtmp1, src2);
2289     }
2290     phaddw(vtmp1, vtmp1);
2291     phaddw(vtmp1, vtmp1);
2292   } else {
2293     pshufd(vtmp2, src2, 0x1);
2294     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2295     movdqu(vtmp1, vtmp2);
2296     psrldq(vtmp1, 2);
2297     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2298   }
2299   movdl(vtmp2, src1);
2300   pmovsxwd(vtmp1, vtmp1);
2301   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2302   pextrw(dst, vtmp1, 0x0);
2303   movswl(dst, dst);
2304 }
2305 
2306 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2307   if (opcode == Op_AddReductionVI) {
2308     if (vtmp1 != src2) {
2309       movdqu(vtmp1, src2);
2310     }
2311     phaddw(vtmp1, src2);
2312   } else {
2313     pshufd(vtmp1, src2, 0xE);
2314     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2315   }
2316   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2317 }
2318 
2319 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   if (opcode == Op_AddReductionVI) {
2321     int vector_len = Assembler::AVX_256bit;
2322     vphaddw(vtmp2, src2, src2, vector_len);
2323     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2324   } else {
2325     vextracti128_high(vtmp2, src2);
2326     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2327   }
2328   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2329 }
2330 
2331 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2332   int vector_len = Assembler::AVX_256bit;
2333   vextracti64x4_high(vtmp1, src2);
2334   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2335   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2336 }
2337 
2338 #ifdef _LP64
2339 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2340   pshufd(vtmp2, src2, 0xE);
2341   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2342   movdq(vtmp1, src1);
2343   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2344   movdq(dst, vtmp1);
2345 }
2346 
2347 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2348   vextracti128_high(vtmp1, src2);
2349   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2350   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2351 }
2352 
2353 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2354   vextracti64x4_high(vtmp2, src2);
2355   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2356   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2357 }
2358 
2359 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
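  // temp = -1 has all 64 bits set; bzhi clears every bit at position >= len,
  // leaving the low 'len' bits to form the k-register mask.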
2360   mov64(temp, -1L);
2361   bzhiq(temp, temp, len);
2362   kmovql(dst, temp);
2363 }
2364 #endif // _LP64
2365 
2366 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2367   reduce_operation_128(T_FLOAT, opcode, dst, src);
2368   pshufd(vtmp, src, 0x1);
2369   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2370 }
2371 
2372 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2373   reduce2F(opcode, dst, src, vtmp);
2374   pshufd(vtmp, src, 0x2);
2375   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2376   pshufd(vtmp, src, 0x3);
2377   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2378 }
2379 
2380 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2381   reduce4F(opcode, dst, src, vtmp2);
2382   vextractf128_high(vtmp2, src);
2383   reduce4F(opcode, dst, vtmp2, vtmp1);
2384 }
2385 
2386 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2387   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2388   vextracti64x4_high(vtmp1, src);
2389   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2390 }
2391 
2392 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2393   pshufd(dst, src, 0x1);
2394   reduce_operation_128(T_FLOAT, opcode, dst, src);
2395 }
2396 
2397 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2398   pshufd(vtmp, src, 0xE);
2399   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2400   unorderedReduce2F(opcode, dst, vtmp);
2401 }
2402 
2403 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2404   vextractf128_high(vtmp1, src);
2405   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2406   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2407 }
2408 
2409 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2410   vextractf64x4_high(vtmp2, src);
2411   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2412   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2413 }
2414 
2415 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2416   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2417   pshufd(vtmp, src, 0xE);
2418   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2419 }
2420 
2421 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2422   reduce2D(opcode, dst, src, vtmp2);
2423   vextractf128_high(vtmp2, src);
2424   reduce2D(opcode, dst, vtmp2, vtmp1);
2425 }
2426 
2427 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2428   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2429   vextracti64x4_high(vtmp1, src);
2430   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2431 }
2432 
2433 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2434   pshufd(dst, src, 0xE);
2435   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2436 }
2437 
2438 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2439   vextractf128_high(vtmp, src);
2440   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2441   unorderedReduce2D(opcode, dst, vtmp);
2442 }
2443 
2444 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2445   vextractf64x4_high(vtmp2, src);
2446   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2447   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2448 }
2449 
2450 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2451   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2452 }
2453 
2454 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2455   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2456 }
2457 
2458 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2459   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2460 }
2461 
2462 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2463                                  int vec_enc) {
2464   switch(elem_bt) {
2465     case T_INT:
2466     case T_FLOAT:
2467       vmaskmovps(dst, src, mask, vec_enc);
2468       break;
2469     case T_LONG:
2470     case T_DOUBLE:
2471       vmaskmovpd(dst, src, mask, vec_enc);
2472       break;
2473     default:
2474       fatal("Unsupported type %s", type2name(elem_bt));
2475       break;
2476   }
2477 }
2478 
2479 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2480                                  int vec_enc) {
2481   switch(elem_bt) {
2482     case T_INT:
2483     case T_FLOAT:
2484       vmaskmovps(dst, src, mask, vec_enc);
2485       break;
2486     case T_LONG:
2487     case T_DOUBLE:
2488       vmaskmovpd(dst, src, mask, vec_enc);
2489       break;
2490     default:
2491       fatal("Unsupported type %s", type2name(elem_bt));
2492       break;
2493   }
2494 }
2495 
2496 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2497                                           XMMRegister dst, XMMRegister src,
2498                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2499                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2500   const int permconst[] = {1, 14};
2501   XMMRegister wsrc = src;
2502   XMMRegister wdst = xmm_0;
2503   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2504 
2505   int vlen_enc = Assembler::AVX_128bit;
2506   if (vlen == 16) {
2507     vlen_enc = Assembler::AVX_256bit;
2508   }
2509 
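  // Fold the vector in half on each iteration: extract the upper 256/128 bits
  // when present, then finish within a 128-bit lane via vpermilps (imm 14 moves
  // the upper two floats down, imm 1 brings element 1 down), applying the
  // min/max against the working source each time.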
2510   for (int i = log2(vlen) - 1; i >=0; i--) {
2511     if (i == 0 && !is_dst_valid) {
2512       wdst = dst;
2513     }
2514     if (i == 3) {
2515       vextracti64x4_high(wtmp, wsrc);
2516     } else if (i == 2) {
2517       vextracti128_high(wtmp, wsrc);
2518     } else { // i = [0,1]
2519       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2520     }
2521     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2522     wsrc = wdst;
2523     vlen_enc = Assembler::AVX_128bit;
2524   }
2525   if (is_dst_valid) {
2526     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2527   }
2528 }
2529 
2530 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2531                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2532                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2533   XMMRegister wsrc = src;
2534   XMMRegister wdst = xmm_0;
2535   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2536   int vlen_enc = Assembler::AVX_128bit;
2537   if (vlen == 8) {
2538     vlen_enc = Assembler::AVX_256bit;
2539   }
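  // Same folding strategy as the float variant: extract the upper half, then
  // finish within a 128-bit lane by swapping the two doubles (vpermilpd imm 1).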
2540   for (int i = log2(vlen) - 1; i >=0; i--) {
2541     if (i == 0 && !is_dst_valid) {
2542       wdst = dst;
2543     }
2544     if (i == 1) {
2545       vextracti128_high(wtmp, wsrc);
2546     } else if (i == 2) {
2547       vextracti64x4_high(wtmp, wsrc);
2548     } else {
2549       assert(i == 0, "%d", i);
2550       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2551     }
2552     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2553     wsrc = wdst;
2554     vlen_enc = Assembler::AVX_128bit;
2555   }
2556   if (is_dst_valid) {
2557     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2558   }
2559 }
2560 
2561 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2562   switch (bt) {
2563     case T_BYTE:  pextrb(dst, src, idx); break;
2564     case T_SHORT: pextrw(dst, src, idx); break;
2565     case T_INT:   pextrd(dst, src, idx); break;
2566     case T_LONG:  pextrq(dst, src, idx); break;
2567 
2568     default:
2569       assert(false,"Should not reach here.");
2570       break;
2571   }
2572 }
2573 
2574 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2575   int esize =  type2aelembytes(typ);
2576   int elem_per_lane = 16/esize;
2577   int lane = elemindex / elem_per_lane;
2578   int eindex = elemindex % elem_per_lane;
2579 
2580   if (lane >= 2) {
2581     assert(UseAVX > 2, "required");
2582     vextractf32x4(dst, src, lane & 3);
2583     return dst;
2584   } else if (lane > 0) {
2585     assert(UseAVX > 0, "required");
2586     vextractf128(dst, src, lane);
2587     return dst;
2588   } else {
2589     return src;
2590   }
2591 }
2592 
2593 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2594   if (typ == T_BYTE) {
2595     movsbl(dst, dst);
2596   } else if (typ == T_SHORT) {
2597     movswl(dst, dst);
2598   }
2599 }
2600 
2601 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2602   int esize =  type2aelembytes(typ);
2603   int elem_per_lane = 16/esize;
2604   int eindex = elemindex % elem_per_lane;
2605   assert(is_integral_type(typ),"required");
2606 
2607   if (eindex == 0) {
2608     if (typ == T_LONG) {
2609       movq(dst, src);
2610     } else {
2611       movdl(dst, src);
2612       movsxl(typ, dst);
2613     }
2614   } else {
2615     extract(typ, dst, src, eindex);
2616     movsxl(typ, dst);
2617   }
2618 }
2619 
2620 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2621   int esize =  type2aelembytes(typ);
2622   int elem_per_lane = 16/esize;
2623   int eindex = elemindex % elem_per_lane;
2624   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2625 
2626   if (eindex == 0) {
2627     movq(dst, src);
2628   } else {
2629     if (typ == T_FLOAT) {
2630       if (UseAVX == 0) {
2631         movdqu(dst, src);
2632         shufps(dst, dst, eindex);
2633       } else {
2634         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2635       }
2636     } else {
2637       if (UseAVX == 0) {
2638         movdqu(dst, src);
2639         psrldq(dst, eindex*esize);
2640       } else {
2641         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2642       }
2643       movq(dst, dst);
2644     }
2645   }
2646   // Zero upper bits
2647   if (typ == T_FLOAT) {
2648     if (UseAVX == 0) {
2649       assert(vtmp != xnoreg, "required.");
2650       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2651       pand(dst, vtmp);
2652     } else {
2653       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2654     }
2655   }
2656 }
2657 
2658 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2659   switch(typ) {
2660     case T_BYTE:
2661     case T_BOOLEAN:
2662       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2663       break;
2664     case T_SHORT:
2665     case T_CHAR:
2666       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2667       break;
2668     case T_INT:
2669     case T_FLOAT:
2670       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2671       break;
2672     case T_LONG:
2673     case T_DOUBLE:
2674       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2675       break;
2676     default:
2677       assert(false,"Should not reach here.");
2678       break;
2679   }
2680 }
2681 
2682 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2683   assert(rscratch != noreg || always_reachable(src2), "missing");
2684 
2685   switch(typ) {
2686     case T_BOOLEAN:
2687     case T_BYTE:
2688       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2689       break;
2690     case T_CHAR:
2691     case T_SHORT:
2692       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2693       break;
2694     case T_INT:
2695     case T_FLOAT:
2696       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2697       break;
2698     case T_LONG:
2699     case T_DOUBLE:
2700       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2701       break;
2702     default:
2703       assert(false,"Should not reach here.");
2704       break;
2705   }
2706 }
2707 
2708 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2709   switch(typ) {
2710     case T_BYTE:
2711       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2712       break;
2713     case T_SHORT:
2714       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2715       break;
2716     case T_INT:
2717     case T_FLOAT:
2718       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2719       break;
2720     case T_LONG:
2721     case T_DOUBLE:
2722       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2723       break;
2724     default:
2725       assert(false,"Should not reach here.");
2726       break;
2727   }
2728 }
2729 
2730 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2731   assert(vlen_in_bytes <= 32, "");
2732   int esize = type2aelembytes(bt);
2733   if (vlen_in_bytes == 32) {
2734     assert(vtmp == xnoreg, "required.");
2735     if (esize >= 4) {
2736       vtestps(src1, src2, AVX_256bit);
2737     } else {
2738       vptest(src1, src2, AVX_256bit);
2739     }
2740     return;
2741   }
2742   if (vlen_in_bytes < 16) {
2743     // Duplicate the lower part to fill the whole register;
2744     // there is no need to do so for src2.
2745     assert(vtmp != xnoreg, "required");
2746     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2747     pshufd(vtmp, src1, shuffle_imm);
2748   } else {
2749     assert(vtmp == xnoreg, "required");
2750     vtmp = src1;
2751   }
2752   if (esize >= 4 && VM_Version::supports_avx()) {
2753     vtestps(vtmp, src2, AVX_128bit);
2754   } else {
2755     ptest(vtmp, src2);
2756   }
2757 }
2758 
2759 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2760 #ifdef ASSERT
2761   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2762   bool is_bw_supported = VM_Version::supports_avx512bw();
2763   if (is_bw && !is_bw_supported) {
2764     assert(vlen_enc != Assembler::AVX_512bit, "required");
2765     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2766            "XMM register should be 0-15");
2767   }
2768 #endif // ASSERT
2769   switch (elem_bt) {
2770     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2771     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2772     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2773     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2774     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2775     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2776     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2777   }
2778 }
2779 
2780 #ifdef _LP64
2781 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2782   assert(UseAVX >= 2, "required");
2783   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2784   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2785   if ((UseAVX > 2) &&
2786       (!is_bw || VM_Version::supports_avx512bw()) &&
2787       (!is_vl || VM_Version::supports_avx512vl())) {
2788     switch (elem_bt) {
2789       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2790       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2791       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2792       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2793       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2794     }
2795   } else {
2796     assert(vlen_enc != Assembler::AVX_512bit, "required");
2797     assert((dst->encoding() < 16),"XMM register should be 0-15");
2798     switch (elem_bt) {
2799       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2800       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2801       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2802       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2803       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2804       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2805       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2806     }
2807   }
2808 }
2809 #endif
2810 
2811 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2812   switch (to_elem_bt) {
2813     case T_SHORT:
2814       vpmovsxbw(dst, src, vlen_enc);
2815       break;
2816     case T_INT:
2817       vpmovsxbd(dst, src, vlen_enc);
2818       break;
2819     case T_FLOAT:
2820       vpmovsxbd(dst, src, vlen_enc);
2821       vcvtdq2ps(dst, dst, vlen_enc);
2822       break;
2823     case T_LONG:
2824       vpmovsxbq(dst, src, vlen_enc);
2825       break;
2826     case T_DOUBLE: {
2827       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2828       vpmovsxbd(dst, src, mid_vlen_enc);
2829       vcvtdq2pd(dst, dst, vlen_enc);
2830       break;
2831     }
2832     default:
2833       fatal("Unsupported type %s", type2name(to_elem_bt));
2834       break;
2835   }
2836 }
2837 
2838 //-------------------------------------------------------------------------------------------
2839 
2840 // IndexOf for constant substrings with size >= 8 chars
2841 // which don't need to be loaded through the stack.
2842 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2843                                          Register cnt1, Register cnt2,
2844                                          int int_cnt2,  Register result,
2845                                          XMMRegister vec, Register tmp,
2846                                          int ae) {
2847   ShortBranchVerifier sbv(this);
2848   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2849   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2850 
2851   // This method uses the pcmpestri instruction with bound registers
2852   //   inputs:
2853   //     xmm - substring
2854   //     rax - substring length (elements count)
2855   //     mem - scanned string
2856   //     rdx - string length (elements count)
2857   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2858   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2859   //   outputs:
2860   //     rcx - matched index in string
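       //   (For reference: in the pcmpestri imm8, bits[1:0] select the element
       //    size - 00 unsigned bytes, 01 unsigned words - and bits[3:2] = 11
       //    select the "equal ordered" aggregation, i.e. substring search.)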
2861   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2862   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2863   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2864   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2865   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2866 
2867   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2868         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2869         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2870 
2871   // Note, inline_string_indexOf() generates checks:
2872   // if (substr.count > string.count) return -1;
2873   // if (substr.count == 0) return 0;
2874   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
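       // Rough scalar equivalent of the search below (a sketch, not the generated code):
       //   for (int i = 0; i + int_cnt2 <= cnt1; i++) {
       //     if (str1[i..i+int_cnt2-1] matches str2[0..int_cnt2-1]) return i;  // element-wise
       //   }
       //   return -1;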
2875 
2876   // Load substring.
2877   if (ae == StrIntrinsicNode::UL) {
2878     pmovzxbw(vec, Address(str2, 0));
2879   } else {
2880     movdqu(vec, Address(str2, 0));
2881   }
2882   movl(cnt2, int_cnt2);
2883   movptr(result, str1); // string addr
2884 
2885   if (int_cnt2 > stride) {
2886     jmpb(SCAN_TO_SUBSTR);
2887 
2888     // Reload substr for rescan; this code
2889     // is executed only for large substrings (> 8 chars).
2890     bind(RELOAD_SUBSTR);
2891     if (ae == StrIntrinsicNode::UL) {
2892       pmovzxbw(vec, Address(str2, 0));
2893     } else {
2894       movdqu(vec, Address(str2, 0));
2895     }
2896     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2897 
2898     bind(RELOAD_STR);
2899     // We came here after the beginning of the substring was
2900     // matched but the rest of it was not so we need to search
2901     // again. Start from the next element after the previous match.
2902 
2903     // cnt2 is the number of remaining substring elements and
2904     // cnt1 is the number of remaining string elements when the compare failed.
2905     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2906     subl(cnt1, cnt2);
2907     addl(cnt1, int_cnt2);
2908     movl(cnt2, int_cnt2); // Now restore cnt2
2909 
2910     decrementl(cnt1);     // Shift to next element
2911     cmpl(cnt1, cnt2);
2912     jcc(Assembler::negative, RET_NOT_FOUND);  // Less left than the substring
2913 
2914     addptr(result, (1<<scale1));
2915 
2916   } // (int_cnt2 > 8)
2917 
2918   // Scan string for start of substr in 16-byte vectors
2919   bind(SCAN_TO_SUBSTR);
2920   pcmpestri(vec, Address(result, 0), mode);
2921   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2922   subl(cnt1, stride);
2923   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2924   cmpl(cnt1, cnt2);
2925   jccb(Assembler::negative, RET_NOT_FOUND);  // Less left than the substring
2926   addptr(result, 16);
2927   jmpb(SCAN_TO_SUBSTR);
2928 
2929   // Found a potential substr
2930   bind(FOUND_CANDIDATE);
2931   // Matched whole vector if first element matched (tmp(rcx) == 0).
2932   if (int_cnt2 == stride) {
2933     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2934   } else { // int_cnt2 > 8
2935     jccb(Assembler::overflow, FOUND_SUBSTR);
2936   }
2937   // After pcmpestri tmp(rcx) contains matched element index
2938   // Compute start addr of substr
2939   lea(result, Address(result, tmp, scale1));
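       // (e.g., for UU, where scale1 == times_2, a candidate at element index
       //  tmp == 3 advances result by 6 bytes)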
2940 
2941   // Make sure string is still long enough
2942   subl(cnt1, tmp);
2943   cmpl(cnt1, cnt2);
2944   if (int_cnt2 == stride) {
2945     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2946   } else { // int_cnt2 > 8
2947     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2948   }
2949   // Less left than the substring.
2950 
2951   bind(RET_NOT_FOUND);
2952   movl(result, -1);
2953   jmp(EXIT);
2954 
2955   if (int_cnt2 > stride) {
2956     // This code is optimized for the case when whole substring
2957     // is matched if its head is matched.
2958     bind(MATCH_SUBSTR_HEAD);
2959     pcmpestri(vec, Address(result, 0), mode);
2960     // Reload only the string if it does not match
2961     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2962 
2963     Label CONT_SCAN_SUBSTR;
2964     // Compare the rest of substring (> 8 chars).
2965     bind(FOUND_SUBSTR);
2966     // First 8 chars are already matched.
2967     negptr(cnt2);
2968     addptr(cnt2, stride);
2969 
2970     bind(SCAN_SUBSTR);
2971     subl(cnt1, stride);
2972     cmpl(cnt2, -stride); // Do not read beyond substring
2973     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2974     // Back-up strings to avoid reading beyond substring:
2975     // cnt1 = cnt1 - cnt2 + 8
2976     addl(cnt1, cnt2); // cnt2 is negative
2977     addl(cnt1, stride);
2978     movl(cnt2, stride); negptr(cnt2);
2979     bind(CONT_SCAN_SUBSTR);
2980     if (int_cnt2 < (int)G) {
2981       int tail_off1 = int_cnt2<<scale1;
2982       int tail_off2 = int_cnt2<<scale2;
2983       if (ae == StrIntrinsicNode::UL) {
2984         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2985       } else {
2986         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2987       }
2988       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2989     } else {
2990       // calculate index in register to avoid integer overflow (int_cnt2*2)
2991       movl(tmp, int_cnt2);
2992       addptr(tmp, cnt2);
2993       if (ae == StrIntrinsicNode::UL) {
2994         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2995       } else {
2996         movdqu(vec, Address(str2, tmp, scale2, 0));
2997       }
2998       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2999     }
3000     // Need to reload the string pointers if the whole vector did not match
3001     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3002     addptr(cnt2, stride);
3003     jcc(Assembler::negative, SCAN_SUBSTR);
3004     // Fall through if found full substring
3005 
3006   } // (int_cnt2 > 8)
3007 
3008   bind(RET_FOUND);
3009   // Found result if we matched full small substring.
3010   // Compute substr offset
3011   subptr(result, str1);
3012   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3013     shrl(result, 1); // index
3014   }
3015   bind(EXIT);
3016 
3017 } // string_indexofC8
3018 
3019 // Small strings are loaded through the stack if they cross a page boundary.
3020 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3021                                        Register cnt1, Register cnt2,
3022                                        int int_cnt2,  Register result,
3023                                        XMMRegister vec, Register tmp,
3024                                        int ae) {
3025   ShortBranchVerifier sbv(this);
3026   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3027   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3028 
3029   //
3030   // int_cnt2 is the length of a small (< 8 chars) constant substring,
3031   // or (-1) for a non-constant substring, in which case its length
3032   // is in the cnt2 register.
3033   //
3034   // Note, inline_string_indexOf() generates checks:
3035   // if (substr.count > string.count) return -1;
3036   // if (substr.count == 0) return 0;
3037   //
3038   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3039   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3040   // This method uses the pcmpestri instruction with bound registers
3041   //   inputs:
3042   //     xmm - substring
3043   //     rax - substring length (elements count)
3044   //     mem - scanned string
3045   //     rdx - string length (elements count)
3046   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3047   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3048   //   outputs:
3049   //     rcx - matched index in string
3050   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3051   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3052   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3053   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3054 
3055   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3056         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3057         FOUND_CANDIDATE;
3058 
3059   { //========================================================
3060     // We don't know where these strings are located
3061     // and we can't read beyond them. Load them through the stack.
3062     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3063 
3064     movptr(tmp, rsp); // save old SP
3065 
3066     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3067       if (int_cnt2 == (1>>scale2)) { // One byte
3068         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3069         load_unsigned_byte(result, Address(str2, 0));
3070         movdl(vec, result); // move 32 bits
3071       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3072         // Not enough header space in 32-bit VM: 12+3 = 15.
3073         movl(result, Address(str2, -1));
3074         shrl(result, 8);
3075         movdl(vec, result); // move 32 bits
3076       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3077         load_unsigned_short(result, Address(str2, 0));
3078         movdl(vec, result); // move 32 bits
3079       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3080         movdl(vec, Address(str2, 0)); // move 32 bits
3081       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3082         movq(vec, Address(str2, 0));  // move 64 bits
3083       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3084         // Array header size is 12 bytes in 32-bit VM
3085         // + 6 bytes for 3 chars == 18 bytes,
3086         // enough space to load vec and shift.
3087         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3088         if (ae == StrIntrinsicNode::UL) {
3089           int tail_off = int_cnt2-8;
3090           pmovzxbw(vec, Address(str2, tail_off));
3091           psrldq(vec, -2*tail_off);
3092         }
3093         else {
3094           int tail_off = int_cnt2*(1<<scale2);
3095           movdqu(vec, Address(str2, tail_off-16));
3096           psrldq(vec, 16-tail_off);
3097         }
3098       }
3099     } else { // not constant substring
3100       cmpl(cnt2, stride);
3101       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3102 
3103       // We can read beyond the string if str+16 does not cross a page boundary
3104       // since heaps are aligned and mapped by pages.
3105       assert(os::vm_page_size() < (int)G, "default page should be small");
3106       movl(result, str2); // We need only low 32 bits
3107       andl(result, ((int)os::vm_page_size()-1));
3108       cmpl(result, ((int)os::vm_page_size()-16));
3109       jccb(Assembler::belowEqual, CHECK_STR);
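           // In C terms (a sketch): if ((str2 & (page_size - 1)) <= page_size - 16)
           // the whole 16-byte load from str2 stays within the current page, so no
           // stack copy is needed and we jump straight to CHECK_STR.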
3110 
3111       // Move small strings to the stack to allow loading 16 bytes into vec.
3112       subptr(rsp, 16);
3113       int stk_offset = wordSize-(1<<scale2);
3114       push(cnt2);
3115 
3116       bind(COPY_SUBSTR);
3117       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3118         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3119         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3120       } else if (ae == StrIntrinsicNode::UU) {
3121         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3122         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3123       }
3124       decrement(cnt2);
3125       jccb(Assembler::notZero, COPY_SUBSTR);
3126 
3127       pop(cnt2);
3128       movptr(str2, rsp);  // New substring address
3129     } // non constant
3130 
3131     bind(CHECK_STR);
3132     cmpl(cnt1, stride);
3133     jccb(Assembler::aboveEqual, BIG_STRINGS);
3134 
3135     // Check cross page boundary.
3136     movl(result, str1); // We need only low 32 bits
3137     andl(result, ((int)os::vm_page_size()-1));
3138     cmpl(result, ((int)os::vm_page_size()-16));
3139     jccb(Assembler::belowEqual, BIG_STRINGS);
3140 
3141     subptr(rsp, 16);
3142     int stk_offset = -(1<<scale1);
3143     if (int_cnt2 < 0) { // not constant
3144       push(cnt2);
3145       stk_offset += wordSize;
3146     }
3147     movl(cnt2, cnt1);
3148 
3149     bind(COPY_STR);
3150     if (ae == StrIntrinsicNode::LL) {
3151       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3152       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3153     } else {
3154       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3155       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3156     }
3157     decrement(cnt2);
3158     jccb(Assembler::notZero, COPY_STR);
3159 
3160     if (int_cnt2 < 0) { // not constant
3161       pop(cnt2);
3162     }
3163     movptr(str1, rsp);  // New string address
3164 
3165     bind(BIG_STRINGS);
3166     // Load substring.
3167     if (int_cnt2 < 0) { // -1
3168       if (ae == StrIntrinsicNode::UL) {
3169         pmovzxbw(vec, Address(str2, 0));
3170       } else {
3171         movdqu(vec, Address(str2, 0));
3172       }
3173       push(cnt2);       // substr count
3174       push(str2);       // substr addr
3175       push(str1);       // string addr
3176     } else {
3177       // Small (< 8 chars) constant substrings are loaded already.
3178       movl(cnt2, int_cnt2);
3179     }
3180     push(tmp);  // original SP
3181 
3182   } // Finished loading
3183 
3184   //========================================================
3185   // Start search
3186   //
3187 
3188   movptr(result, str1); // string addr
3189 
3190   if (int_cnt2  < 0) {  // Only for non constant substring
3191     jmpb(SCAN_TO_SUBSTR);
3192 
3193     // SP saved at sp+0
3194     // String saved at sp+1*wordSize
3195     // Substr saved at sp+2*wordSize
3196     // Substr count saved at sp+3*wordSize
3197 
3198     // Reload substr for rescan; this code
3199     // is executed only for large substrings (> 8 chars).
3200     bind(RELOAD_SUBSTR);
3201     movptr(str2, Address(rsp, 2*wordSize));
3202     movl(cnt2, Address(rsp, 3*wordSize));
3203     if (ae == StrIntrinsicNode::UL) {
3204       pmovzxbw(vec, Address(str2, 0));
3205     } else {
3206       movdqu(vec, Address(str2, 0));
3207     }
3208     // We came here after the beginning of the substring was
3209     // matched but the rest of it was not so we need to search
3210     // again. Start from the next element after the previous match.
3211     subptr(str1, result); // Restore counter
3212     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3213       shrl(str1, 1);
3214     }
3215     addl(cnt1, str1);
3216     decrementl(cnt1);   // Shift to next element
3217     cmpl(cnt1, cnt2);
3218     jcc(Assembler::negative, RET_NOT_FOUND);  // Less left than the substring
3219 
3220     addptr(result, (1<<scale1));
3221   } // non constant
3222 
3223   // Scan string for start of substr in 16-byte vectors
3224   bind(SCAN_TO_SUBSTR);
3225   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3226   pcmpestri(vec, Address(result, 0), mode);
3227   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3228   subl(cnt1, stride);
3229   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3230   cmpl(cnt1, cnt2);
3231   jccb(Assembler::negative, RET_NOT_FOUND);  // Less left than the substring
3232   addptr(result, 16);
3233 
3234   bind(ADJUST_STR);
3235   cmpl(cnt1, stride); // Do not read beyond string
3236   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3237   // Back-up string to avoid reading beyond string.
3238   lea(result, Address(result, cnt1, scale1, -16));
3239   movl(cnt1, stride);
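       // i.e., result now points at the last full 16-byte chunk of the string
       // (result += cnt1*scale1 - 16) and the remaining count is treated as a
       // full stride, so the final vector compare reads exactly to the string end.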
3240   jmpb(SCAN_TO_SUBSTR);
3241 
3242   // Found a potential substr
3243   bind(FOUND_CANDIDATE);
3244   // After pcmpestri tmp(rcx) contains matched element index
3245 
3246   // Make sure string is still long enough
3247   subl(cnt1, tmp);
3248   cmpl(cnt1, cnt2);
3249   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3250   // Less left than the substring.
3251 
3252   bind(RET_NOT_FOUND);
3253   movl(result, -1);
3254   jmp(CLEANUP);
3255 
3256   bind(FOUND_SUBSTR);
3257   // Compute start addr of substr
3258   lea(result, Address(result, tmp, scale1));
3259   if (int_cnt2 > 0) { // Constant substring
3260     // Repeat search for small substring (< 8 chars)
3261     // from new point without reloading substring.
3262     // Have to check that we don't read beyond string.
3263     cmpl(tmp, stride-int_cnt2);
3264     jccb(Assembler::greater, ADJUST_STR);
3265     // Fall through if matched whole substring.
3266   } else { // non constant
3267     assert(int_cnt2 == -1, "should be != 0");
3268 
3269     addl(tmp, cnt2);
3270     // Found result if we matched whole substring.
3271     cmpl(tmp, stride);
3272     jcc(Assembler::lessEqual, RET_FOUND);
3273 
3274     // Repeat search for small substring (<= 8 chars)
3275     // from new point 'str1' without reloading substring.
3276     cmpl(cnt2, stride);
3277     // Have to check that we don't read beyond string.
3278     jccb(Assembler::lessEqual, ADJUST_STR);
3279 
3280     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3281     // Compare the rest of substring (> 8 chars).
3282     movptr(str1, result);
3283 
3284     cmpl(tmp, cnt2);
3285     // First 8 chars are already matched.
3286     jccb(Assembler::equal, CHECK_NEXT);
3287 
3288     bind(SCAN_SUBSTR);
3289     pcmpestri(vec, Address(str1, 0), mode);
3290     // Need to reload the string pointers if the whole vector did not match
3291     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3292 
3293     bind(CHECK_NEXT);
3294     subl(cnt2, stride);
3295     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3296     addptr(str1, 16);
3297     if (ae == StrIntrinsicNode::UL) {
3298       addptr(str2, 8);
3299     } else {
3300       addptr(str2, 16);
3301     }
3302     subl(cnt1, stride);
3303     cmpl(cnt2, stride); // Do not read beyond substring
3304     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3305     // Back-up strings to avoid reading beyond substring.
3306 
3307     if (ae == StrIntrinsicNode::UL) {
3308       lea(str2, Address(str2, cnt2, scale2, -8));
3309       lea(str1, Address(str1, cnt2, scale1, -16));
3310     } else {
3311       lea(str2, Address(str2, cnt2, scale2, -16));
3312       lea(str1, Address(str1, cnt2, scale1, -16));
3313     }
3314     subl(cnt1, cnt2);
3315     movl(cnt2, stride);
3316     addl(cnt1, stride);
3317     bind(CONT_SCAN_SUBSTR);
3318     if (ae == StrIntrinsicNode::UL) {
3319       pmovzxbw(vec, Address(str2, 0));
3320     } else {
3321       movdqu(vec, Address(str2, 0));
3322     }
3323     jmp(SCAN_SUBSTR);
3324 
3325     bind(RET_FOUND_LONG);
3326     movptr(str1, Address(rsp, wordSize));
3327   } // non constant
3328 
3329   bind(RET_FOUND);
3330   // Compute substr offset
3331   subptr(result, str1);
3332   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3333     shrl(result, 1); // index
3334   }
3335   bind(CLEANUP);
3336   pop(rsp); // restore SP
3337 
3338 } // string_indexof
3339 
3340 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3341                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3342   ShortBranchVerifier sbv(this);
3343   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3344 
3345   int stride = 8;
3346 
3347   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3348         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3349         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3350         FOUND_SEQ_CHAR, DONE_LABEL;
3351 
3352   movptr(result, str1);
3353   if (UseAVX >= 2) {
3354     cmpl(cnt1, stride);
3355     jcc(Assembler::less, SCAN_TO_CHAR);
3356     cmpl(cnt1, 2*stride);
3357     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3358     movdl(vec1, ch);
3359     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3360     vpxor(vec2, vec2);
3361     movl(tmp, cnt1);
3362     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3363     andl(cnt1,0x0000000F);  //tail count (in chars)
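         // (e.g., cnt1 == 37 splits into tmp == 32 chars for the 16-char vector
         //  loop below, i.e. two iterations, and cnt1 == 5 tail chars)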
3364 
3365     bind(SCAN_TO_16_CHAR_LOOP);
3366     vmovdqu(vec3, Address(result, 0));
3367     vpcmpeqw(vec3, vec3, vec1, 1);
3368     vptest(vec2, vec3);
3369     jcc(Assembler::carryClear, FOUND_CHAR);
3370     addptr(result, 32);
3371     subl(tmp, 2*stride);
3372     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3373     jmp(SCAN_TO_8_CHAR);
3374     bind(SCAN_TO_8_CHAR_INIT);
3375     movdl(vec1, ch);
3376     pshuflw(vec1, vec1, 0x00);
3377     pshufd(vec1, vec1, 0);
3378     pxor(vec2, vec2);
3379   }
3380   bind(SCAN_TO_8_CHAR);
3381   cmpl(cnt1, stride);
3382   jcc(Assembler::less, SCAN_TO_CHAR);
3383   if (UseAVX < 2) {
3384     movdl(vec1, ch);
3385     pshuflw(vec1, vec1, 0x00);
3386     pshufd(vec1, vec1, 0);
3387     pxor(vec2, vec2);
3388   }
3389   movl(tmp, cnt1);
3390   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3391   andl(cnt1,0x00000007);  //tail count (in chars)
3392 
3393   bind(SCAN_TO_8_CHAR_LOOP);
3394   movdqu(vec3, Address(result, 0));
3395   pcmpeqw(vec3, vec1);
3396   ptest(vec2, vec3);
3397   jcc(Assembler::carryClear, FOUND_CHAR);
3398   addptr(result, 16);
3399   subl(tmp, stride);
3400   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3401   bind(SCAN_TO_CHAR);
3402   testl(cnt1, cnt1);
3403   jcc(Assembler::zero, RET_NOT_FOUND);
3404   bind(SCAN_TO_CHAR_LOOP);
3405   load_unsigned_short(tmp, Address(result, 0));
3406   cmpl(ch, tmp);
3407   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3408   addptr(result, 2);
3409   subl(cnt1, 1);
3410   jccb(Assembler::zero, RET_NOT_FOUND);
3411   jmp(SCAN_TO_CHAR_LOOP);
3412 
3413   bind(RET_NOT_FOUND);
3414   movl(result, -1);
3415   jmpb(DONE_LABEL);
3416 
3417   bind(FOUND_CHAR);
3418   if (UseAVX >= 2) {
3419     vpmovmskb(tmp, vec3);
3420   } else {
3421     pmovmskb(tmp, vec3);
3422   }
3423   bsfl(ch, tmp);
3424   addptr(result, ch);
3425 
3426   bind(FOUND_SEQ_CHAR);
3427   subptr(result, str1);
3428   shrl(result, 1);
3429 
3430   bind(DONE_LABEL);
3431 } // string_indexof_char
3432 
3433 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3434                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3435   ShortBranchVerifier sbv(this);
3436   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3437 
3438   int stride = 16;
3439 
3440   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3441         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3442         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3443         FOUND_SEQ_CHAR, DONE_LABEL;
3444 
3445   movptr(result, str1);
3446   if (UseAVX >= 2) {
3447     cmpl(cnt1, stride);
3448     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3449     cmpl(cnt1, stride*2);
3450     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3451     movdl(vec1, ch);
3452     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3453     vpxor(vec2, vec2);
3454     movl(tmp, cnt1);
3455     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3456     andl(cnt1,0x0000001F);  //tail count (in chars)
3457 
3458     bind(SCAN_TO_32_CHAR_LOOP);
3459     vmovdqu(vec3, Address(result, 0));
3460     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3461     vptest(vec2, vec3);
3462     jcc(Assembler::carryClear, FOUND_CHAR);
3463     addptr(result, 32);
3464     subl(tmp, stride*2);
3465     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3466     jmp(SCAN_TO_16_CHAR);
3467 
3468     bind(SCAN_TO_16_CHAR_INIT);
3469     movdl(vec1, ch);
3470     pxor(vec2, vec2);
3471     pshufb(vec1, vec2);
3472   }
3473 
3474   bind(SCAN_TO_16_CHAR);
3475   cmpl(cnt1, stride);
3476   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3477   if (UseAVX < 2) {
3478     movdl(vec1, ch);
3479     pxor(vec2, vec2);
3480     pshufb(vec1, vec2);
3481   }
3482   movl(tmp, cnt1);
3483   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3484   andl(cnt1,0x0000000F);  //tail count (in bytes)
3485 
3486   bind(SCAN_TO_16_CHAR_LOOP);
3487   movdqu(vec3, Address(result, 0));
3488   pcmpeqb(vec3, vec1);
3489   ptest(vec2, vec3);
3490   jcc(Assembler::carryClear, FOUND_CHAR);
3491   addptr(result, 16);
3492   subl(tmp, stride);
3493   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3494 
3495   bind(SCAN_TO_CHAR_INIT);
3496   testl(cnt1, cnt1);
3497   jcc(Assembler::zero, RET_NOT_FOUND);
3498   bind(SCAN_TO_CHAR_LOOP);
3499   load_unsigned_byte(tmp, Address(result, 0));
3500   cmpl(ch, tmp);
3501   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3502   addptr(result, 1);
3503   subl(cnt1, 1);
3504   jccb(Assembler::zero, RET_NOT_FOUND);
3505   jmp(SCAN_TO_CHAR_LOOP);
3506 
3507   bind(RET_NOT_FOUND);
3508   movl(result, -1);
3509   jmpb(DONE_LABEL);
3510 
3511   bind(FOUND_CHAR);
3512   if (UseAVX >= 2) {
3513     vpmovmskb(tmp, vec3);
3514   } else {
3515     pmovmskb(tmp, vec3);
3516   }
3517   bsfl(ch, tmp);
3518   addptr(result, ch);
3519 
3520   bind(FOUND_SEQ_CHAR);
3521   subptr(result, str1);
3522 
3523   bind(DONE_LABEL);
3524 } // stringL_indexof_char
3525 
3526 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3527   switch (eltype) {
3528   case T_BOOLEAN: return sizeof(jboolean);
3529   case T_BYTE:  return sizeof(jbyte);
3530   case T_SHORT: return sizeof(jshort);
3531   case T_CHAR:  return sizeof(jchar);
3532   case T_INT:   return sizeof(jint);
3533   default:
3534     ShouldNotReachHere();
3535     return -1;
3536   }
3537 }
3538 
3539 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3540   switch (eltype) {
3541   // T_BOOLEAN used as surrogate for unsigned byte
3542   case T_BOOLEAN: movzbl(dst, src);   break;
3543   case T_BYTE:    movsbl(dst, src);   break;
3544   case T_SHORT:   movswl(dst, src);   break;
3545   case T_CHAR:    movzwl(dst, src);   break;
3546   case T_INT:     movl(dst, src);     break;
3547   default:
3548     ShouldNotReachHere();
3549   }
3550 }
3551 
3552 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3553   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3554 }
3555 
3556 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3557   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3558 }
3559 
3560 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3561   const int vlen = Assembler::AVX_256bit;
3562   switch (eltype) {
3563   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3564   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3565   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3566   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3567   case T_INT:
3568     // do nothing
3569     break;
3570   default:
3571     ShouldNotReachHere();
3572   }
3573 }
3574 
3575 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3576                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3577                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3578                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3579                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3580                                         BasicType eltype) {
3581   ShortBranchVerifier sbv(this);
3582   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3583   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3584   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3585 
3586   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3587         SHORT_UNROLLED_LOOP_EXIT,
3588         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3589         UNROLLED_VECTOR_LOOP_BEGIN,
3590         END;
3591   switch (eltype) {
3592   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3593   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3594   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3595   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3596   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3597   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3598   }
3599 
3600   // "Renaming" for readability of the code
3601   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3602                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3603                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3604 
3605   const int elsize = arrays_hashcode_elsize(eltype);
3606 
3607   /*
3608     if (cnt1 >= 2) {
3609       if (cnt1 >= 32) {
3610         UNROLLED VECTOR LOOP
3611       }
3612       UNROLLED SCALAR LOOP
3613     }
3614     SINGLE SCALAR
3615    */
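       // Scalar reference for all paths below (given the incoming partial hash
       // in 'result'): for each element e, result = 31 * result + e.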
3616 
3617   cmpl(cnt1, 32);
3618   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3619 
3620   // cnt1 >= 32 && generate_vectorized_loop
3621   xorl(index, index);
3622 
3623   // vresult = IntVector.zero(I256);
3624   for (int idx = 0; idx < 4; idx++) {
3625     vpxor(vresult[idx], vresult[idx]);
3626   }
3627   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3628   Register bound = tmp2;
3629   Register next = tmp3;
3630   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3631   movl(next, Address(tmp2, 0));
3632   movdl(vnext, next);
3633   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3634 
3635   // index = 0;
3636   // bound = cnt1 & ~(32 - 1);
3637   movl(bound, cnt1);
3638   andl(bound, ~(32 - 1));
3639   // for (; index < bound; index += 32) {
3640   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3641   // result *= next;
3642   imull(result, next);
3643   // loop fission to upfront the cost of fetching from memory, OOO execution
3644   // can then hopefully do a better job of prefetching
3645   for (int idx = 0; idx < 4; idx++) {
3646     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3647   }
3648   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3649   for (int idx = 0; idx < 4; idx++) {
3650     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3651     arrays_hashcode_elvcast(vtmp[idx], eltype);
3652     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3653   }
3654   // index += 32;
3655   addl(index, 32);
3656   // index < bound;
3657   cmpl(index, bound);
3658   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3659   // }
3660 
3661   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3662   subl(cnt1, bound);
3663   // release bound
3664 
3665   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3666   for (int idx = 0; idx < 4; idx++) {
3667     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3668     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3669     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3670   }
3671   // result += vresult.reduceLanes(ADD);
3672   for (int idx = 0; idx < 4; idx++) {
3673     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3674   }
3675 
3676   // } else if (cnt1 < 32) {
3677 
3678   bind(SHORT_UNROLLED_BEGIN);
3679   // int i = 1;
3680   movl(index, 1);
3681   cmpl(index, cnt1);
3682   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3683 
3684   // for (; i < cnt1 ; i += 2) {
3685   bind(SHORT_UNROLLED_LOOP_BEGIN);
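       // Two elements per iteration (a sketch of the arithmetic below):
       //   result = 31*31*result + 31*a[i-1] + a[i]
       // with 961 == 31*31 and 31*x computed as (x << 5) - x.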
3686   movl(tmp3, 961);
3687   imull(result, tmp3);
3688   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3689   movl(tmp3, tmp2);
3690   shll(tmp3, 5);
3691   subl(tmp3, tmp2);
3692   addl(result, tmp3);
3693   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3694   addl(result, tmp3);
3695   addl(index, 2);
3696   cmpl(index, cnt1);
3697   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3698 
3699   // }
3700   // if (i >= cnt1) {
3701   bind(SHORT_UNROLLED_LOOP_EXIT);
3702   jccb(Assembler::greater, END);
3703   movl(tmp2, result);
3704   shll(result, 5);
3705   subl(result, tmp2);
3706   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3707   addl(result, tmp3);
3708   // }
3709   bind(END);
3710 
3711   BLOCK_COMMENT("} // arrays_hashcode");
3712 
3713 } // arrays_hashcode
3714 
3715 // helper function for string_compare
3716 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3717                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3718                                            Address::ScaleFactor scale2, Register index, int ae) {
3719   if (ae == StrIntrinsicNode::LL) {
3720     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3721     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3722   } else if (ae == StrIntrinsicNode::UU) {
3723     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3724     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3725   } else {
3726     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3727     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3728   }
3729 }
3730 
3731 // Compare strings, used for char[] and byte[].
3732 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3733                                        Register cnt1, Register cnt2, Register result,
3734                                        XMMRegister vec1, int ae, KRegister mask) {
3735   ShortBranchVerifier sbv(this);
3736   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3737   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3738   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3739   int stride2x2 = 0x40;
3740   Address::ScaleFactor scale = Address::no_scale;
3741   Address::ScaleFactor scale1 = Address::no_scale;
3742   Address::ScaleFactor scale2 = Address::no_scale;
3743 
3744   if (ae != StrIntrinsicNode::LL) {
3745     stride2x2 = 0x20;
3746   }
3747 
3748   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3749     shrl(cnt2, 1);
3750   }
3751   // Compute the minimum of the string lengths and the
3752   // difference of the string lengths (stack).
3753   // Do the conditional move stuff
3754   movl(result, cnt1);
3755   subl(cnt1, cnt2);
3756   push(cnt1);
3757   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3758 
3759   // Is the minimum length zero?
3760   testl(cnt2, cnt2);
3761   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3762   if (ae == StrIntrinsicNode::LL) {
3763     // Load first bytes
3764     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3765     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3766   } else if (ae == StrIntrinsicNode::UU) {
3767     // Load first characters
3768     load_unsigned_short(result, Address(str1, 0));
3769     load_unsigned_short(cnt1, Address(str2, 0));
3770   } else {
3771     load_unsigned_byte(result, Address(str1, 0));
3772     load_unsigned_short(cnt1, Address(str2, 0));
3773   }
3774   subl(result, cnt1);
3775   jcc(Assembler::notZero,  POP_LABEL);
3776 
3777   if (ae == StrIntrinsicNode::UU) {
3778     // Divide length by 2 to get number of chars
3779     shrl(cnt2, 1);
3780   }
3781   cmpl(cnt2, 1);
3782   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3783 
3784   // Check if the strings start at the same location and setup scale and stride
3785   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3786     cmpptr(str1, str2);
3787     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3788     if (ae == StrIntrinsicNode::LL) {
3789       scale = Address::times_1;
3790       stride = 16;
3791     } else {
3792       scale = Address::times_2;
3793       stride = 8;
3794     }
3795   } else {
3796     scale1 = Address::times_1;
3797     scale2 = Address::times_2;
3798     // scale not used
3799     stride = 8;
3800   }
3801 
3802   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3803     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3804     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3805     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3806     Label COMPARE_TAIL_LONG;
3807     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3808 
3809     int pcmpmask = 0x19;
3810     if (ae == StrIntrinsicNode::LL) {
3811       pcmpmask &= ~0x01;
3812     }
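         // (clearing bit 0 of the imm8 switches pcmpestri from unsigned 16-bit
         //  elements to unsigned 8-bit elements for the LL case)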
3813 
3814     // Setup to compare 16-char (32-byte) vectors,
3815     // start from first character again because it has aligned address.
3816     if (ae == StrIntrinsicNode::LL) {
3817       stride2 = 32;
3818     } else {
3819       stride2 = 16;
3820     }
3821     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3822       adr_stride = stride << scale;
3823     } else {
3824       adr_stride1 = 8;  //stride << scale1;
3825       adr_stride2 = 16; //stride << scale2;
3826     }
3827 
3828     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3829     // rax and rdx are used by pcmpestri as elements counters
3830     movl(result, cnt2);
3831     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3832     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3833 
3834     // fast path : compare first 2 8-char vectors.
3835     bind(COMPARE_16_CHARS);
3836     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3837       movdqu(vec1, Address(str1, 0));
3838     } else {
3839       pmovzxbw(vec1, Address(str1, 0));
3840     }
3841     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3842     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3843 
3844     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3845       movdqu(vec1, Address(str1, adr_stride));
3846       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3847     } else {
3848       pmovzxbw(vec1, Address(str1, adr_stride1));
3849       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3850     }
3851     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3852     addl(cnt1, stride);
3853 
3854     // Compare the characters at index in cnt1
3855     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3856     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3857     subl(result, cnt2);
3858     jmp(POP_LABEL);
3859 
3860     // Setup the registers to start vector comparison loop
3861     bind(COMPARE_WIDE_VECTORS);
3862     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3863       lea(str1, Address(str1, result, scale));
3864       lea(str2, Address(str2, result, scale));
3865     } else {
3866       lea(str1, Address(str1, result, scale1));
3867       lea(str2, Address(str2, result, scale2));
3868     }
3869     subl(result, stride2);
3870     subl(cnt2, stride2);
3871     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3872     negptr(result);
3873 
3874     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3875     bind(COMPARE_WIDE_VECTORS_LOOP);
3876 
3877 #ifdef _LP64
3878     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
3879       cmpl(cnt2, stride2x2);
3880       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3881       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3882       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3883 
3884       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3885       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3886         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3887         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3888       } else {
3889         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3890         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3891       }
3892       kortestql(mask, mask);
3893       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3894       addptr(result, stride2x2);  // update since we already compared at this addr
3895       subl(cnt2, stride2x2);      // and sub the size too
3896       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3897 
3898       vpxor(vec1, vec1);
3899       jmpb(COMPARE_WIDE_TAIL);
3900     }//if (VM_Version::supports_avx512vlbw())
3901 #endif // _LP64
3902 
3903 
3904     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3905     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3906       vmovdqu(vec1, Address(str1, result, scale));
3907       vpxor(vec1, Address(str2, result, scale));
3908     } else {
3909       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3910       vpxor(vec1, Address(str2, result, scale2));
3911     }
3912     vptest(vec1, vec1);
3913     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3914     addptr(result, stride2);
3915     subl(cnt2, stride2);
3916     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3917     // clean upper bits of YMM registers
3918     vpxor(vec1, vec1);
3919 
3920     // compare wide vectors tail
3921     bind(COMPARE_WIDE_TAIL);
3922     testptr(result, result);
3923     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3924 
3925     movl(result, stride2);
3926     movl(cnt2, result);
3927     negptr(result);
3928     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3929 
3930     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3931     bind(VECTOR_NOT_EQUAL);
3932     // clean upper bits of YMM registers
3933     vpxor(vec1, vec1);
3934     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3935       lea(str1, Address(str1, result, scale));
3936       lea(str2, Address(str2, result, scale));
3937     } else {
3938       lea(str1, Address(str1, result, scale1));
3939       lea(str2, Address(str2, result, scale2));
3940     }
3941     jmp(COMPARE_16_CHARS);
3942 
3943     // Compare tail chars, length between 1 and 15 chars
3944     bind(COMPARE_TAIL_LONG);
3945     movl(cnt2, result);
3946     cmpl(cnt2, stride);
3947     jcc(Assembler::less, COMPARE_SMALL_STR);
3948 
3949     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3950       movdqu(vec1, Address(str1, 0));
3951     } else {
3952       pmovzxbw(vec1, Address(str1, 0));
3953     }
3954     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3955     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3956     subptr(cnt2, stride);
3957     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3958     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3959       lea(str1, Address(str1, result, scale));
3960       lea(str2, Address(str2, result, scale));
3961     } else {
3962       lea(str1, Address(str1, result, scale1));
3963       lea(str2, Address(str2, result, scale2));
3964     }
3965     negptr(cnt2);
3966     jmpb(WHILE_HEAD_LABEL);
3967 
3968     bind(COMPARE_SMALL_STR);
3969   } else if (UseSSE42Intrinsics) {
3970     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3971     int pcmpmask = 0x19;
3972     // Setup to compare 8-char (16-byte) vectors,
3973     // start from first character again because it has aligned address.
3974     movl(result, cnt2);
3975     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3976     if (ae == StrIntrinsicNode::LL) {
3977       pcmpmask &= ~0x01;
3978     }
3979     jcc(Assembler::zero, COMPARE_TAIL);
3980     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3981       lea(str1, Address(str1, result, scale));
3982       lea(str2, Address(str2, result, scale));
3983     } else {
3984       lea(str1, Address(str1, result, scale1));
3985       lea(str2, Address(str2, result, scale2));
3986     }
3987     negptr(result);
3988 
3989     // pcmpestri
3990     //   inputs:
3991     //     vec1- substring
3992     //     rax - negative string length (elements count)
3993     //     mem - scanned string
3994     //     rdx - string length (elements count)
3995     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3996     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3997     //   outputs:
3998     //     rcx - first mismatched element index
3999     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4000 
4001     bind(COMPARE_WIDE_VECTORS);
4002     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4003       movdqu(vec1, Address(str1, result, scale));
4004       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4005     } else {
4006       pmovzxbw(vec1, Address(str1, result, scale1));
4007       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4008     }
4009     // After pcmpestri cnt1(rcx) contains mismatched element index
4010 
4011     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4012     addptr(result, stride);
4013     subptr(cnt2, stride);
4014     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4015 
4016     // compare wide vectors tail
4017     testptr(result, result);
4018     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4019 
4020     movl(cnt2, stride);
4021     movl(result, stride);
4022     negptr(result);
4023     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4024       movdqu(vec1, Address(str1, result, scale));
4025       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4026     } else {
4027       pmovzxbw(vec1, Address(str1, result, scale1));
4028       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4029     }
4030     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4031 
4032     // Mismatched characters in the vectors
4033     bind(VECTOR_NOT_EQUAL);
4034     addptr(cnt1, result);
4035     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4036     subl(result, cnt2);
4037     jmpb(POP_LABEL);
4038 
4039     bind(COMPARE_TAIL); // limit is zero
4040     movl(cnt2, result);
4041     // Fallthru to tail compare
4042   }
4043   // Shift str2 and str1 to the end of the arrays, negate min
4044   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4045     lea(str1, Address(str1, cnt2, scale));
4046     lea(str2, Address(str2, cnt2, scale));
4047   } else {
4048     lea(str1, Address(str1, cnt2, scale1));
4049     lea(str2, Address(str2, cnt2, scale2));
4050   }
4051   decrementl(cnt2);  // first character was compared already
4052   negptr(cnt2);
4053 
4054   // Compare the rest of the elements
4055   bind(WHILE_HEAD_LABEL);
4056   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4057   subl(result, cnt1);
4058   jccb(Assembler::notZero, POP_LABEL);
4059   increment(cnt2);
4060   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4061 
4062   // Strings are equal up to min length.  Return the length difference.
4063   bind(LENGTH_DIFF_LABEL);
4064   pop(result);
4065   if (ae == StrIntrinsicNode::UU) {
4066     // Divide diff by 2 to get number of chars
4067     sarl(result, 1);
4068   }
4069   jmpb(DONE_LABEL);
4070 
4071 #ifdef _LP64
4072   if (VM_Version::supports_avx512vlbw()) {
4073 
4074     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4075 
4076     kmovql(cnt1, mask);
4077     notq(cnt1);
4078     bsfq(cnt2, cnt1);
4079     if (ae != StrIntrinsicNode::LL) {
4080       // Divide diff by 2 to get number of chars
4081       sarl(cnt2, 1);
4082     }
4083     addq(result, cnt2);
4084     if (ae == StrIntrinsicNode::LL) {
4085       load_unsigned_byte(cnt1, Address(str2, result));
4086       load_unsigned_byte(result, Address(str1, result));
4087     } else if (ae == StrIntrinsicNode::UU) {
4088       load_unsigned_short(cnt1, Address(str2, result, scale));
4089       load_unsigned_short(result, Address(str1, result, scale));
4090     } else {
4091       load_unsigned_short(cnt1, Address(str2, result, scale2));
4092       load_unsigned_byte(result, Address(str1, result, scale1));
4093     }
4094     subl(result, cnt1);
4095     jmpb(POP_LABEL);
4096   }//if (VM_Version::supports_avx512vlbw())
4097 #endif // _LP64
4098 
4099   // Discard the stored length difference
4100   bind(POP_LABEL);
4101   pop(cnt1);
4102 
4103   // That's it
4104   bind(DONE_LABEL);
4105   if(ae == StrIntrinsicNode::UL) {
4106     negl(result);
4107   }
4108 
4109 }
4110 
4111 // Search for a non-ASCII character (negative byte value) in a byte array,
4112 // returning the index of the first such character, otherwise the length
4113 // of the array segment searched.
4114 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4115 //   @IntrinsicCandidate
4116 //   public static int countPositives(byte[] ba, int off, int len) {
4117 //     for (int i = off; i < off + len; i++) {
4118 //       if (ba[i] < 0) {
4119 //         return i - off;
4120 //       }
4121 //     }
4122 //     return len;
4123 //   }
4124 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4125   Register result, Register tmp1,
4126   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4127   // rsi: byte array
4128   // rcx: len
4129   // rax: result
4130   ShortBranchVerifier sbv(this);
4131   assert_different_registers(ary1, len, result, tmp1);
4132   assert_different_registers(vec1, vec2);
4133   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4134 
4135   movl(result, len); // copy
4136   // len == 0
4137   testl(len, len);
4138   jcc(Assembler::zero, DONE);
4139 
4140   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4141     VM_Version::supports_avx512vlbw() &&
4142     VM_Version::supports_bmi2()) {
4143 
4144     Label test_64_loop, test_tail, BREAK_LOOP;
4145     movl(tmp1, len);
4146     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4147 
4148     andl(tmp1, 0x0000003f); // tail count (in bytes) 0x3F
4149     andl(len,  0xffffffc0); // vector count (in bytes)
4150     jccb(Assembler::zero, test_tail);
4151 
4152     lea(ary1, Address(ary1, len, Address::times_1));
4153     negptr(len);
4154 
4155     bind(test_64_loop);
4156     // Check whether these 64 byte-sized elements contain any negatives
4157     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4158     kortestql(mask1, mask1);
4159     jcc(Assembler::notZero, BREAK_LOOP);
4160 
4161     addptr(len, 64);
4162     jccb(Assembler::notZero, test_64_loop);
4163 
4164     bind(test_tail);
4165     // bail out when there is nothing to be done
4166     testl(tmp1, -1);
4167     jcc(Assembler::zero, DONE);
4168
4170     // check the tail for absence of negatives
4171     // ~(~0 << len) applied up to two times (for 32-bit scenario)
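    // Worked example (illustrative): for a tail count of 5, ~0 << 5 == ...11100000, so
    // ~(~0 << 5) == 0b11111, i.e. a mask with exactly the 5 lowest bits set.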
4172 #ifdef _LP64
4173     {
4174       Register tmp3_aliased = len;
4175       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4176       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4177       notq(tmp3_aliased);
4178       kmovql(mask2, tmp3_aliased);
4179     }
4180 #else
4181     Label k_init;
4182     jmp(k_init);
4183 
4184     // We cannot read 64 bits from a general purpose register on this platform, so we
4185     // place the data required to compose the 64-bit mask into the instruction stream.
4186     // We emit a 64-byte series of the values 0..63, which is later used as the
4187     // compare target against the tail count held in the tmp1 register.
4188     // The result is a k register with tmp1 consecutive 1 bits, counting from the
4189     // least significant bit.
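    // Example (illustrative): with tmp1 == 5, the evpbroadcastb/evpcmpgtb pair below fills
    // vec1 with 5 in every byte lane and compares it against the emitted 0..63 sequence,
    // setting exactly the 5 lowest bits of mask2, just as the shlx/not construction does
    // on the 64-bit path.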
4190     address tmp = pc();
4191     emit_int64(0x0706050403020100);
4192     emit_int64(0x0F0E0D0C0B0A0908);
4193     emit_int64(0x1716151413121110);
4194     emit_int64(0x1F1E1D1C1B1A1918);
4195     emit_int64(0x2726252423222120);
4196     emit_int64(0x2F2E2D2C2B2A2928);
4197     emit_int64(0x3736353433323130);
4198     emit_int64(0x3F3E3D3C3B3A3938);
4199 
4200     bind(k_init);
4201     lea(len, InternalAddress(tmp));
4202     // create mask to test for negative byte inside a vector
4203     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4204     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4205 
4206 #endif
4207     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4208     ktestq(mask1, mask2);
4209     jcc(Assembler::zero, DONE);
4210 
4211     // do a full check for negative registers in the tail
4212     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4213                      // ary1 is already pointing to the right place
4214     jmpb(TAIL_START);
4215 
4216     bind(BREAK_LOOP);
4217     // At least one byte in the last 64-byte block was negative.
4218     // Set up to look at the last 64 bytes as if they were a tail
4219     lea(ary1, Address(ary1, len, Address::times_1));
4220     addptr(result, len);
4221     // Ignore the very last byte: if all others are positive,
4222     // it must be negative, so we can skip right to the 2+1 byte
4223     // end comparison at this point
4224     orl(result, 63);
4225     movl(len, 63);
4226     // Fallthru to tail compare
4227   } else {
4228 
4229     if (UseAVX >= 2 && UseSSE >= 2) {
4230       // With AVX2, use 32-byte vector compare
4231       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4232 
4233       // Compare 32-byte vectors
4234       testl(len, 0xffffffe0);   // vector count (in bytes)
4235       jccb(Assembler::zero, TAIL_START);
4236 
4237       andl(len, 0xffffffe0);
4238       lea(ary1, Address(ary1, len, Address::times_1));
4239       negptr(len);
4240 
4241       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4242       movdl(vec2, tmp1);
4243       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4244 
4245       bind(COMPARE_WIDE_VECTORS);
4246       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4247       vptest(vec1, vec2);
4248       jccb(Assembler::notZero, BREAK_LOOP);
4249       addptr(len, 32);
4250       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4251 
4252       testl(result, 0x0000001f);   // any bytes remaining?
4253       jcc(Assembler::zero, DONE);
4254 
4255       // Quick test using the already prepared vector mask
4256       movl(len, result);
4257       andl(len, 0x0000001f);
4258       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4259       vptest(vec1, vec2);
4260       jcc(Assembler::zero, DONE);
4261       // There are zeros, jump to the tail to determine exactly where
4262       jmpb(TAIL_START);
4263 
4264       bind(BREAK_LOOP);
4265       // At least one byte in the last 32-byte vector is negative.
4266       // Set up to look at the last 32 bytes as if they were a tail
4267       lea(ary1, Address(ary1, len, Address::times_1));
4268       addptr(result, len);
4269       // Ignore the very last byte: if all others are positive,
4270       // it must be negative, so we can skip right to the 2+1 byte
4271       // end comparison at this point
4272       orl(result, 31);
4273       movl(len, 31);
4274       // Fallthru to tail compare
4275     } else if (UseSSE42Intrinsics) {
4276       // With SSE4.2, use double quad vector compare
4277       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4278 
4279       // Compare 16-byte vectors
4280       testl(len, 0xfffffff0);   // vector count (in bytes)
4281       jcc(Assembler::zero, TAIL_START);
4282 
4283       andl(len, 0xfffffff0);
4284       lea(ary1, Address(ary1, len, Address::times_1));
4285       negptr(len);
4286 
4287       movl(tmp1, 0x80808080);
4288       movdl(vec2, tmp1);
4289       pshufd(vec2, vec2, 0);
4290 
4291       bind(COMPARE_WIDE_VECTORS);
4292       movdqu(vec1, Address(ary1, len, Address::times_1));
4293       ptest(vec1, vec2);
4294       jccb(Assembler::notZero, BREAK_LOOP);
4295       addptr(len, 16);
4296       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4297 
4298       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4299       jcc(Assembler::zero, DONE);
4300 
4301       // Quick test using the already prepared vector mask
4302       movl(len, result);
4303       andl(len, 0x0000000f);   // tail count (in bytes)
4304       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4305       ptest(vec1, vec2);
4306       jcc(Assembler::zero, DONE);
4307       jmpb(TAIL_START);
4308 
4309       bind(BREAK_LOOP);
4310       // At least one byte in the last 16-byte vector is negative.
4311       // Set up and look at the last 16 bytes as if they were a tail
4312       lea(ary1, Address(ary1, len, Address::times_1));
4313       addptr(result, len);
4314       // Ignore the very last byte: if all others are positive,
4315       // it must be negative, so we can skip right to the 2+1 byte
4316       // end comparison at this point
4317       orl(result, 15);
4318       movl(len, 15);
4319       // Fallthru to tail compare
4320     }
4321   }
4322 
4323   bind(TAIL_START);
4324   // Compare 4-byte vectors
4325   andl(len, 0xfffffffc); // vector count (in bytes)
4326   jccb(Assembler::zero, COMPARE_CHAR);
4327 
4328   lea(ary1, Address(ary1, len, Address::times_1));
4329   negptr(len);
4330 
4331   bind(COMPARE_VECTORS);
4332   movl(tmp1, Address(ary1, len, Address::times_1));
4333   andl(tmp1, 0x80808080);
4334   jccb(Assembler::notZero, TAIL_ADJUST);
4335   addptr(len, 4);
4336   jccb(Assembler::notZero, COMPARE_VECTORS);
4337 
4338   // Compare trailing char (final 2-3 bytes), if any
4339   bind(COMPARE_CHAR);
4340 
4341   testl(result, 0x2);   // tail  char
4342   jccb(Assembler::zero, COMPARE_BYTE);
4343   load_unsigned_short(tmp1, Address(ary1, 0));
4344   andl(tmp1, 0x00008080);
4345   jccb(Assembler::notZero, CHAR_ADJUST);
4346   lea(ary1, Address(ary1, 2));
4347 
4348   bind(COMPARE_BYTE);
4349   testl(result, 0x1);   // tail  byte
4350   jccb(Assembler::zero, DONE);
4351   load_unsigned_byte(tmp1, Address(ary1, 0));
4352   testl(tmp1, 0x00000080);
4353   jccb(Assembler::zero, DONE);
4354   subptr(result, 1);
4355   jmpb(DONE);
4356 
4357   bind(TAIL_ADJUST);
4358   // There are negative bits in the last 4-byte block.
4359   // Adjust the result and check the next three bytes.
4360   addptr(result, len);
4361   orl(result, 3);
4362   lea(ary1, Address(ary1, len, Address::times_1));
4363   jmpb(COMPARE_CHAR);
4364 
4365   bind(CHAR_ADJUST);
4366   // We are looking at a char + optional byte tail, and found that one
4367   // of the bytes in the char is negative. Adjust the result, check the
4368   // first byte and readjust if needed.
4369   andl(result, 0xfffffffc);
4370   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4371   jccb(Assembler::notZero, DONE);
4372   addptr(result, 1);
4373 
4374   // That's it
4375   bind(DONE);
4376   if (UseAVX >= 2 && UseSSE >= 2) {
4377     // clean upper bits of YMM registers
4378     vpxor(vec1, vec1);
4379     vpxor(vec2, vec2);
4380   }
4381 }
4382 
4383 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4384 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4385                                       Register limit, Register result, Register chr,
4386                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4387                                       KRegister mask, bool expand_ary2) {
4388   // for expand_ary2, limit is the (smaller) size of the second array.
4389   ShortBranchVerifier sbv(this);
4390   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4391 
4392   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4393          "Expansion only implemented for AVX2");
4394 
4395   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4396   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4397 
4398   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4399   int scaleIncr = expand_ary2 ? 8 : 16;
4400 
4401   if (is_array_equ) {
4402     // Check the input args
4403     cmpoop(ary1, ary2);
4404     jcc(Assembler::equal, TRUE_LABEL);
4405 
4406     // Need additional checks for arrays_equals.
4407     testptr(ary1, ary1);
4408     jcc(Assembler::zero, FALSE_LABEL);
4409     testptr(ary2, ary2);
4410     jcc(Assembler::zero, FALSE_LABEL);
4411 
4412     // Check the lengths
4413     movl(limit, Address(ary1, length_offset));
4414     cmpl(limit, Address(ary2, length_offset));
4415     jcc(Assembler::notEqual, FALSE_LABEL);
4416   }
4417 
4418   // count == 0
4419   testl(limit, limit);
4420   jcc(Assembler::zero, TRUE_LABEL);
4421 
4422   if (is_array_equ) {
4423     // Load array address
4424     lea(ary1, Address(ary1, base_offset));
4425     lea(ary2, Address(ary2, base_offset));
4426   }
4427 
4428   if (is_array_equ && is_char) {
4429     // arrays_equals when used for char[].
4430     shll(limit, 1);      // convert char count to byte count (still != 0)
4431   }
4432   movl(result, limit); // copy
4433 
4434   if (UseAVX >= 2) {
4435     // With AVX2, use 32-byte vector compare
4436     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4437 
4438     // Compare 32-byte vectors
4439     if (expand_ary2) {
4440       andl(result, 0x0000000f);  //   tail count (in bytes)
4441       andl(limit, 0xfffffff0);   // vector count (in bytes)
4442       jcc(Assembler::zero, COMPARE_TAIL);
4443     } else {
4444       andl(result, 0x0000001f);  //   tail count (in bytes)
4445       andl(limit, 0xffffffe0);   // vector count (in bytes)
4446       jcc(Assembler::zero, COMPARE_TAIL_16);
4447     }
4448 
4449     lea(ary1, Address(ary1, limit, scaleFactor));
4450     lea(ary2, Address(ary2, limit, Address::times_1));
4451     negptr(limit);
4452 
4453 #ifdef _LP64
4454     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4455       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4456 
4457       cmpl(limit, -64);
4458       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4459 
4460       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4461 
4462       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4463       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4464       kortestql(mask, mask);
4465       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4466       addptr(limit, 64);  // update since we already compared at this addr
4467       cmpl(limit, -64);
4468       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4469 
4470       // At this point we may still need to compare -limit+result bytes.
4471       // We could execute the next two instructions and just continue via the non-wide path:
4472       //  cmpl(limit, 0);
4473       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4474       // But since we stopped at the points ary{1,2}+limit which are
4475       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4476       // (|limit| <= 32 and result < 32),
4477       // we may just compare the last 64 bytes.
4478       //
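      // Concrete example (illustrative): with limit == -32 and result == 20, the 52 bytes
      // not yet compared all lie within the final 64 bytes ending at ary{1,2} + result, so
      // the single overlapping 64-byte compare below covers them; re-checking a few
      // already-verified bytes is harmless.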
4479       addptr(result, -64);   // it is safe, because we just came from this area
4480       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4481       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4482       kortestql(mask, mask);
4483       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4484 
4485       jmp(TRUE_LABEL);
4486 
4487       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4488 
4489     }//if (VM_Version::supports_avx512vlbw())
4490 #endif //_LP64
4491     bind(COMPARE_WIDE_VECTORS);
4492     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4493     if (expand_ary2) {
4494       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4495     } else {
4496       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4497     }
4498     vpxor(vec1, vec2);
4499 
4500     vptest(vec1, vec1);
4501     jcc(Assembler::notZero, FALSE_LABEL);
4502     addptr(limit, scaleIncr * 2);
4503     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4504 
4505     testl(result, result);
4506     jcc(Assembler::zero, TRUE_LABEL);
4507 
4508     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4509     if (expand_ary2) {
4510       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4511     } else {
4512       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4513     }
4514     vpxor(vec1, vec2);
4515 
4516     vptest(vec1, vec1);
4517     jcc(Assembler::notZero, FALSE_LABEL);
4518     jmp(TRUE_LABEL);
4519 
4520     bind(COMPARE_TAIL_16); // limit is zero
4521     movl(limit, result);
4522 
4523     // Compare 16-byte chunks
4524     andl(result, 0x0000000f);  //   tail count (in bytes)
4525     andl(limit, 0xfffffff0);   // vector count (in bytes)
4526     jcc(Assembler::zero, COMPARE_TAIL);
4527 
4528     lea(ary1, Address(ary1, limit, scaleFactor));
4529     lea(ary2, Address(ary2, limit, Address::times_1));
4530     negptr(limit);
4531 
4532     bind(COMPARE_WIDE_VECTORS_16);
4533     movdqu(vec1, Address(ary1, limit, scaleFactor));
4534     if (expand_ary2) {
4535       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4536     } else {
4537       movdqu(vec2, Address(ary2, limit, Address::times_1));
4538     }
4539     pxor(vec1, vec2);
4540 
4541     ptest(vec1, vec1);
4542     jcc(Assembler::notZero, FALSE_LABEL);
4543     addptr(limit, scaleIncr);
4544     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4545 
4546     bind(COMPARE_TAIL); // limit is zero
4547     movl(limit, result);
4548     // Fallthru to tail compare
4549   } else if (UseSSE42Intrinsics) {
4550     // With SSE4.2, use double quad vector compare
4551     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4552 
4553     // Compare 16-byte vectors
4554     andl(result, 0x0000000f);  //   tail count (in bytes)
4555     andl(limit, 0xfffffff0);   // vector count (in bytes)
4556     jcc(Assembler::zero, COMPARE_TAIL);
4557 
4558     lea(ary1, Address(ary1, limit, Address::times_1));
4559     lea(ary2, Address(ary2, limit, Address::times_1));
4560     negptr(limit);
4561 
4562     bind(COMPARE_WIDE_VECTORS);
4563     movdqu(vec1, Address(ary1, limit, Address::times_1));
4564     movdqu(vec2, Address(ary2, limit, Address::times_1));
4565     pxor(vec1, vec2);
4566 
4567     ptest(vec1, vec1);
4568     jcc(Assembler::notZero, FALSE_LABEL);
4569     addptr(limit, 16);
4570     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4571 
4572     testl(result, result);
4573     jcc(Assembler::zero, TRUE_LABEL);
4574 
4575     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4576     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4577     pxor(vec1, vec2);
4578 
4579     ptest(vec1, vec1);
4580     jccb(Assembler::notZero, FALSE_LABEL);
4581     jmpb(TRUE_LABEL);
4582 
4583     bind(COMPARE_TAIL); // limit is zero
4584     movl(limit, result);
4585     // Fallthru to tail compare
4586   }
4587 
4588   // Compare 4-byte vectors
4589   if (expand_ary2) {
4590     testl(result, result);
4591     jccb(Assembler::zero, TRUE_LABEL);
4592   } else {
4593     andl(limit, 0xfffffffc); // vector count (in bytes)
4594     jccb(Assembler::zero, COMPARE_CHAR);
4595   }
4596 
4597   lea(ary1, Address(ary1, limit, scaleFactor));
4598   lea(ary2, Address(ary2, limit, Address::times_1));
4599   negptr(limit);
4600 
4601   bind(COMPARE_VECTORS);
4602   if (expand_ary2) {
4603     // There are no "vector" operations for bytes to shorts
4604     movzbl(chr, Address(ary2, limit, Address::times_1));
4605     cmpw(Address(ary1, limit, Address::times_2), chr);
4606     jccb(Assembler::notEqual, FALSE_LABEL);
4607     addptr(limit, 1);
4608     jcc(Assembler::notZero, COMPARE_VECTORS);
4609     jmp(TRUE_LABEL);
4610   } else {
4611     movl(chr, Address(ary1, limit, Address::times_1));
4612     cmpl(chr, Address(ary2, limit, Address::times_1));
4613     jccb(Assembler::notEqual, FALSE_LABEL);
4614     addptr(limit, 4);
4615     jcc(Assembler::notZero, COMPARE_VECTORS);
4616   }
4617 
4618   // Compare trailing char (final 2 bytes), if any
4619   bind(COMPARE_CHAR);
4620   testl(result, 0x2);   // tail  char
4621   jccb(Assembler::zero, COMPARE_BYTE);
4622   load_unsigned_short(chr, Address(ary1, 0));
4623   load_unsigned_short(limit, Address(ary2, 0));
4624   cmpl(chr, limit);
4625   jccb(Assembler::notEqual, FALSE_LABEL);
4626 
4627   if (is_array_equ && is_char) {
4628     bind(COMPARE_BYTE);
4629   } else {
4630     lea(ary1, Address(ary1, 2));
4631     lea(ary2, Address(ary2, 2));
4632 
4633     bind(COMPARE_BYTE);
4634     testl(result, 0x1);   // tail  byte
4635     jccb(Assembler::zero, TRUE_LABEL);
4636     load_unsigned_byte(chr, Address(ary1, 0));
4637     load_unsigned_byte(limit, Address(ary2, 0));
4638     cmpl(chr, limit);
4639     jccb(Assembler::notEqual, FALSE_LABEL);
4640   }
4641   bind(TRUE_LABEL);
4642   movl(result, 1);   // return true
4643   jmpb(DONE);
4644 
4645   bind(FALSE_LABEL);
4646   xorl(result, result); // return false
4647 
4648   // That's it
4649   bind(DONE);
4650   if (UseAVX >= 2) {
4651     // clean upper bits of YMM registers
4652     vpxor(vec1, vec1);
4653     vpxor(vec2, vec2);
4654   }
4655 }
4656 
4657 #ifdef _LP64
4658 
4659 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4660 #define __ masm.
4661   Register dst = stub.data<0>();
4662   XMMRegister src = stub.data<1>();
4663   address target = stub.data<2>();
4664   __ bind(stub.entry());
4665   __ subptr(rsp, 8);
4666   __ movdbl(Address(rsp), src);
4667   __ call(RuntimeAddress(target));
4668   __ pop(dst);
4669   __ jmp(stub.continuation());
4670 #undef __
4671 }
4672 
4673 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4674   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4675   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4676 
4677   address slowpath_target;
4678   if (dst_bt == T_INT) {
4679     if (src_bt == T_FLOAT) {
4680       cvttss2sil(dst, src);
4681       cmpl(dst, 0x80000000);
4682       slowpath_target = StubRoutines::x86::f2i_fixup();
4683     } else {
4684       cvttsd2sil(dst, src);
4685       cmpl(dst, 0x80000000);
4686       slowpath_target = StubRoutines::x86::d2i_fixup();
4687     }
4688   } else {
4689     if (src_bt == T_FLOAT) {
4690       cvttss2siq(dst, src);
4691       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4692       slowpath_target = StubRoutines::x86::f2l_fixup();
4693     } else {
4694       cvttsd2siq(dst, src);
4695       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4696       slowpath_target = StubRoutines::x86::d2l_fixup();
4697     }
4698   }
4699 
4700   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4701   jcc(Assembler::equal, stub->entry());
4702   bind(stub->continuation());
4703 }
4704 
4705 #endif // _LP64
4706 
4707 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4708                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4709   switch(ideal_opc) {
4710     case Op_LShiftVS:
4711       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4712     case Op_LShiftVI:
4713       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4714     case Op_LShiftVL:
4715       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4716     case Op_RShiftVS:
4717       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4718     case Op_RShiftVI:
4719       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4720     case Op_RShiftVL:
4721       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4722     case Op_URShiftVS:
4723       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4724     case Op_URShiftVI:
4725       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4726     case Op_URShiftVL:
4727       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4728     case Op_RotateRightV:
4729       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4730     case Op_RotateLeftV:
4731       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4732     default:
4733       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4734       break;
4735   }
4736 }
4737 
4738 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4739                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4740   if (is_unsigned) {
4741     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4742   } else {
4743     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4744   }
4745 }
4746 
4747 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4748                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4749   switch (elem_bt) {
4750     case T_BYTE:
4751       if (ideal_opc == Op_SaturatingAddV) {
4752         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4753       } else {
4754         assert(ideal_opc == Op_SaturatingSubV, "");
4755         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4756       }
4757       break;
4758     case T_SHORT:
4759       if (ideal_opc == Op_SaturatingAddV) {
4760         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4761       } else {
4762         assert(ideal_opc == Op_SaturatingSubV, "");
4763         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4764       }
4765       break;
4766     default:
4767       fatal("Unsupported type %s", type2name(elem_bt));
4768       break;
4769   }
4770 }
4771 
4772 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4773                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4774   switch (elem_bt) {
4775     case T_BYTE:
4776       if (ideal_opc == Op_SaturatingAddV) {
4777         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4778       } else {
4779         assert(ideal_opc == Op_SaturatingSubV, "");
4780         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4781       }
4782       break;
4783     case T_SHORT:
4784       if (ideal_opc == Op_SaturatingAddV) {
4785         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4786       } else {
4787         assert(ideal_opc == Op_SaturatingSubV, "");
4788         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4789       }
4790       break;
4791     default:
4792       fatal("Unsupported type %s", type2name(elem_bt));
4793       break;
4794   }
4795 }
4796 
4797 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4798                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4799   if (is_unsigned) {
4800     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4801   } else {
4802     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4803   }
4804 }
4805 
4806 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4807                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4808   switch (elem_bt) {
4809     case T_BYTE:
4810       if (ideal_opc == Op_SaturatingAddV) {
4811         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4812       } else {
4813         assert(ideal_opc == Op_SaturatingSubV, "");
4814         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4815       }
4816       break;
4817     case T_SHORT:
4818       if (ideal_opc == Op_SaturatingAddV) {
4819         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4820       } else {
4821         assert(ideal_opc == Op_SaturatingSubV, "");
4822         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4823       }
4824       break;
4825     default:
4826       fatal("Unsupported type %s", type2name(elem_bt));
4827       break;
4828   }
4829 }
4830 
4831 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4832                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4833   switch (elem_bt) {
4834     case T_BYTE:
4835       if (ideal_opc == Op_SaturatingAddV) {
4836         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4837       } else {
4838         assert(ideal_opc == Op_SaturatingSubV, "");
4839         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4840       }
4841       break;
4842     case T_SHORT:
4843       if (ideal_opc == Op_SaturatingAddV) {
4844         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4845       } else {
4846         assert(ideal_opc == Op_SaturatingSubV, "");
4847         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4848       }
4849       break;
4850     default:
4851       fatal("Unsupported type %s", type2name(elem_bt));
4852       break;
4853   }
4854 }
4855 
4856 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4857                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4858                                     bool is_varshift) {
4859   switch (ideal_opc) {
4860     case Op_AddVB:
4861       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_AddVS:
4863       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_AddVI:
4865       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_AddVL:
4867       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_AddVF:
4869       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_AddVD:
4871       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_SubVB:
4873       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_SubVS:
4875       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_SubVI:
4877       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_SubVL:
4879       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_SubVF:
4881       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_SubVD:
4883       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_MulVS:
4885       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_MulVI:
4887       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_MulVL:
4889       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_MulVF:
4891       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_MulVD:
4893       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4894     case Op_DivVF:
4895       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4896     case Op_DivVD:
4897       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4898     case Op_SqrtVF:
4899       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_SqrtVD:
4901       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4902     case Op_AbsVB:
4903       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4904     case Op_AbsVS:
4905       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4906     case Op_AbsVI:
4907       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4908     case Op_AbsVL:
4909       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4910     case Op_FmaVF:
4911       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_FmaVD:
4913       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_VectorRearrange:
4915       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4916     case Op_LShiftVS:
4917       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4918     case Op_LShiftVI:
4919       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4920     case Op_LShiftVL:
4921       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4922     case Op_RShiftVS:
4923       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4924     case Op_RShiftVI:
4925       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4926     case Op_RShiftVL:
4927       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4928     case Op_URShiftVS:
4929       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4930     case Op_URShiftVI:
4931       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4932     case Op_URShiftVL:
4933       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4934     case Op_RotateLeftV:
4935       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4936     case Op_RotateRightV:
4937       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4938     case Op_MaxV:
4939       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4940     case Op_MinV:
4941       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4942     case Op_UMinV:
4943       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4944     case Op_UMaxV:
4945       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4946     case Op_XorV:
4947       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4948     case Op_OrV:
4949       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4950     case Op_AndV:
4951       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4952     default:
4953       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4954       break;
4955   }
4956 }
4957 
4958 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4959                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4960   switch (ideal_opc) {
4961     case Op_AddVB:
4962       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4963     case Op_AddVS:
4964       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4965     case Op_AddVI:
4966       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4967     case Op_AddVL:
4968       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4969     case Op_AddVF:
4970       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4971     case Op_AddVD:
4972       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4973     case Op_SubVB:
4974       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4975     case Op_SubVS:
4976       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4977     case Op_SubVI:
4978       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4979     case Op_SubVL:
4980       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4981     case Op_SubVF:
4982       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4983     case Op_SubVD:
4984       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4985     case Op_MulVS:
4986       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4987     case Op_MulVI:
4988       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4989     case Op_MulVL:
4990       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4991     case Op_MulVF:
4992       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4993     case Op_MulVD:
4994       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4995     case Op_DivVF:
4996       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4997     case Op_DivVD:
4998       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4999     case Op_FmaVF:
5000       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
5001     case Op_FmaVD:
5002       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
5003     case Op_MaxV:
5004       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5005     case Op_MinV:
5006       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5007     case Op_UMaxV:
5008       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5009     case Op_UMinV:
5010       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5011     case Op_XorV:
5012       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5013     case Op_OrV:
5014       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5015     case Op_AndV:
5016       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5017     default:
5018       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
5019       break;
5020   }
5021 }
5022 
5023 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5024                                   KRegister src1, KRegister src2) {
5025   BasicType etype = T_ILLEGAL;
5026   switch(mask_len) {
5027     case 2:
5028     case 4:
5029     case 8:  etype = T_BYTE; break;
5030     case 16: etype = T_SHORT; break;
5031     case 32: etype = T_INT; break;
5032     case 64: etype = T_LONG; break;
5033     default: fatal("Unsupported type"); break;
5034   }
5035   assert(etype != T_ILLEGAL, "");
5036   switch(ideal_opc) {
5037     case Op_AndVMask:
5038       kand(etype, dst, src1, src2); break;
5039     case Op_OrVMask:
5040       kor(etype, dst, src1, src2); break;
5041     case Op_XorVMask:
5042       kxor(etype, dst, src1, src2); break;
5043     default:
5044       fatal("Unsupported masked operation"); break;
5045   }
5046 }
5047 
5048 /*
5049  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5050  * If src is NaN, the result is 0.
5051  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5052  * the result is equal to the value of Integer.MIN_VALUE.
5053  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5054  * the result is equal to the value of Integer.MAX_VALUE.
5055  */
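/*
 * Illustrative examples of the resulting mapping (matching Java's (int) cast semantics):
 *   NaN                ->  0
 *   NEGATIVE_INFINITY  ->  Integer.MIN_VALUE
 *   POSITIVE_INFINITY  ->  Integer.MAX_VALUE
 *   3.0e9f             ->  Integer.MAX_VALUE   (since 3.0e9 > Integer.MAX_VALUE)
 */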
5056 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5057                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5058                                                                    Register rscratch, AddressLiteral float_sign_flip,
5059                                                                    int vec_enc) {
5060   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5061   Label done;
5062   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5063   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5064   vptest(xtmp2, xtmp2, vec_enc);
5065   jccb(Assembler::equal, done);
5066 
5067   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5068   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5069 
5070   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5071   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5072   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5073 
5074   // Recompute the mask for the remaining special values.
5075   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5076   // Extract SRC values corresponding to TRUE mask lanes.
5077   vpand(xtmp4, xtmp2, src, vec_enc);
5078   // Flip the mask bits so that the MSB of the MASK lanes corresponding to +ve special
5079   // values is set.
5080   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5081 
5082   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5083   bind(done);
5084 }
5085 
5086 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5087                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5088                                                                     Register rscratch, AddressLiteral float_sign_flip,
5089                                                                     int vec_enc) {
5090   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5091   Label done;
5092   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5093   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5094   kortestwl(ktmp1, ktmp1);
5095   jccb(Assembler::equal, done);
5096 
5097   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5098   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5099   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5100 
5101   kxorwl(ktmp1, ktmp1, ktmp2);
5102   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5103   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5104   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5105   bind(done);
5106 }
5107 
5108 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5109                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5110                                                                      Register rscratch, AddressLiteral double_sign_flip,
5111                                                                      int vec_enc) {
5112   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5113 
5114   Label done;
5115   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5116   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5117   kortestwl(ktmp1, ktmp1);
5118   jccb(Assembler::equal, done);
5119 
5120   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5121   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5122   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5123 
5124   kxorwl(ktmp1, ktmp1, ktmp2);
5125   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5126   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5127   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5128   bind(done);
5129 }
5130 
5131 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5132                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5133                                                                      Register rscratch, AddressLiteral float_sign_flip,
5134                                                                      int vec_enc) {
5135   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5136   Label done;
5137   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5138   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5139   kortestwl(ktmp1, ktmp1);
5140   jccb(Assembler::equal, done);
5141 
5142   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5143   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5144   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5145 
5146   kxorwl(ktmp1, ktmp1, ktmp2);
5147   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5148   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5149   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5150   bind(done);
5151 }
5152 
5153 /*
5154  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5155  * If src is NaN, the result is 0.
5156  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5157  * the result is equal to the value of Long.MIN_VALUE.
5158  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5159  * the result is equal to the value of Long.MAX_VALUE.
5160  */
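/*
 * Illustrative examples of the resulting mapping (matching Java's (long) cast semantics):
 *   NaN                ->  0L
 *   NEGATIVE_INFINITY  ->  Long.MIN_VALUE
 *   1.0e19             ->  Long.MAX_VALUE   (since 1.0e19 > Long.MAX_VALUE)
 */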
5161 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5162                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5163                                                                       Register rscratch, AddressLiteral double_sign_flip,
5164                                                                       int vec_enc) {
5165   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5166 
5167   Label done;
5168   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5169   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5170   kortestwl(ktmp1, ktmp1);
5171   jccb(Assembler::equal, done);
5172 
5173   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5174   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5175   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5176 
5177   kxorwl(ktmp1, ktmp1, ktmp2);
5178   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5179   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5180   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5181   bind(done);
5182 }
5183 
5184 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5185                                                              XMMRegister xtmp, int index, int vec_enc) {
5186   assert(vec_enc < Assembler::AVX_512bit, "");
5187   if (vec_enc == Assembler::AVX_256bit) {
5188     vextractf128_high(xtmp, src);
5189     vshufps(dst, src, xtmp, index, vec_enc);
5190   } else {
5191     vshufps(dst, src, zero, index, vec_enc);
5192   }
5193 }
5194 
5195 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5196                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5197                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5198   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5199 
5200   Label done;
5201   // Compare the destination lanes with float_sign_flip
5202   // value to get mask for all special values.
5203   movdqu(xtmp1, float_sign_flip, rscratch);
5204   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5205   ptest(xtmp2, xtmp2);
5206   jccb(Assembler::equal, done);
5207 
5208   // Flip float_sign_flip to get max integer value.
5209   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5210   pxor(xtmp1, xtmp4);
5211 
5212   // Set destination lanes corresponding to unordered source lanes to zero.
5213   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5214   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5215 
5216   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5217   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5218   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5219 
5220   // Recompute the mask for the remaining special values.
5221   pxor(xtmp2, xtmp3);
5222   // Extract mask corresponding to non-negative source lanes.
5223   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5224 
5225   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5226   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5227   pand(xtmp3, xtmp2);
5228 
5229   // Replace destination lanes holding special value(0x80000000) with max int
5230   // if corresponding source lane holds a +ve value.
5231   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5232   bind(done);
5233 }
5234
5236 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5237                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5238   switch(to_elem_bt) {
5239     case T_SHORT:
5240       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5241       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5242       vpackusdw(dst, dst, zero, vec_enc);
5243       if (vec_enc == Assembler::AVX_256bit) {
5244         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5245       }
5246       break;
5247     case  T_BYTE:
5248       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5249       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5250       vpackusdw(dst, dst, zero, vec_enc);
5251       if (vec_enc == Assembler::AVX_256bit) {
5252         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5253       }
5254       vpackuswb(dst, dst, zero, vec_enc);
5255       break;
5256     default: assert(false, "%s", type2name(to_elem_bt));
5257   }
5258 }
5259 
5260 /*
5261  * Algorithm for vector D2L and F2I conversions:-
5262  * a) Perform vector D2L/F2I cast.
5263  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value,
5264  *    which signifies that the source value could be any of the special floating point
5265  *    values (NaN, -Inf, Inf, Max, -Min).
5266  * c) Set destination to zero if source is NaN value.
5267  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5268  */
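// Scalar sketch of the same fix-up for the float -> int case (illustrative only; the
// vector routines below apply it lane-wise using compare masks and blends):
//   int r = cvtt(f);                       // hardware returns 0x80000000 for special inputs
//   if (r == 0x80000000) {                 // slow path, taken only for special values
//     if (f != f)       r = 0;                   // NaN
//     else if (f > 0.0) r = Integer.MAX_VALUE;   // +Inf or too-large value
//     // else: -Inf or too-small value, keep Integer.MIN_VALUE
//   }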
5269 
5270 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5271                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5272                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5273   int to_elem_sz = type2aelembytes(to_elem_bt);
5274   assert(to_elem_sz <= 4, "");
5275   vcvttps2dq(dst, src, vec_enc);
5276   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5277   if (to_elem_sz < 4) {
5278     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5279     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5280   }
5281 }
5282 
5283 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5284                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5285                                             Register rscratch, int vec_enc) {
5286   int to_elem_sz = type2aelembytes(to_elem_bt);
5287   assert(to_elem_sz <= 4, "");
5288   vcvttps2dq(dst, src, vec_enc);
5289   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5290   switch(to_elem_bt) {
5291     case T_INT:
5292       break;
5293     case T_SHORT:
5294       evpmovdw(dst, dst, vec_enc);
5295       break;
5296     case T_BYTE:
5297       evpmovdb(dst, dst, vec_enc);
5298       break;
5299     default: assert(false, "%s", type2name(to_elem_bt));
5300   }
5301 }
5302 
5303 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5304                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5305                                             Register rscratch, int vec_enc) {
5306   evcvttps2qq(dst, src, vec_enc);
5307   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5308 }
5309 
5310 // Handling for downcasting from double to integer or sub-word types on AVX2.
5311 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5312                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5313                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5314   int to_elem_sz = type2aelembytes(to_elem_bt);
5315   assert(to_elem_sz < 8, "");
5316   vcvttpd2dq(dst, src, vec_enc);
5317   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5318                                               float_sign_flip, vec_enc);
5319   if (to_elem_sz < 4) {
5320     // xtmp4 holds all zero lanes.
5321     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5322   }
5323 }
5324 
5325 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5326                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5327                                             KRegister ktmp2, AddressLiteral sign_flip,
5328                                             Register rscratch, int vec_enc) {
5329   if (VM_Version::supports_avx512dq()) {
5330     evcvttpd2qq(dst, src, vec_enc);
5331     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5332     switch(to_elem_bt) {
5333       case T_LONG:
5334         break;
5335       case T_INT:
5336         evpmovsqd(dst, dst, vec_enc);
5337         break;
5338       case T_SHORT:
5339         evpmovsqd(dst, dst, vec_enc);
5340         evpmovdw(dst, dst, vec_enc);
5341         break;
5342       case T_BYTE:
5343         evpmovsqd(dst, dst, vec_enc);
5344         evpmovdb(dst, dst, vec_enc);
5345         break;
5346       default: assert(false, "%s", type2name(to_elem_bt));
5347     }
5348   } else {
5349     assert(type2aelembytes(to_elem_bt) <= 4, "");
5350     vcvttpd2dq(dst, src, vec_enc);
5351     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5352     switch(to_elem_bt) {
5353       case T_INT:
5354         break;
5355       case T_SHORT:
5356         evpmovdw(dst, dst, vec_enc);
5357         break;
5358       case T_BYTE:
5359         evpmovdb(dst, dst, vec_enc);
5360         break;
5361       default: assert(false, "%s", type2name(to_elem_bt));
5362     }
5363   }
5364 }
5365 
5366 #ifdef _LP64
5367 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5368                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5369                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5370   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5371   // and restore the original MXCSR.RC mode afterwards.
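  // Example (illustrative): round(2.5) becomes floor(2.5 + 0.5) = floor(3.0) = 3, and
  // round(-2.5) becomes floor(-2.0) = -2, matching Math.round's floor(x + 0.5) contract.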
5372   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5373 
5374   mov64(tmp, julong_cast(0.5L));
5375   evpbroadcastq(xtmp1, tmp, vec_enc);
5376   vaddpd(xtmp1, src , xtmp1, vec_enc);
5377   evcvtpd2qq(dst, xtmp1, vec_enc);
5378   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5379                                                 double_sign_flip, vec_enc);
5380 
5381   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5382 }
5383 
5384 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5385                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5386                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5387   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5388   // and restore the original MXCSR.RC mode afterwards.
5389   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5390 
5391   movl(tmp, jint_cast(0.5));
5392   movq(xtmp1, tmp);
5393   vbroadcastss(xtmp1, xtmp1, vec_enc);
5394   vaddps(xtmp1, src , xtmp1, vec_enc);
5395   vcvtps2dq(dst, xtmp1, vec_enc);
5396   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5397                                               float_sign_flip, vec_enc);
5398 
5399   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5400 }
5401 
5402 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5403                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5404                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5405   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5406   // and restore the original MXCSR.RC mode afterwards.
5407   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5408 
5409   movl(tmp, jint_cast(0.5));
5410   movq(xtmp1, tmp);
5411   vbroadcastss(xtmp1, xtmp1, vec_enc);
5412   vaddps(xtmp1, src , xtmp1, vec_enc);
5413   vcvtps2dq(dst, xtmp1, vec_enc);
5414   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5415 
5416   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5417 }
5418 #endif // _LP64
5419 
5420 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5421                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5422   switch (from_elem_bt) {
5423     case T_BYTE:
5424       switch (to_elem_bt) {
5425         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5426         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5427         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5428         default: ShouldNotReachHere();
5429       }
5430       break;
5431     case T_SHORT:
5432       switch (to_elem_bt) {
5433         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5434         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5435         default: ShouldNotReachHere();
5436       }
5437       break;
5438     case T_INT:
5439       assert(to_elem_bt == T_LONG, "");
5440       vpmovzxdq(dst, src, vlen_enc);
5441       break;
5442     default:
5443       ShouldNotReachHere();
5444   }
5445 }
5446 
5447 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5448                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5449   switch (from_elem_bt) {
5450     case T_BYTE:
5451       switch (to_elem_bt) {
5452         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5453         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5454         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5455         default: ShouldNotReachHere();
5456       }
5457       break;
5458     case T_SHORT:
5459       switch (to_elem_bt) {
5460         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5461         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5462         default: ShouldNotReachHere();
5463       }
5464       break;
5465     case T_INT:
5466       assert(to_elem_bt == T_LONG, "");
5467       vpmovsxdq(dst, src, vlen_enc);
5468       break;
5469     default:
5470       ShouldNotReachHere();
5471   }
5472 }
5473 
5474 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5475                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5476   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5477   assert(vlen_enc != AVX_512bit, "");
5478 
5479   int dst_bt_size = type2aelembytes(dst_bt);
5480   int src_bt_size = type2aelembytes(src_bt);
5481   if (dst_bt_size > src_bt_size) {
5482     switch (dst_bt_size / src_bt_size) {
5483       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5484       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5485       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5486       default: ShouldNotReachHere();
5487     }
5488   } else {
5489     assert(dst_bt_size < src_bt_size, "");
5490     switch (src_bt_size / dst_bt_size) {
5491       case 2: {
5492         if (vlen_enc == AVX_128bit) {
5493           vpacksswb(dst, src, src, vlen_enc);
5494         } else {
5495           vpacksswb(dst, src, src, vlen_enc);
5496           vpermq(dst, dst, 0x08, vlen_enc);
5497         }
5498         break;
5499       }
5500       case 4: {
5501         if (vlen_enc == AVX_128bit) {
5502           vpackssdw(dst, src, src, vlen_enc);
5503           vpacksswb(dst, dst, dst, vlen_enc);
5504         } else {
5505           vpackssdw(dst, src, src, vlen_enc);
5506           vpermq(dst, dst, 0x08, vlen_enc);
5507           vpacksswb(dst, dst, dst, AVX_128bit);
5508         }
5509         break;
5510       }
5511       case 8: {
5512         if (vlen_enc == AVX_128bit) {
5513           vpshufd(dst, src, 0x08, vlen_enc);
5514           vpackssdw(dst, dst, dst, vlen_enc);
5515           vpacksswb(dst, dst, dst, vlen_enc);
5516         } else {
5517           vpshufd(dst, src, 0x08, vlen_enc);
5518           vpermq(dst, dst, 0x08, vlen_enc);
5519           vpackssdw(dst, dst, dst, AVX_128bit);
5520           vpacksswb(dst, dst, dst, AVX_128bit);
5521         }
5522         break;
5523       }
5524       default: ShouldNotReachHere();
5525     }
5526   }
5527 }
5528 
5529 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5530                                    bool merge, BasicType bt, int vlen_enc) {
5531   if (bt == T_INT) {
5532     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5533   } else {
5534     assert(bt == T_LONG, "");
5535     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5536   }
5537 }
5538 
5539 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5540                                    bool merge, BasicType bt, int vlen_enc) {
5541   if (bt == T_INT) {
5542     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5543   } else {
5544     assert(bt == T_LONG, "");
5545     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5546   }
5547 }
5548 
5549 #ifdef _LP64
5550 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5551                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5552                                                int vec_enc) {
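  // Scatter one mask bit per byte: pdep with mask 0x0101010101010101 deposits the low
  // 8 bits of src into the least significant bit of each byte, e.g. (illustrative)
  // src = 0b10110001 -> rtmp1 = 0x0100010100000001.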
5553   int index = 0;
5554   int vindex = 0;
5555   mov64(rtmp1, 0x0101010101010101L);
5556   pdepq(rtmp1, src, rtmp1);
5557   if (mask_len > 8) {
5558     movq(rtmp2, src);
5559     vpxor(xtmp, xtmp, xtmp, vec_enc);
5560     movq(xtmp, rtmp1);
5561   }
5562   movq(dst, rtmp1);
5563 
5564   mask_len -= 8;
5565   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5567     index++;
5568     if ((index % 2) == 0) {
5569       pxor(xtmp, xtmp);
5570     }
5571     mov64(rtmp1, 0x0101010101010101L);
5572     shrq(rtmp2, 8);
5573     pdepq(rtmp1, rtmp2, rtmp1);
5574     pinsrq(xtmp, rtmp1, index % 2);
5575     vindex = index / 2;
5576     if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5579       if (index % 2) {
5580         vinsertf128(dst, dst, xtmp, vindex);
5581       }
5582     } else {
5583       vmovdqu(dst, xtmp);
5584     }
5585     mask_len -= 8;
5586   }
5587 }
5588 
5589 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5590   switch(opc) {
5591     case Op_VectorMaskTrueCount:
5592       popcntq(dst, tmp);
5593       break;
5594     case Op_VectorMaskLastTrue:
5595       if (VM_Version::supports_lzcnt()) {
5596         lzcntq(tmp, tmp);
5597         movl(dst, 63);
5598         subl(dst, tmp);
5599       } else {
5600         movl(dst, -1);
5601         bsrq(tmp, tmp);
5602         cmov32(Assembler::notZero, dst, tmp);
5603       }
5604       break;
5605     case Op_VectorMaskFirstTrue:
5606       if (VM_Version::supports_bmi1()) {
5607         if (masklen < 32) {
5608           orl(tmp, 1 << masklen);
5609           tzcntl(dst, tmp);
5610         } else if (masklen == 32) {
5611           tzcntl(dst, tmp);
5612         } else {
5613           assert(masklen == 64, "");
5614           tzcntq(dst, tmp);
5615         }
5616       } else {
5617         if (masklen < 32) {
5618           orl(tmp, 1 << masklen);
5619           bsfl(dst, tmp);
5620         } else {
5621           assert(masklen == 32 || masklen == 64, "");
5622           movl(dst, masklen);
5623           if (masklen == 32)  {
5624             bsfl(tmp, tmp);
5625           } else {
5626             bsfq(tmp, tmp);
5627           }
5628           cmov32(Assembler::notZero, dst, tmp);
5629         }
5630       }
5631       break;
5632     case Op_VectorMaskToLong:
5633       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5634       break;
5635     default: assert(false, "Unhandled mask operation");
5636   }
5637 }
5638 
5639 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5640                                               int masklen, int masksize, int vec_enc) {
5641   assert(VM_Version::supports_popcnt(), "");
5642 
  if (VM_Version::supports_avx512bw()) {
5644     kmovql(tmp, mask);
5645   } else {
5646     assert(masklen <= 16, "");
5647     kmovwl(tmp, mask);
5648   }
5649 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5652   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5653     andq(tmp, (1 << masklen) - 1);
5654   }
5655 
5656   vector_mask_operation_helper(opc, dst, tmp, masklen);
5657 }
5658 
5659 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5660                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5661   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5662          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5663   assert(VM_Version::supports_popcnt(), "");
5664 
5665   bool need_clip = false;
5666   switch(bt) {
5667     case T_BOOLEAN:
      // Masks of other types contain lane values 0 or -1, whereas boolean masks contain lane values 0 or 1.
5669       vpxor(xtmp, xtmp, xtmp, vec_enc);
5670       vpsubb(xtmp, xtmp, mask, vec_enc);
5671       vpmovmskb(tmp, xtmp, vec_enc);
5672       need_clip = masklen < 16;
5673       break;
5674     case T_BYTE:
5675       vpmovmskb(tmp, mask, vec_enc);
5676       need_clip = masklen < 16;
5677       break;
5678     case T_SHORT:
5679       vpacksswb(xtmp, mask, mask, vec_enc);
5680       if (masklen >= 16) {
5681         vpermpd(xtmp, xtmp, 8, vec_enc);
5682       }
5683       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5684       need_clip = masklen < 16;
5685       break;
5686     case T_INT:
5687     case T_FLOAT:
5688       vmovmskps(tmp, mask, vec_enc);
5689       need_clip = masklen < 4;
5690       break;
5691     case T_LONG:
5692     case T_DOUBLE:
5693       vmovmskpd(tmp, mask, vec_enc);
5694       need_clip = masklen < 2;
5695       break;
5696     default: assert(false, "Unhandled type, %s", type2name(bt));
5697   }
5698 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5701   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5702     // need_clip implies masklen < 32
5703     andq(tmp, (1 << masklen) - 1);
5704   }
5705 
5706   vector_mask_operation_helper(opc, dst, tmp, masklen);
5707 }
5708 
5709 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5710                                              Register rtmp2, int mask_len) {
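  // Produce a mask with the lowest popcount(src) bits set: pext extracts one bit of
  // the all-ones value for every set bit of the clipped source mask, e.g.
  // (illustrative) src = 0b1010 -> dst = 0b0011.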
5711   kmov(rtmp1, src);
5712   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5713   mov64(rtmp2, -1L);
5714   pextq(rtmp2, rtmp2, rtmp1);
5715   kmov(dst, rtmp2);
5716 }
5717 
5718 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5719                                                     XMMRegister mask, Register rtmp, Register rscratch,
5720                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5721                                                     int vec_enc) {
5722   assert(type2aelembytes(bt) >= 4, "");
5723   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
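  // Reference semantics (illustrative, for n lanes):
  //   CompressV: k = 0; for (i = 0; i < n; i++) if (mask[i]) dst[k++] = src[i];
  //              the remaining lanes of dst are zeroed.
  //   ExpandV:   k = 0; for (i = 0; i < n; i++) dst[i] = mask[i] ? src[k++] : 0;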
5724   address compress_perm_table = nullptr;
5725   address expand_perm_table = nullptr;
5726   if (type2aelembytes(bt) == 8) {
5727     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5728     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5729     vmovmskpd(rtmp, mask, vec_enc);
5730   } else {
5731     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5732     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5733     vmovmskps(rtmp, mask, vec_enc);
5734   }
5735   shlq(rtmp, 5); // for 32 byte permute row.
5736   if (opcode == Op_CompressV) {
5737     lea(rscratch, ExternalAddress(compress_perm_table));
5738   } else {
5739     lea(rscratch, ExternalAddress(expand_perm_table));
5740   }
5741   addptr(rtmp, rscratch);
5742   vmovdqu(permv, Address(rtmp));
5743   vpermps(dst, permv, src, Assembler::AVX_256bit);
5744   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute row as the blend mask:
  // each column entry in a permute table row holds either a valid permute index
  // or a -1 (default) value, so the row can double as a blending mask after
  // compressing/expanding the source vector lanes.
5749   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5750 }
5751 
5752 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5753                                                bool merge, BasicType bt, int vec_enc) {
5754   if (opcode == Op_CompressV) {
5755     switch(bt) {
5756     case T_BYTE:
5757       evpcompressb(dst, mask, src, merge, vec_enc);
5758       break;
5759     case T_CHAR:
5760     case T_SHORT:
5761       evpcompressw(dst, mask, src, merge, vec_enc);
5762       break;
5763     case T_INT:
5764       evpcompressd(dst, mask, src, merge, vec_enc);
5765       break;
5766     case T_FLOAT:
5767       evcompressps(dst, mask, src, merge, vec_enc);
5768       break;
5769     case T_LONG:
5770       evpcompressq(dst, mask, src, merge, vec_enc);
5771       break;
5772     case T_DOUBLE:
5773       evcompresspd(dst, mask, src, merge, vec_enc);
5774       break;
5775     default:
5776       fatal("Unsupported type %s", type2name(bt));
5777       break;
5778     }
5779   } else {
5780     assert(opcode == Op_ExpandV, "");
5781     switch(bt) {
5782     case T_BYTE:
5783       evpexpandb(dst, mask, src, merge, vec_enc);
5784       break;
5785     case T_CHAR:
5786     case T_SHORT:
5787       evpexpandw(dst, mask, src, merge, vec_enc);
5788       break;
5789     case T_INT:
5790       evpexpandd(dst, mask, src, merge, vec_enc);
5791       break;
5792     case T_FLOAT:
5793       evexpandps(dst, mask, src, merge, vec_enc);
5794       break;
5795     case T_LONG:
5796       evpexpandq(dst, mask, src, merge, vec_enc);
5797       break;
5798     case T_DOUBLE:
5799       evexpandpd(dst, mask, src, merge, vec_enc);
5800       break;
5801     default:
5802       fatal("Unsupported type %s", type2name(bt));
5803       break;
5804     }
5805   }
5806 }
5807 #endif
5808 
5809 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5810                                            KRegister ktmp1, int vec_enc) {
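  // Scalar semantics (illustrative sketch):
  //   if (src is NaN, -0.0 or 0.0)  dst = src;
  //   else                          dst = (src < 0) ? -1.0 : 1.0;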
5811   if (opcode == Op_SignumVD) {
5812     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5814     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5815     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5817     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5818     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5819   } else {
5820     assert(opcode == Op_SignumVF, "");
5821     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5823     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5824     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // If src is NaN, -0.0 or 0.0, return src.
5826     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5827     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5828   }
5829 }
5830 
5831 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5832                                           XMMRegister xtmp1, int vec_enc) {
5833   if (opcode == Op_SignumVD) {
5834     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5836     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5838     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5839     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5840   } else {
5841     assert(opcode == Op_SignumVF, "");
5842     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5844     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // If src is NaN, -0.0 or 0.0, return src.
5846     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5847     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5848   }
5849 }
5850 
5851 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5852   if (VM_Version::supports_avx512bw()) {
5853     if (mask_len > 32) {
5854       kmovql(dst, src);
5855     } else {
5856       kmovdl(dst, src);
5857       if (mask_len != 32) {
5858         kshiftrdl(dst, dst, 32 - mask_len);
5859       }
5860     }
5861   } else {
5862     assert(mask_len <= 16, "");
5863     kmovwl(dst, src);
5864     if (mask_len != 16) {
5865       kshiftrwl(dst, dst, 16 - mask_len);
5866     }
5867   }
5868 }
5869 
5870 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5871   int lane_size = type2aelembytes(bt);
5872   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5873   if ((is_LP64 || lane_size < 8) &&
5874       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5875        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5876     movptr(rtmp, imm32);
5877     switch(lane_size) {
5878       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5879       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5880       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5881       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5882       fatal("Unsupported lane size %d", lane_size);
5883       break;
5884     }
5885   } else {
5886     movptr(rtmp, imm32);
5887     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5888     switch(lane_size) {
5889       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5890       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5891       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5892       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5893       fatal("Unsupported lane size %d", lane_size);
5894       break;
5895     }
5896   }
5897 }
5898 
5899 //
// The following lookup table based popcount computation algorithm is used:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5917 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5918 //     shuffle indices for lookup table access.
5919 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5921 //     shuffle indices for lookup table access.
5922 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5923 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5924 //     count of all the bytes of a quadword.
5925 //  f. Perform step e. for upper 128bit vector lane.
5926 //  g. Pack the bitset count of quadwords back to double word.
5927 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
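//
// Scalar sketch of the per-byte lookup (illustrative only; the vector code below
// performs the same table lookup with vpshufb):
//   static const uint8_t lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   uint8_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[b >> 4];
//   }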
5928 
5929 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5930                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5931   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5932   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5933   vpsrlw(dst, src, 4, vec_enc);
5934   vpand(dst, dst, xtmp1, vec_enc);
5935   vpand(xtmp1, src, xtmp1, vec_enc);
5936   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5937   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5938   vpshufb(dst, xtmp2, dst, vec_enc);
5939   vpaddb(dst, dst, xtmp1, vec_enc);
5940 }
5941 
5942 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5943                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5944   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5945   // Following code is as per steps e,f,g and h of above algorithm.
5946   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5947   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5948   vpsadbw(dst, dst, xtmp2, vec_enc);
5949   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5950   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5951   vpackuswb(dst, xtmp1, dst, vec_enc);
5952 }
5953 
5954 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5955                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5956   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5957   // Add the popcount of upper and lower bytes of word.
5958   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5959   vpsrlw(dst, xtmp1, 8, vec_enc);
5960   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5961   vpaddw(dst, dst, xtmp1, vec_enc);
5962 }
5963 
5964 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5965                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5966   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5967   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5968   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5969 }
5970 
5971 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5972                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5973   switch(bt) {
5974     case T_LONG:
5975       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5976       break;
5977     case T_INT:
5978       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5979       break;
5980     case T_CHAR:
5981     case T_SHORT:
5982       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5983       break;
5984     case T_BYTE:
5985     case T_BOOLEAN:
5986       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5987       break;
5988     default:
5989       fatal("Unsupported type %s", type2name(bt));
5990       break;
5991   }
5992 }
5993 
5994 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5995                                                       KRegister mask, bool merge, int vec_enc) {
5996   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5997   switch(bt) {
5998     case T_LONG:
5999       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6000       evpopcntq(dst, mask, src, merge, vec_enc);
6001       break;
6002     case T_INT:
6003       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6004       evpopcntd(dst, mask, src, merge, vec_enc);
6005       break;
6006     case T_CHAR:
6007     case T_SHORT:
6008       assert(VM_Version::supports_avx512_bitalg(), "");
6009       evpopcntw(dst, mask, src, merge, vec_enc);
6010       break;
6011     case T_BYTE:
6012     case T_BOOLEAN:
6013       assert(VM_Version::supports_avx512_bitalg(), "");
6014       evpopcntb(dst, mask, src, merge, vec_enc);
6015       break;
6016     default:
6017       fatal("Unsupported type %s", type2name(bt));
6018       break;
6019   }
6020 }
6021 
6022 #ifndef _LP64
6023 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
6024   assert(VM_Version::supports_avx512bw(), "");
6025   kmovdl(tmp, src);
6026   kunpckdql(dst, tmp, tmp);
6027 }
6028 #endif
6029 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value; the reversed bit sequence of a byte is then
// obtained by swapping the reversed bit sequences of its upper and lower nibble.
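//
// Scalar sketch of the per-byte step (illustrative only; the vector code uses
// vpshufb as the nibble lookup):
//   static const uint8_t rev4[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//                                    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
//   uint8_t reverse_byte(uint8_t b) {
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
//   }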
6036 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6037                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6038   if (VM_Version::supports_avx512vlbw()) {
6039 
6040     // Get the reverse bit sequence of lower nibble of each byte.
6041     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6042     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6043     evpandq(dst, xtmp2, src, vec_enc);
6044     vpshufb(dst, xtmp1, dst, vec_enc);
6045     vpsllq(dst, dst, 4, vec_enc);
6046 
6047     // Get the reverse bit sequence of upper nibble of each byte.
6048     vpandn(xtmp2, xtmp2, src, vec_enc);
6049     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6050     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6051 
6052     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6053     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6054     evporq(xtmp2, dst, xtmp2, vec_enc);
6055     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6056 
  } else if (vec_enc == Assembler::AVX_512bit) {
6058     // Shift based bit reversal.
6059     assert(bt == T_LONG || bt == T_INT, "");
6060 
6061     // Swap lower and upper nibble of each byte.
6062     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6063 
6064     // Swap two least and most significant bits of each nibble.
6065     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6066 
6067     // Swap adjacent pair of bits.
6068     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6069     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6070 
6071     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6072     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6073   } else {
6074     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6075     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6076 
6077     // Get the reverse bit sequence of lower nibble of each byte.
6078     vpand(dst, xtmp2, src, vec_enc);
6079     vpshufb(dst, xtmp1, dst, vec_enc);
6080     vpsllq(dst, dst, 4, vec_enc);
6081 
6082     // Get the reverse bit sequence of upper nibble of each byte.
6083     vpandn(xtmp2, xtmp2, src, vec_enc);
6084     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6085     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6086 
6087     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6088     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6089     vpor(xtmp2, dst, xtmp2, vec_enc);
6090     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6091   }
6092 }
6093 
6094 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6095                                                 XMMRegister xtmp, Register rscratch) {
6096   assert(VM_Version::supports_gfni(), "");
6097   assert(rscratch != noreg || always_reachable(mask), "missing");
6098 
6099   // Galois field instruction based bit reversal based on following algorithm.
6100   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6101   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6102   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6103   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6104 }
6105 
6106 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6107                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
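  // Illustrative scalar form of the per 64 bit lane operation:
  //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)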
6108   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6109   evpandq(dst, xtmp1, src, vec_enc);
6110   vpsllq(dst, dst, nbits, vec_enc);
6111   vpandn(xtmp1, xtmp1, src, vec_enc);
6112   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6113   evporq(dst, dst, xtmp1, vec_enc);
6114 }
6115 
6116 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6117                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6118   // Shift based bit reversal.
6119   assert(VM_Version::supports_evex(), "");
6120   switch(bt) {
6121     case T_LONG:
6122       // Swap upper and lower double word of each quad word.
6123       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6124       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6125       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6126       break;
6127     case T_INT:
6128       // Swap upper and lower word of each double word.
6129       evprord(xtmp1, k0, src, 16, true, vec_enc);
6130       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6131       break;
6132     case T_CHAR:
6133     case T_SHORT:
6134       // Swap upper and lower byte of each word.
6135       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6136       break;
6137     case T_BYTE:
6138       evmovdquq(dst, k0, src, true, vec_enc);
6139       break;
6140     default:
6141       fatal("Unsupported type %s", type2name(bt));
6142       break;
6143   }
6144 }
6145 
6146 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6147   if (bt == T_BYTE) {
6148     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6149       evmovdquq(dst, k0, src, true, vec_enc);
6150     } else {
6151       vmovdqu(dst, src);
6152     }
6153     return;
6154   }
6155   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6156   // pre-computed shuffle indices.
6157   switch(bt) {
6158     case T_LONG:
6159       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6160       break;
6161     case T_INT:
6162       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6163       break;
6164     case T_CHAR:
6165     case T_SHORT:
6166       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6167       break;
6168     default:
6169       fatal("Unsupported type %s", type2name(bt));
6170       break;
6171   }
6172   vpshufb(dst, src, dst, vec_enc);
6173 }
6174 
6175 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6176                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6177                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6178   assert(is_integral_type(bt), "");
6179   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6180   assert(VM_Version::supports_avx512cd(), "");
6181   switch(bt) {
6182     case T_LONG:
6183       evplzcntq(dst, ktmp, src, merge, vec_enc);
6184       break;
6185     case T_INT:
6186       evplzcntd(dst, ktmp, src, merge, vec_enc);
6187       break;
6188     case T_SHORT:
6189       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6190       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6191       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6192       vpunpckhwd(dst, xtmp1, src, vec_enc);
6193       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6194       vpackusdw(dst, xtmp2, dst, vec_enc);
6195       break;
6196     case T_BYTE:
6197       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6198       // accessing the lookup table.
6199       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6200       // accessing the lookup table.
6201       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
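      // Scalar sketch of the byte CLZ lookup (illustrative; the lut values are
      // assumed from the algorithm description above):
      //   static const uint8_t clz4[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
      //   uint8_t clz_byte(uint8_t b) {
      //     uint8_t hi = b >> 4;
      //     return (hi == 0) ? (uint8_t)(4 + clz4[b & 0x0F]) : clz4[hi];
      //   }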
6202       assert(VM_Version::supports_avx512bw(), "");
6203       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6204       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6205       vpand(xtmp2, dst, src, vec_enc);
6206       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6207       vpsrlw(xtmp3, src, 4, vec_enc);
6208       vpand(xtmp3, dst, xtmp3, vec_enc);
6209       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6210       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6211       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6212       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6213       break;
6214     default:
6215       fatal("Unsupported type %s", type2name(bt));
6216       break;
6217   }
6218 }
6219 
6220 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6221                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6222   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6223   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6224   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6225   // accessing the lookup table.
6226   vpand(dst, xtmp2, src, vec_enc);
6227   vpshufb(dst, xtmp1, dst, vec_enc);
6228   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6229   // accessing the lookup table.
6230   vpsrlw(xtmp3, src, 4, vec_enc);
6231   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6232   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6233   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6234   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6235   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6236   vpaddb(dst, dst, xtmp2, vec_enc);
6237   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6238 }
6239 
6240 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6241                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6242   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6243   // Add zero counts of lower byte and upper byte of a word if
6244   // upper byte holds a zero value.
6245   vpsrlw(xtmp3, src, 8, vec_enc);
6246   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6247   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6248   vpsllw(xtmp2, dst, 8, vec_enc);
6249   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6250   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6251   vpsrlw(dst, dst, 8, vec_enc);
6252 }
6253 
6254 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6255                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in a normalized 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula (matching the code below, which adds 1 to the unbiased exponent):
  //   LZCNT = 32 - (biased_exp - 127 + 1)
  // Special handling is required for zero, Max_Int and negative source values.
6261 
6262   // Broadcast 0xFF
6263   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6264   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6265 
6266   // Extract biased exponent.
6267   vcvtdq2ps(dst, src, vec_enc);
6268   vpsrld(dst, dst, 23, vec_enc);
6269   vpand(dst, dst, xtmp1, vec_enc);
6270 
6271   // Broadcast 127.
6272   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6273   // Exponent = biased_exp - 127
6274   vpsubd(dst, dst, xtmp1, vec_enc);
6275 
6276   // Exponent = Exponent  + 1
6277   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6278   vpaddd(dst, dst, xtmp3, vec_enc);
6279 
6280   // Replace -ve exponent with zero, exponent is -ve when src
6281   // lane contains a zero value.
6282   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6283   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6284 
6285   // Rematerialize broadcast 32.
6286   vpslld(xtmp1, xtmp3, 5, vec_enc);
6287   // Exponent is 32 if corresponding source lane contains max_int value.
6288   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6289   // LZCNT = 32 - exponent
6290   vpsubd(dst, xtmp1, dst, vec_enc);
6291 
6292   // Replace LZCNT with a value 1 if corresponding source lane
6293   // contains max_int value.
6294   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6295 
6296   // Replace biased_exp with 0 if source lane value is less than zero.
6297   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6298   vblendvps(dst, dst, xtmp2, src, vec_enc);
6299 }
6300 
6301 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6302                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6303   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6304   // Add zero counts of lower word and upper word of a double word if
6305   // upper word holds a zero value.
6306   vpsrld(xtmp3, src, 16, vec_enc);
6307   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6308   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6309   vpslld(xtmp2, dst, 16, vec_enc);
6310   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6311   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6312   vpsrld(dst, dst, 16, vec_enc);
6313   // Add zero counts of lower doubleword and upper doubleword of a
6314   // quadword if upper doubleword holds a zero value.
6315   vpsrlq(xtmp3, src, 32, vec_enc);
6316   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6317   vpsllq(xtmp2, dst, 32, vec_enc);
6318   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6319   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6320   vpsrlq(dst, dst, 32, vec_enc);
6321 }
6322 
6323 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6324                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6325                                                        Register rtmp, int vec_enc) {
6326   assert(is_integral_type(bt), "unexpected type");
6327   assert(vec_enc < Assembler::AVX_512bit, "");
6328   switch(bt) {
6329     case T_LONG:
6330       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6331       break;
6332     case T_INT:
6333       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6334       break;
6335     case T_SHORT:
6336       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6337       break;
6338     case T_BYTE:
6339       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6340       break;
6341     default:
6342       fatal("Unsupported type %s", type2name(bt));
6343       break;
6344   }
6345 }
6346 
6347 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6348   switch(bt) {
6349     case T_BYTE:
6350       vpsubb(dst, src1, src2, vec_enc);
6351       break;
6352     case T_SHORT:
6353       vpsubw(dst, src1, src2, vec_enc);
6354       break;
6355     case T_INT:
6356       vpsubd(dst, src1, src2, vec_enc);
6357       break;
6358     case T_LONG:
6359       vpsubq(dst, src1, src2, vec_enc);
6360       break;
6361     default:
6362       fatal("Unsupported type %s", type2name(bt));
6363       break;
6364   }
6365 }
6366 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
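// For example (illustrative, 32 bit lanes): (x - 1) & ~x is a run of ctz(x) one
// bits starting at bit 0, so ctz32(x) == 32 - clz32((x - 1) & ~x); e.g. x = 8
// yields (x - 1) & ~x == 0b0111 and 32 - 29 == 3.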
6371 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6372                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6373                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6374   assert(is_integral_type(bt), "");
6375   // xtmp = -1
6376   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6377   // xtmp = xtmp + src
6378   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6379   // xtmp = xtmp & ~src
6380   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6381   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6382   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6383   vpsub(bt, dst, xtmp4, dst, vec_enc);
6384 }
6385 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
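// For example (illustrative, 32 bit lanes): x | -x sets every bit at and above the
// lowest set bit of x, so ctz32(x) == 32 - popcount32(x | -x); e.g. x = 8 yields
// x | -x == 0xFFFFFFF8 and 32 - 29 == 3.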
6388 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6389                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6390   assert(is_integral_type(bt), "");
6391   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6393   // xtmp = 0 - src
6394   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6395   // xtmp = xtmp | src
6396   vpor(xtmp3, xtmp3, src, vec_enc);
6397   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6398   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6399   vpsub(bt, dst, xtmp1, dst, vec_enc);
6400 }
6401 
6402 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6403   Label done;
6404   Label neg_divisor_fastpath;
6405   cmpl(divisor, 0);
6406   jccb(Assembler::less, neg_divisor_fastpath);
6407   xorl(rdx, rdx);
6408   divl(divisor);
6409   jmpb(done);
6410   bind(neg_divisor_fastpath);
6411   // Fastpath for divisor < 0:
6412   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6413   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
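  // Equivalent scalar form (illustrative): when divisor < 0 (i.e. its unsigned value is
  // >= 2^31) the unsigned quotient is either 0 or 1, namely (dividend >=u divisor) ? 1 : 0.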
6414   movl(rdx, rax);
6415   subl(rdx, divisor);
6416   if (VM_Version::supports_bmi1()) {
6417     andnl(rax, rdx, rax);
6418   } else {
6419     notl(rdx);
6420     andl(rax, rdx);
6421   }
6422   shrl(rax, 31);
6423   bind(done);
6424 }
6425 
6426 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6427   Label done;
6428   Label neg_divisor_fastpath;
6429   cmpl(divisor, 0);
6430   jccb(Assembler::less, neg_divisor_fastpath);
6431   xorl(rdx, rdx);
6432   divl(divisor);
6433   jmpb(done);
6434   bind(neg_divisor_fastpath);
6435   // Fastpath when divisor < 0:
6436   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6437   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
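  // Equivalent scalar form (illustrative): when divisor < 0 the unsigned remainder is
  // dividend - ((dividend >=u divisor) ? divisor : 0).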
6438   movl(rdx, rax);
6439   subl(rax, divisor);
6440   if (VM_Version::supports_bmi1()) {
6441     andnl(rax, rax, rdx);
6442   } else {
6443     notl(rax);
6444     andl(rax, rdx);
6445   }
6446   sarl(rax, 31);
6447   andl(rax, divisor);
6448   subl(rdx, rax);
6449   bind(done);
6450 }
6451 
6452 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6453   Label done;
6454   Label neg_divisor_fastpath;
6455 
6456   cmpl(divisor, 0);
6457   jccb(Assembler::less, neg_divisor_fastpath);
6458   xorl(rdx, rdx);
6459   divl(divisor);
6460   jmpb(done);
6461   bind(neg_divisor_fastpath);
6462   // Fastpath for divisor < 0:
6463   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6464   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6465   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6466   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6467   movl(rdx, rax);
6468   subl(rax, divisor);
6469   if (VM_Version::supports_bmi1()) {
6470     andnl(rax, rax, rdx);
6471   } else {
6472     notl(rax);
6473     andl(rax, rdx);
6474   }
6475   movl(tmp, rax);
6476   shrl(rax, 31); // quotient
6477   sarl(tmp, 31);
6478   andl(tmp, divisor);
6479   subl(rdx, tmp); // remainder
6480   bind(done);
6481 }
6482 
6483 #ifdef _LP64
6484 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6485                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6487     // Galois field instruction based bit reversal based on following algorithm.
6488     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6489     mov64(rtmp, 0x8040201008040201L);
6490     movq(xtmp1, src);
6491     movq(xtmp2, rtmp);
6492     gf2p8affineqb(xtmp1, xtmp2, 0);
6493     movq(dst, xtmp1);
6494   } else {
6495     // Swap even and odd numbered bits.
6496     movl(rtmp, src);
6497     andl(rtmp, 0x55555555);
6498     shll(rtmp, 1);
6499     movl(dst, src);
6500     andl(dst, 0xAAAAAAAA);
6501     shrl(dst, 1);
6502     orl(dst, rtmp);
6503 
6504     // Swap LSB and MSB 2 bits of each nibble.
6505     movl(rtmp, dst);
6506     andl(rtmp, 0x33333333);
6507     shll(rtmp, 2);
6508     andl(dst, 0xCCCCCCCC);
6509     shrl(dst, 2);
6510     orl(dst, rtmp);
6511 
6512     // Swap LSB and MSB 4 bits of each byte.
6513     movl(rtmp, dst);
6514     andl(rtmp, 0x0F0F0F0F);
6515     shll(rtmp, 4);
6516     andl(dst, 0xF0F0F0F0);
6517     shrl(dst, 4);
6518     orl(dst, rtmp);
6519   }
6520   bswapl(dst);
6521 }
6522 
6523 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6524                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6526     // Galois field instruction based bit reversal based on following algorithm.
6527     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6528     mov64(rtmp1, 0x8040201008040201L);
6529     movq(xtmp1, src);
6530     movq(xtmp2, rtmp1);
6531     gf2p8affineqb(xtmp1, xtmp2, 0);
6532     movq(dst, xtmp1);
6533   } else {
6534     // Swap even and odd numbered bits.
6535     movq(rtmp1, src);
6536     mov64(rtmp2, 0x5555555555555555L);
6537     andq(rtmp1, rtmp2);
6538     shlq(rtmp1, 1);
6539     movq(dst, src);
6540     notq(rtmp2);
6541     andq(dst, rtmp2);
6542     shrq(dst, 1);
6543     orq(dst, rtmp1);
6544 
6545     // Swap LSB and MSB 2 bits of each nibble.
6546     movq(rtmp1, dst);
6547     mov64(rtmp2, 0x3333333333333333L);
6548     andq(rtmp1, rtmp2);
6549     shlq(rtmp1, 2);
6550     notq(rtmp2);
6551     andq(dst, rtmp2);
6552     shrq(dst, 2);
6553     orq(dst, rtmp1);
6554 
6555     // Swap LSB and MSB 4 bits of each byte.
6556     movq(rtmp1, dst);
6557     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6558     andq(rtmp1, rtmp2);
6559     shlq(rtmp1, 4);
6560     notq(rtmp2);
6561     andq(dst, rtmp2);
6562     shrq(dst, 4);
6563     orq(dst, rtmp1);
6564   }
6565   bswapq(dst);
6566 }
6567 
6568 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6569   Label done;
6570   Label neg_divisor_fastpath;
6571   cmpq(divisor, 0);
6572   jccb(Assembler::less, neg_divisor_fastpath);
6573   xorl(rdx, rdx);
6574   divq(divisor);
6575   jmpb(done);
6576   bind(neg_divisor_fastpath);
6577   // Fastpath for divisor < 0:
6578   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6579   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6580   movq(rdx, rax);
6581   subq(rdx, divisor);
6582   if (VM_Version::supports_bmi1()) {
6583     andnq(rax, rdx, rax);
6584   } else {
6585     notq(rdx);
6586     andq(rax, rdx);
6587   }
6588   shrq(rax, 63);
6589   bind(done);
6590 }
6591 
6592 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6593   Label done;
6594   Label neg_divisor_fastpath;
6595   cmpq(divisor, 0);
6596   jccb(Assembler::less, neg_divisor_fastpath);
6597   xorq(rdx, rdx);
6598   divq(divisor);
6599   jmp(done);
6600   bind(neg_divisor_fastpath);
6601   // Fastpath when divisor < 0:
6602   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6603   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6604   movq(rdx, rax);
6605   subq(rax, divisor);
6606   if (VM_Version::supports_bmi1()) {
6607     andnq(rax, rax, rdx);
6608   } else {
6609     notq(rax);
6610     andq(rax, rdx);
6611   }
6612   sarq(rax, 63);
6613   andq(rax, divisor);
6614   subq(rdx, rax);
6615   bind(done);
6616 }
6617 
6618 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6619   Label done;
6620   Label neg_divisor_fastpath;
6621   cmpq(divisor, 0);
6622   jccb(Assembler::less, neg_divisor_fastpath);
6623   xorq(rdx, rdx);
6624   divq(divisor);
6625   jmp(done);
6626   bind(neg_divisor_fastpath);
6627   // Fastpath for divisor < 0:
6628   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6629   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6630   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6631   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6632   movq(rdx, rax);
6633   subq(rax, divisor);
6634   if (VM_Version::supports_bmi1()) {
6635     andnq(rax, rax, rdx);
6636   } else {
6637     notq(rax);
6638     andq(rax, rdx);
6639   }
6640   movq(tmp, rax);
6641   shrq(rax, 63); // quotient
6642   sarq(tmp, 63);
6643   andq(tmp, divisor);
6644   subq(rdx, tmp); // remainder
6645   bind(done);
6646 }
6647 #endif
6648 
6649 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6650                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6651                                         int vlen_enc) {
6652   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined by the
  // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
  // normalized to the index range 0-15. This ensures that indices which differ
  // by a multiple of 16 select the same relative position within a 128 bit
  // lane, e.g. shuffle indices 0, 16, 32 and 48 all select the first element
  // of their respective 128 bit lanes.
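  // Reference semantics (illustrative): for a 64 byte vector with shuffle
  // indices in the range 0-63,
  //   for (int i = 0; i < 64; i++) dst[i] = src[shuffle[i]];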
6659   movl(rtmp, 16);
6660   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6661 
  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle it using the
  // original shuffle indices and move the shuffled lanes corresponding to a true
  // mask into the destination vector.
6666   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6667   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6668   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6669 
6670   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6671   // and broadcasting second 128 bit lane.
6672   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6673   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6674   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6675   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6676   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6677 
6678   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6679   // and broadcasting third 128 bit lane.
6680   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6681   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6682   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6683   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6684   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6685 
  // Perform the above steps with lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128 bit lane.
6688   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6689   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6690   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6691   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6692   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6693 }
6694 
6695 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6696                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6697   if (vlen_enc == AVX_128bit) {
6698     vpermilps(dst, src, shuffle, vlen_enc);
6699   } else if (bt == T_INT) {
6700     vpermd(dst, shuffle, src, vlen_enc);
6701   } else {
6702     assert(bt == T_FLOAT, "");
6703     vpermps(dst, shuffle, src, vlen_enc);
6704   }
6705 }
6706 
6707 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6708   switch(elem_bt) {
6709     case T_BYTE:
6710       if (ideal_opc == Op_SaturatingAddV) {
6711         vpaddsb(dst, src1, src2, vlen_enc);
6712       } else {
6713         assert(ideal_opc == Op_SaturatingSubV, "");
6714         vpsubsb(dst, src1, src2, vlen_enc);
6715       }
6716       break;
6717     case T_SHORT:
6718       if (ideal_opc == Op_SaturatingAddV) {
6719         vpaddsw(dst, src1, src2, vlen_enc);
6720       } else {
6721         assert(ideal_opc == Op_SaturatingSubV, "");
6722         vpsubsw(dst, src1, src2, vlen_enc);
6723       }
6724       break;
6725     default:
6726       fatal("Unsupported type %s", type2name(elem_bt));
6727       break;
6728   }
6729 }
6730 
6731 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6732   switch(elem_bt) {
6733     case T_BYTE:
6734       if (ideal_opc == Op_SaturatingAddV) {
6735         vpaddusb(dst, src1, src2, vlen_enc);
6736       } else {
6737         assert(ideal_opc == Op_SaturatingSubV, "");
6738         vpsubusb(dst, src1, src2, vlen_enc);
6739       }
6740       break;
6741     case T_SHORT:
6742       if (ideal_opc == Op_SaturatingAddV) {
6743         vpaddusw(dst, src1, src2, vlen_enc);
6744       } else {
6745         assert(ideal_opc == Op_SaturatingSubV, "");
6746         vpsubusw(dst, src1, src2, vlen_enc);
6747       }
6748       break;
6749     default:
6750       fatal("Unsupported type %s", type2name(elem_bt));
6751       break;
6752   }
6753 }
6754 
6755 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6756                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
6757   // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6758   // overflow_mask = Inp1 <u Inp2
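  // Scalar form (illustrative): res = (src1 <u src2) ? 0 : src1 - src2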
6759   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6760   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6761   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6762 }
6763 
6764 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6765                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6766   // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2  <=>  (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
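  // e.g. for T_INT (illustrative): a <u b  <=>  (int)(a + 0x80000000) < (int)(b + 0x80000000)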
6768   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6769   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6770   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6771 
6772   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6773 
6774   // Res = INP1 - INP2 (non-commutative and non-associative)
6775   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6776   // Res = Mask ? Zero : Res
6777   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6778   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6779 }
6780 
6781 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6782                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6784   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6785   // Res = Signed Add INP1, INP2
6786   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6787   // T1 = SRC1 | SRC2
6788   vpor(xtmp1, src1, src2, vlen_enc);
6789   // Max_Unsigned = -1
6790   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6791   // Unsigned compare:  Mask = Res <u T1
6792   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6793   // res  = Mask ? Max_Unsigned : Res
6794   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6795 }
6796 
6797 //
6798 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the
6799 // saturating unsigned addition operation:
6800 //    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6801 //
6802 // We empirically determined its semantic equivalence to the following reduced expression
6803 //    overflow_mask = (a + b) <u (a | b)
6804 //
6805 // and also verified it through the Alive2 solver.
6806 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6807 //
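     //
     // Illustrative scalar sketch (documentation-only, hypothetical helper name) of the reduced check,
     // as applied by the EVEX saturating unsigned addition routine above and the AVX routine below:
     static inline juint scalar_saturating_add_unsigned_sketch(juint a, juint b) {
       juint res = a + b;                        // wrapping addition
       // overflow_mask = (a + b) <u (a | b); saturate to the unsigned maximum on overflow.
       return (res < (a | b)) ? max_juint : res;
     }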
6808 
6809 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6810                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6811   // Res = Signed Add INP1, INP2
6812   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6813   // Compute T1 = INP1 | INP2
6814   vpor(xtmp3, src1, src2, vlen_enc);
6815   // Generate MIN_VALUE into xtmp2; xtmp1 is left holding all ones, i.e. the unsigned saturating value.
6816   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6817   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6818   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6819   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6820   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6821   // Compute overflow detection mask = Res<s> <s T1<s>
6822   if (elem_bt == T_INT) {
6823     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6824   } else {
6825     assert(elem_bt == T_LONG, "");
6826     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6827   }
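       // Res = overflow_mask ? Max_Unsigned (all ones held in xtmp1) : Res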
6828   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6829 }
6830 
6831 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6832                                       int vlen_enc, bool xtmp2_hold_M1) {
6833   if (VM_Version::supports_avx512dq()) {
6834     evpmovq2m(ktmp, src, vlen_enc);
6835   } else {
6836     assert(VM_Version::supports_evex(), "");
6837     if (!xtmp2_hold_M1) {
6838       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6839     }
6840     evpsraq(xtmp1, src, 63, vlen_enc);
6841     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6842   }
6843 }
6844 
6845 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6846                                       int vlen_enc, bool xtmp2_hold_M1) {
6847   if (VM_Version::supports_avx512dq()) {
6848     evpmovd2m(ktmp, src, vlen_enc);
6849   } else {
6850     assert(VM_Version::supports_evex(), "");
6851     if (!xtmp2_hold_M1) {
6852       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6853     }
6854     vpsrad(xtmp1, src, 31, vlen_enc);
6855     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6856   }
6857 }
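
     // Illustrative scalar sketch (documentation-only, hypothetical helper name) of the emulation used by
     // the two helpers above when AVX512DQ is unavailable: an arithmetic right shift broadcasts the sign
     // bit across the lane, and comparing the result against all ones (-1) sets the mask bit exactly for
     // the negative lanes.
     static inline bool scalar_msb_to_mask_sketch(jlong lane) {
       return (lane >> 63) == -1;                // true iff the lane is negative
     }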
6858 
6859 
6860 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6861   if (elem_bt == T_LONG) {
6862     if (VM_Version::supports_evex()) {
6863       evpsraq(dst, src, 63, vlen_enc);
6864     } else {
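           // Without evpsraq, broadcast each long lane's sign by arithmetically shifting the dwords and
           // replicating the upper dword of every lane (shuffle immediate 0xF5 selects dwords 1,1,3,3).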
6865       vpsrad(dst, src, 31, vlen_enc);
6866       vpshufd(dst, dst, 0xF5, vlen_enc);
6867     }
6868   } else {
6869     assert(elem_bt == T_INT, "");
6870     vpsrad(dst, src, 31, vlen_enc);
6871   }
6872 }
6873 
6874 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6875   if (compute_allones) {
6876     if (vlen_enc == Assembler::AVX_512bit) {
6877       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6878     } else {
6879       vpcmpeqq(allones, allones, allones, vlen_enc);
6880     }
6881   }
6882   if (elem_bt == T_LONG) {
6883     vpsrlq(dst, allones, 1, vlen_enc);
6884   } else {
6885     assert(elem_bt == T_INT, "");
6886     vpsrld(dst, allones, 1, vlen_enc);
6887   }
6888 }
6889 
6890 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6891   if (compute_allones) {
6892     if (vlen_enc == Assembler::AVX_512bit) {
6893       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6894     } else {
6895       vpcmpeqq(allones, allones, allones, vlen_enc);
6896     }
6897   }
6898   if (elem_bt == T_LONG) {
6899     vpsllq(dst, allones, 63, vlen_enc);
6900   } else {
6901     assert(elem_bt == T_INT, "");
6902     vpslld(dst, allones, 31, vlen_enc);
6903   }
6904 }
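
     // Illustrative sanity checks (documentation-only) of the bit patterns produced by the two helpers
     // above: all ones shifted right by one yields MAX_VALUE, and all ones shifted left by
     // (lane width - 1) yields MIN_VALUE.
     static_assert((~(juint)0 >> 1)   == 0x7FFFFFFFu,           "all ones >>> 1 is Integer.MAX_VALUE");
     static_assert((~(juint)0 << 31)  == 0x80000000u,           "all ones << 31 is Integer.MIN_VALUE");
     static_assert((~(julong)0 >> 1)  == 0x7FFFFFFFFFFFFFFFULL, "all ones >>> 1 is Long.MAX_VALUE");
     static_assert((~(julong)0 << 63) == 0x8000000000000000ULL, "all ones << 63 is Long.MIN_VALUE");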
6905 
6906 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6907                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6908   switch(elem_bt) {
6909     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6910     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6911     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6912     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6913     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6914   }
6915 }
6916 
6917 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6918   switch(elem_bt) {
6919     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6920     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6921     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6922     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6923     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6924   }
6925 }
6926 
6927 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6928                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6929   if (elem_bt == T_LONG) {
6930     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6931   } else {
6932     assert(elem_bt == T_INT, "");
6933     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6934   }
6935 }
6936 
6937 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6938                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6939                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6940   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6941   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6942   // Overflow detection is based on Hacker's Delight section 2-13 (scalar sketch after this function).
6943   if (ideal_opc == Op_SaturatingAddV) {
6944     // res = src1 + src2
6945     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6946     // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
6947     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6948     vpxor(xtmp1, dst, src1, vlen_enc);
6949     vpxor(xtmp2, dst, src2, vlen_enc);
6950     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6951   } else {
6952     assert(ideal_opc == Op_SaturatingSubV, "");
6953     // res = src1 - src2
6954     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6955     // Overflow occurs when the inputs have opposite polarity and the result's polarity
6956     // does not match the polarity of the first input.
6957     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6958     vpxor(xtmp1, src1, src2, vlen_enc);
6959     vpxor(xtmp2, dst, src1, vlen_enc);
6960     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6961   }
6962 
6963   // Compute overflow detection mask.
6964   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6965   // Note: xtmp1 holds -1 in all its lanes after the above call.
6966 
6967   // Compute mask based on first input polarity.
6968   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6969 
6970   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6971   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6972 
6973   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6974   // set bits in the first input polarity mask hold the MIN value.
6975   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6976   // Blend destination lanes with saturated values using overflow detection mask.
6977   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6978 }
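
     // Illustrative scalar sketch (documentation-only, hypothetical helper name) of the overflow detection
     // and saturation performed by both the EVEX and the AVX variant of this routine:
     static inline jint scalar_saturating_addsub_signed_sketch(jint a, jint b, bool is_sub) {
       jint res = is_sub ? (jint)((juint)a - (juint)b)
                         : (jint)((juint)a + (juint)b);  // wrapping two's complement arithmetic
       // Addition overflows when both inputs share a polarity that the result does not have; subtraction
       // overflows when the inputs differ in polarity and the result does not match the first input.
       jint overflow = is_sub ? ((a ^ b) & (res ^ a)) : ((res ^ a) & (res ^ b));
       if (overflow < 0) {
         // Saturate toward the polarity of the first input: MIN_VALUE if it is negative, MAX_VALUE otherwise.
         return (a < 0) ? min_jint : max_jint;
       }
       return res;
     }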
6979 
6980 
6981 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6982                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6983                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6984   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6985   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6986   // Overflow detection is based on Hacker's Delight section 2-13 (see the scalar sketch after vector_addsub_dq_saturating_evex above).
6987   if (ideal_opc == Op_SaturatingAddV) {
6988     // res = src1 + src2
6989     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6990     // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
6991     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6992     vpxor(xtmp1, dst, src1, vlen_enc);
6993     vpxor(xtmp2, dst, src2, vlen_enc);
6994     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6995   } else {
6996     assert(ideal_opc == Op_SaturatingSubV, "");
6997     // res = src1 - src2
6998     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6999     // Overflow occurs when the inputs have opposite polarity and the result's polarity
7000     // does not match the polarity of the first input.
7001     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
7002     vpxor(xtmp1, src1, src2, vlen_enc);
7003     vpxor(xtmp2, dst, src1, vlen_enc);
7004     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7005   }
7006 
7007   // Sign-extend to compute overflow detection mask.
7008   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
7009 
7010   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
7011   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
7012   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7013 
7014   // Compose saturating min/max vector using first input polarity mask.
7015   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7016   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7017 
7018   // Blend result with saturating vector using overflow detection mask.
7019   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7020 }
7021 
7022 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7023   switch(elem_bt) {
7024     case T_BYTE:
7025       if (ideal_opc == Op_SaturatingAddV) {
7026         vpaddsb(dst, src1, src2, vlen_enc);
7027       } else {
7028         assert(ideal_opc == Op_SaturatingSubV, "");
7029         vpsubsb(dst, src1, src2, vlen_enc);
7030       }
7031       break;
7032     case T_SHORT:
7033       if (ideal_opc == Op_SaturatingAddV) {
7034         vpaddsw(dst, src1, src2, vlen_enc);
7035       } else {
7036         assert(ideal_opc == Op_SaturatingSubV, "");
7037         vpsubsw(dst, src1, src2, vlen_enc);
7038       }
7039       break;
7040     default:
7041       fatal("Unsupported type %s", type2name(elem_bt));
7042       break;
7043   }
7044 }
7045 
7046 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7047   switch(elem_bt) {
7048     case T_BYTE:
7049       if (ideal_opc == Op_SaturatingAddV) {
7050         vpaddusb(dst, src1, src2, vlen_enc);
7051       } else {
7052         assert(ideal_opc == Op_SaturatingSubV, "");
7053         vpsubusb(dst, src1, src2, vlen_enc);
7054       }
7055       break;
7056     case T_SHORT:
7057       if (ideal_opc == Op_SaturatingAddV) {
7058         vpaddusw(dst, src1, src2, vlen_enc);
7059       } else {
7060         assert(ideal_opc == Op_SaturatingSubV, "");
7061         vpsubusw(dst, src1, src2, vlen_enc);
7062       }
7063       break;
7064     default:
7065       fatal("Unsupported type %s", type2name(elem_bt));
7066       break;
7067   }
7068 }
7069 
7070 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7071                                                      XMMRegister src2, int vlen_enc) {
7072   switch(elem_bt) {
7073     case T_BYTE:
7074       evpermi2b(dst, src1, src2, vlen_enc);
7075       break;
7076     case T_SHORT:
7077       evpermi2w(dst, src1, src2, vlen_enc);
7078       break;
7079     case T_INT:
7080       evpermi2d(dst, src1, src2, vlen_enc);
7081       break;
7082     case T_LONG:
7083       evpermi2q(dst, src1, src2, vlen_enc);
7084       break;
7085     case T_FLOAT:
7086       evpermi2ps(dst, src1, src2, vlen_enc);
7087       break;
7088     case T_DOUBLE:
7089       evpermi2pd(dst, src1, src2, vlen_enc);
7090       break;
7091     default:
7092       fatal("Unsupported type %s", type2name(elem_bt));
7093       break;
7094   }
7095 }
7096 
7097 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7098   if (is_unsigned) {
7099     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7100   } else {
7101     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7102   }
7103 }
7104 
7105 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7106   if (is_unsigned) {
7107     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7108   } else {
7109     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7110   }
7111 }