1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/output.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes; the frame
  // allocation can be either 3 or 6 bytes. So if we don't do a stack bang
  // then we must use the 6 byte frame allocation even if we have no frame. :-(
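  //
  // Rough sketch of the two prolog shapes emitted below (illustrative only;
  // the exact instructions depend on the flags checked in this routine,
  // PFP = PreserveFramePointer):
  //
  //   with stack bang:                  without stack bang:
  //     <stack overflow check>            sub  rsp, framesize   ; forced imm32 form
  //     push rbp                          mov  [rsp + off], rbp
  //     mov  rbp, rsp        ; if PFP     mov  rbp, rsp         ; if PFP
  //     sub  rsp, framesize               add  rbp, off         ; if PFP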
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  67   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack, and the stack safety zone should account for that.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
    // Remove word for rbp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
 104 
 105   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 108   }
 109 
 110 #ifdef ASSERT
 111   if (VerifyStackAtCalls) {
 112     Label L;
 113     push(rax);
 114     mov(rax, rsp);
 115     andptr(rax, StackAlignmentInBytes-1);
 116     cmpptr(rax, StackAlignmentInBytes-wordSize);
 117     pop(rax);
 118     jcc(Assembler::equal, L);
 119     STOP("Stack is not properly aligned!");
 120     bind(L);
 121   }
 122 #endif
 123 
 124   if (!is_stub) {
 125     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 126     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 127     Label dummy_slow_path;
 128     Label dummy_continuation;
 129     Label* slow_path = &dummy_slow_path;
 130     Label* continuation = &dummy_continuation;
 131     if (!Compile::current()->output()->in_scratch_emit_size()) {
 132       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 133       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 134       Compile::current()->output()->add_stub(stub);
 135       slow_path = &stub->entry();
 136       continuation = &stub->continuation();
 137     }
 138     bs->nmethod_entry_barrier(this, slow_path, continuation);
 139   }
 140 }
 141 
 142 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 143   switch (vlen_in_bytes) {
 144     case  4: // fall-through
 145     case  8: // fall-through
 146     case 16: return Assembler::AVX_128bit;
 147     case 32: return Assembler::AVX_256bit;
 148     case 64: return Assembler::AVX_512bit;
 149 
 150     default: {
 151       ShouldNotReachHere();
 152       return Assembler::AVX_NoVec;
 153     }
 154   }
 155 }
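// For example, a 32-byte (256-bit) vector operand selects Assembler::AVX_256bit,
// while 4- and 8-byte sub-word vectors still use the AVX_128bit encoding.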
 156 
 157 // fast_lock and fast_unlock used by C2
 158 
 159 // Because the transitions from emitted code to the runtime
 160 // monitorenter/exit helper stubs are so slow it's critical that
 161 // we inline both the stack-locking fast path and the inflated fast path.
 162 //
 163 // See also: cmpFastLock and cmpFastUnlock.
 164 //
 165 // What follows is a specialized inline transliteration of the code
 166 // in enter() and exit(). If we're concerned about I$ bloat another
 167 // option would be to emit TrySlowEnter and TrySlowExit methods
 168 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 170 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 171 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 172 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 177 //
 178 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 179 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 180 // to those specialized methods.  That'd give us a mostly platform-independent
 181 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 183 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 184 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 185 // (b) explicit barriers or fence operations.
 186 //
 187 // TODO:
 188 //
 189 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 190 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 191 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 192 //    the lock operators would typically be faster than reifying Self.
 193 //
 194 // *  Ideally I'd define the primitives as:
 195 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 196 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 197 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 199 //    Furthermore the register assignments are overconstrained, possibly resulting in
 200 //    sub-optimal code near the synchronization site.
 201 //
 202 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 203 //    Alternately, use a better sp-proximity test.
 204 //
 205 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 206 //    Either one is sufficient to uniquely identify a thread.
 207 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 208 //
 209 // *  Intrinsify notify() and notifyAll() for the common cases where the
 210 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 212 //
 213 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 214 //    But beware of excessive branch density on AMD Opterons.
 215 //
 216 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 217 //    or failure of the fast path.  If the fast path fails then we pass
 218 //    control to the slow path, typically in C.  In fast_lock and
 219 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 220 //    will emit a conditional branch immediately after the node.
 221 //    So we have branches to branches and lots of ICC.ZF games.
 222 //    Instead, it might be better to have C2 pass a "FailureLabel"
 223 //    into fast_lock and fast_unlock.  In the case of success, control
 224 //    will drop through the node.  ICC.ZF is undefined at exit.
 225 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 227 
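//
// For reference, a rough pseudo-code sketch of the ZF contract both fast paths
// implement (illustrative only; the emitted code below is authoritative):
//
//   fast_lock(obj, box):
//     if (lock acquired on the fast path)  ZF = 1   // C2 falls through
//     else                                 ZF = 0   // C2 branches to the slow path
//
//   fast_unlock(obj, box):
//     if (lock released on the fast path)  ZF = 1
//     else                                 ZF = 0   // slow path via a runtime call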
 228 
 229 // obj: object to lock
 230 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 232 // scr: tmp -- KILLED
 233 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 234                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 235                                  Metadata* method_data) {
 236   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 237   // Ensure the register assignments are disjoint
 238   assert(tmpReg == rax, "");
 239   assert(cx1Reg == noreg, "");
 240   assert(cx2Reg == noreg, "");
 241   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 242 
 243   // Possible cases that we'll encounter in fast_lock
 244   // ------------------------------------------------
 245   // * Inflated
 246   //    -- unlocked
 247   //    -- Locked
 248   //       = by self
 249   //       = by other
 250   // * neutral
 251   // * stack-locked
 252   //    -- by self
 253   //       = sp-proximity test hits
 254   //       = sp-proximity test generates false-negative
 255   //    -- by other
 256   //
 257 
 258   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 259 
 260   if (DiagnoseSyncOnValueBasedClasses != 0) {
 261     load_klass(tmpReg, objReg, scrReg);
 262     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 263     jcc(Assembler::notZero, DONE_LABEL);
 264   }
 265 
 266   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 267   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 268   jcc(Assembler::notZero, IsInflated);
 269 
 270   if (LockingMode == LM_MONITOR) {
 271     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 272     testptr(objReg, objReg);
 273   } else {
 274     assert(LockingMode == LM_LEGACY, "must be");
 275     // Attempt stack-locking ...
 276     orptr (tmpReg, markWord::unlocked_value);
 277     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 278     lock();
 279     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 280     jcc(Assembler::equal, COUNT);           // Success
 281 
 282     // Recursive locking.
 283     // The object is stack-locked: markword contains stack pointer to BasicLock.
 284     // Locked by current thread if difference with current SP is less than one page.
 285     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 287     andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
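    // Illustrative arithmetic, assuming a 4K page: 7 - 4096 == -4089 == 0x...fffff007,
    // so the AND leaves zero (ZF == 1) only when the displaced-header address is
    // word aligned and within one page above rsp, i.e. a plausible recursive
    // stack-lock owned by the current thread.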
 288     movptr(Address(boxReg, 0), tmpReg);
 289   }
 290   jmp(DONE_LABEL);
 291 
 292   bind(IsInflated);
 293   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 294 
 295   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 296   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 297   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 298 
 299   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 300   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 301   movq(scrReg, tmpReg);
 302   xorq(tmpReg, tmpReg);
 303   lock();
 304   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 305 
 306   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 307   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 308 
 309   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 310   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 311   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 312   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 313   bind(DONE_LABEL);
 314 
 315   // ZFlag == 1 count in fast path
 316   // ZFlag == 0 count in slow path
 317   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 318 
 319   bind(COUNT);
 320   if (LockingMode == LM_LEGACY) {
 321     // Count monitors in fast path
 322     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 323   }
 324   xorl(tmpReg, tmpReg); // Set ZF == 1
 325 
 326   bind(NO_COUNT);
 327 
 328   // At NO_COUNT the icc ZFlag is set as follows ...
 329   // fast_unlock uses the same protocol.
 330   // ZFlag == 1 -> Success
 331   // ZFlag == 0 -> Failure - force control through the slow path
 332 }
 333 
 334 // obj: object to unlock
 335 // box: box address (displaced header location), killed.  Must be EAX.
 336 // tmp: killed, cannot be obj nor box.
 337 //
 338 // Some commentary on balanced locking:
 339 //
 340 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 341 // Methods that don't have provably balanced locking are forced to run in the
 342 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 343 // The interpreter provides two properties:
 344 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 346 //      interpreter maintains an on-stack list of locks currently held by
 347 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 350 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 352 // B() doesn't have provably balanced locking so it runs in the interpreter.
 353 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 354 // is still locked by A().
 355 //
 356 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 357 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 358 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 359 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->_owner == Self check in unlock.
 363 // A perfectly viable alternative is to elide the owner check except when
 364 // Xcheck:jni is enabled.
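//
// Concretely (illustrative Java shape only):
//
//   void A() { synchronized (o) { B(); } }   // provably balanced -> compiled, uses fast_lock/fast_unlock
//   void B() { /* unbalanced monitorenter/monitorexit bytecode */ }  // rejected by C2 -> runs interpreted
//
// By I1 any monitors B() acquired are released when B() returns, and by I2 B()
// cannot have released A()'s lock on O, so A()'s fast_unlock always finds O
// still locked exactly as A() left it.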
 365 
 366 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 367   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 368   assert(boxReg == rax, "");
 369   assert_different_registers(objReg, boxReg, tmpReg);
 370 
 371   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 372 
 373   if (LockingMode == LM_LEGACY) {
 374     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 375     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 376   }
 377   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 378   if (LockingMode != LM_MONITOR) {
 379     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 380     jcc(Assembler::zero, Stacked);
 381   }
 382 
 383   // It's inflated.
 384 
 385   // Despite our balanced locking property we still check that m->_owner == Self
 386   // as java routines or native JNI code called by this thread might
 387   // have released the lock.
 388   //
 389   // If there's no contention try a 1-0 exit.  That is, exit without
 390   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 391   // we detect and recover from the race that the 1-0 exit admits.
 392   //
 393   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 394   // before it STs null into _owner, releasing the lock.  Updates
 395   // to data protected by the critical section must be visible before
 396   // we drop the lock (and thus before any other thread could acquire
 397   // the lock and observe the fields protected by the lock).
  // x86's memory model (TSO) keeps stores ordered with respect to
 399   // each other and there's no need for an explicit barrier (fence).
 400   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
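  //
  // Rough sketch of the 1-0 exit implemented below (illustrative only):
  //
  //   if (m->_recursions != 0) { m->_recursions--; ZF = 1; done; }
  //   m->_owner = null;                  // release the lock
  //   StoreLoad fence;                   // then re-read the waiter state
  //   if (m->_entry_list == null || m->_succ != null) { ZF = 1; done; }
  //   // otherwise a waiter may be stranded: publish the monitor in the thread
  //   // and take the slow path with ZF = 0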
 401   Label LSuccess, LNotRecursive;
 402 
 403   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 404   jccb(Assembler::equal, LNotRecursive);
 405 
 406   // Recursive inflated unlock
 407   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 408   jmpb(LSuccess);
 409 
 410   bind(LNotRecursive);
 411 
 412   // Set owner to null.
 413   // Release to satisfy the JMM
 414   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 415   // We need a full fence after clearing owner to avoid stranding.
 416   // StoreLoad achieves this.
 417   membar(StoreLoad);
 418 
 419   // Check if the entry_list is empty.
 420   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 421   jccb(Assembler::zero, LSuccess);    // If so we are done.
 422 
 423   // Check if there is a successor.
 424   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 425   jccb(Assembler::notZero, LSuccess); // If so we are done.
 426 
 427   // Save the monitor pointer in the current thread, so we can try to
 428   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 429   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 430   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 431 
 432   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 433   jmpb  (DONE_LABEL);
 434 
 435   bind  (LSuccess);
 436   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 437   jmpb  (DONE_LABEL);
 438 
 439   if (LockingMode == LM_LEGACY) {
 440     bind  (Stacked);
 441     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 442     lock();
 443     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 444     // Intentional fall-thru into DONE_LABEL
 445   }
 446 
 447   bind(DONE_LABEL);
 448 
 449   // ZFlag == 1 count in fast path
 450   // ZFlag == 0 count in slow path
 451   jccb(Assembler::notZero, NO_COUNT);
 452 
 453   bind(COUNT);
 454 
 455   if (LockingMode == LM_LEGACY) {
 456     // Count monitors in fast path
 457     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 458   }
 459 
 460   xorl(tmpReg, tmpReg); // Set ZF == 1
 461 
 462   bind(NO_COUNT);
 463 }
 464 
 465 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 466                                               Register t, Register thread) {
 467   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 468   assert(rax_reg == rax, "Used for CAS");
 469   assert_different_registers(obj, box, rax_reg, t, thread);
 470 
 471   // Handle inflated monitor.
 472   Label inflated;
 473   // Finish fast lock successfully. ZF value is irrelevant.
 474   Label locked;
 475   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 476   Label slow_path;
 477 
 478   if (UseObjectMonitorTable) {
 479     // Clear cache in case fast locking succeeds.
 480     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 481   }
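  // Mark word lock bits relied on below (low two bits of the header, for reference):
  //   0b01 = unlocked (neutral), 0b00 = fast-locked, 0b10 = inflated monitor.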
 482 
 483   if (DiagnoseSyncOnValueBasedClasses != 0) {
 484     load_klass(rax_reg, obj, t);
 485     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 486     jcc(Assembler::notZero, slow_path);
 487   }
 488 
 489   const Register mark = t;
 490 
 491   { // Lightweight Lock
 492 
 493     Label push;
 494 
 495     const Register top = UseObjectMonitorTable ? rax_reg : box;
 496 
 497     // Load the mark.
 498     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 499 
 500     // Prefetch top.
 501     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 502 
 503     // Check for monitor (0b10).
 504     testptr(mark, markWord::monitor_value);
 505     jcc(Assembler::notZero, inflated);
 506 
 507     // Check if lock-stack is full.
 508     cmpl(top, LockStack::end_offset() - 1);
 509     jcc(Assembler::greater, slow_path);
 510 
 511     // Check if recursive.
 512     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 513     jccb(Assembler::equal, push);
 514 
 515     // Try to lock. Transition lock bits 0b01 => 0b00
 516     movptr(rax_reg, mark);
 517     orptr(rax_reg, markWord::unlocked_value);
 518     andptr(mark, ~(int32_t)markWord::unlocked_value);
 519     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 520     jcc(Assembler::notEqual, slow_path);
 521 
 522     if (UseObjectMonitorTable) {
 523       // Need to reload top, clobbered by CAS.
 524       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 525     }
 526     bind(push);
 527     // After successful lock, push object on lock-stack.
 528     movptr(Address(thread, top), obj);
 529     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 530     jmpb(locked);
 531   }
 532 
 533   { // Handle inflated monitor.
 534     bind(inflated);
 535 
 536     const Register monitor = t;
 537 
 538     if (!UseObjectMonitorTable) {
 539       assert(mark == monitor, "should be the same here");
 540     } else {
 541       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 542       // Fetch ObjectMonitor* from the cache or take the slow-path.
 543       Label monitor_found;
 544 
 545       // Load cache address
 546       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 547 
 548       const int num_unrolled = 2;
 549       for (int i = 0; i < num_unrolled; i++) {
 550         cmpptr(obj, Address(t));
 551         jccb(Assembler::equal, monitor_found);
 552         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 553       }
 554 
 555       Label loop;
 556 
 557       // Search for obj in cache.
 558       bind(loop);
 559 
 560       // Check for match.
 561       cmpptr(obj, Address(t));
 562       jccb(Assembler::equal, monitor_found);
 563 
 564       // Search until null encountered, guaranteed _null_sentinel at end.
 565       cmpptr(Address(t), 1);
 566       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 567       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 568       jmpb(loop);
 569 
 570       // Cache hit.
 571       bind(monitor_found);
 572       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 573     }
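    // Note: the search above relies on the OMCache being laid out as consecutive
    // (oop, ObjectMonitor*) pairs terminated by a null-oop sentinel, which is why
    // a miss is guaranteed to fall out at the sentinel and take the slow path.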
 574     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 575     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 576     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
 577 
 578     Label monitor_locked;
 579     // Lock the monitor.
 580 
 581     if (UseObjectMonitorTable) {
 582       // Cache the monitor for unlock before trashing box. On failure to acquire
 583       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 584       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 585     }
 586 
 587     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 588     xorptr(rax_reg, rax_reg);
 589     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 590     lock(); cmpxchgptr(box, owner_address);
 591     jccb(Assembler::equal, monitor_locked);
 592 
 593     // Check if recursive.
 594     cmpptr(box, rax_reg);
 595     jccb(Assembler::notEqual, slow_path);
 596 
 597     // Recursive.
 598     increment(recursions_address);
 599 
 600     bind(monitor_locked);
 601   }
 602 
 603   bind(locked);
 604   // Set ZF = 1
 605   xorl(rax_reg, rax_reg);
 606 
 607 #ifdef ASSERT
 608   // Check that locked label is reached with ZF set.
 609   Label zf_correct;
 610   Label zf_bad_zero;
 611   jcc(Assembler::zero, zf_correct);
 612   jmp(zf_bad_zero);
 613 #endif
 614 
 615   bind(slow_path);
 616 #ifdef ASSERT
 617   // Check that slow_path label is reached with ZF not set.
 618   jcc(Assembler::notZero, zf_correct);
 619   stop("Fast Lock ZF != 0");
 620   bind(zf_bad_zero);
 621   stop("Fast Lock ZF != 1");
 622   bind(zf_correct);
 623 #endif
 624   // C2 uses the value of ZF to determine the continuation.
 625 }
 626 
 627 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 628   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 629   assert(reg_rax == rax, "Used for CAS");
 630   assert_different_registers(obj, reg_rax, t);
 631 
 632   // Handle inflated monitor.
 633   Label inflated, inflated_check_lock_stack;
 634   // Finish fast unlock successfully.  MUST jump with ZF == 1
 635   Label unlocked, slow_path;
 636 
 637   const Register mark = t;
 638   const Register monitor = t;
 639   const Register top = UseObjectMonitorTable ? t : reg_rax;
 640   const Register box = reg_rax;
 641 
 642   Label dummy;
 643   C2FastUnlockLightweightStub* stub = nullptr;
 644 
 645   if (!Compile::current()->output()->in_scratch_emit_size()) {
 646     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 647     Compile::current()->output()->add_stub(stub);
 648   }
 649 
 650   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 651 
 652   { // Lightweight Unlock
 653 
 654     // Load top.
 655     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 656 
 657     if (!UseObjectMonitorTable) {
 658       // Prefetch mark.
 659       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 660     }
 661 
 662     // Check if obj is top of lock-stack.
 663     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 664     // Top of lock stack was not obj. Must be monitor.
 665     jcc(Assembler::notEqual, inflated_check_lock_stack);
 666 
 667     // Pop lock-stack.
 668     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 669     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 670 
 671     // Check if recursive.
 672     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 673     jcc(Assembler::equal, unlocked);
 674 
 675     // We elide the monitor check, let the CAS fail instead.
 676 
 677     if (UseObjectMonitorTable) {
 678       // Load mark.
 679       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 680     }
 681 
 682     // Try to unlock. Transition lock bits 0b00 => 0b01
 683     movptr(reg_rax, mark);
 684     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 685     orptr(mark, markWord::unlocked_value);
 686     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 687     jcc(Assembler::notEqual, push_and_slow_path);
 688     jmp(unlocked);
 689   }
 690 
 691 
 692   { // Handle inflated monitor.
 693     bind(inflated_check_lock_stack);
 694 #ifdef ASSERT
 695     Label check_done;
 696     subl(top, oopSize);
 697     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 698     jcc(Assembler::below, check_done);
 699     cmpptr(obj, Address(thread, top));
 700     jccb(Assembler::notEqual, inflated_check_lock_stack);
 701     stop("Fast Unlock lock on stack");
 702     bind(check_done);
 703     if (UseObjectMonitorTable) {
 704       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 705     }
 706     testptr(mark, markWord::monitor_value);
 707     jccb(Assembler::notZero, inflated);
 708     stop("Fast Unlock not monitor");
 709 #endif
 710 
 711     bind(inflated);
 712 
 713     if (!UseObjectMonitorTable) {
 714       assert(mark == monitor, "should be the same here");
 715     } else {
 716       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 717       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 718       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 719       cmpptr(monitor, alignof(ObjectMonitor*));
 720       jcc(Assembler::below, slow_path);
 721     }
 722     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 723     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 724     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 725     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 726     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 727 
 728     Label recursive;
 729 
 730     // Check if recursive.
 731     cmpptr(recursions_address, 0);
 732     jccb(Assembler::notZero, recursive);
 733 
 734     // Set owner to null.
 735     // Release to satisfy the JMM
 736     movptr(owner_address, NULL_WORD);
 737     // We need a full fence after clearing owner to avoid stranding.
 738     // StoreLoad achieves this.
 739     membar(StoreLoad);
 740 
 741     // Check if the entry_list is empty.
 742     cmpptr(entry_list_address, NULL_WORD);
 743     jccb(Assembler::zero, unlocked);    // If so we are done.
 744 
 745     // Check if there is a successor.
 746     cmpptr(succ_address, NULL_WORD);
 747     jccb(Assembler::notZero, unlocked); // If so we are done.
 748 
 749     // Save the monitor pointer in the current thread, so we can try to
 750     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 751     if (!UseObjectMonitorTable) {
 752       andptr(monitor, ~(int32_t)markWord::monitor_value);
 753     }
 754     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 755 
 756     orl(t, 1); // Fast Unlock ZF = 0
 757     jmpb(slow_path);
 758 
 759     // Recursive unlock.
 760     bind(recursive);
 761     decrement(recursions_address);
 762   }
 763 
 764   bind(unlocked);
 765   xorl(t, t); // Fast Unlock ZF = 1
 766 
 767 #ifdef ASSERT
 768   // Check that unlocked label is reached with ZF set.
 769   Label zf_correct;
 770   Label zf_bad_zero;
 771   jcc(Assembler::zero, zf_correct);
 772   jmp(zf_bad_zero);
 773 #endif
 774 
 775   bind(slow_path);
 776   if (stub != nullptr) {
 777     bind(stub->slow_path_continuation());
 778   }
 779 #ifdef ASSERT
 780   // Check that stub->continuation() label is reached with ZF not set.
 781   jcc(Assembler::notZero, zf_correct);
 782   stop("Fast Unlock ZF != 0");
 783   bind(zf_bad_zero);
 784   stop("Fast Unlock ZF != 1");
 785   bind(zf_correct);
 786 #endif
 787   // C2 uses the value of ZF to determine the continuation.
 788 }
 789 
 790 //-------------------------------------------------------------------------------------------
 791 // Generic instructions support for use in .ad files C2 code generation
 792 
 793 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 794   if (dst != src) {
 795     movdqu(dst, src);
 796   }
 797   if (opcode == Op_AbsVD) {
 798     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 799   } else {
    assert(opcode == Op_NegVD, "opcode should be Op_NegVD");
 801     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 802   }
 803 }
 804 
 805 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 806   if (opcode == Op_AbsVD) {
 807     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 808   } else {
    assert(opcode == Op_NegVD, "opcode should be Op_NegVD");
 810     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 811   }
 812 }
 813 
 814 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 815   if (dst != src) {
 816     movdqu(dst, src);
 817   }
 818   if (opcode == Op_AbsVF) {
 819     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 820   } else {
    assert(opcode == Op_NegVF, "opcode should be Op_NegVF");
 822     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 823   }
 824 }
 825 
 826 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 827   if (opcode == Op_AbsVF) {
 828     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 829   } else {
    assert(opcode == Op_NegVF, "opcode should be Op_NegVF");
 831     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 832   }
 833 }
 834 
 835 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 836   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 837   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 838 
 839   if (opcode == Op_MinV) {
 840     if (elem_bt == T_BYTE) {
 841       pminsb(dst, src);
 842     } else if (elem_bt == T_SHORT) {
 843       pminsw(dst, src);
 844     } else if (elem_bt == T_INT) {
 845       pminsd(dst, src);
 846     } else {
 847       assert(elem_bt == T_LONG, "required");
 848       assert(tmp == xmm0, "required");
 849       assert_different_registers(dst, src, tmp);
 850       movdqu(xmm0, dst);
 851       pcmpgtq(xmm0, src);
 852       blendvpd(dst, src);  // xmm0 as mask
 853     }
 854   } else { // opcode == Op_MaxV
 855     if (elem_bt == T_BYTE) {
 856       pmaxsb(dst, src);
 857     } else if (elem_bt == T_SHORT) {
 858       pmaxsw(dst, src);
 859     } else if (elem_bt == T_INT) {
 860       pmaxsd(dst, src);
 861     } else {
 862       assert(elem_bt == T_LONG, "required");
 863       assert(tmp == xmm0, "required");
 864       assert_different_registers(dst, src, tmp);
 865       movdqu(xmm0, src);
 866       pcmpgtq(xmm0, dst);
 867       blendvpd(dst, src);  // xmm0 as mask
 868     }
 869   }
 870 }
 871 
 872 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 873                                   XMMRegister src1, Address src2, int vlen_enc) {
 874   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 875   if (opcode == Op_UMinV) {
 876     switch(elem_bt) {
 877       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 878       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 879       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 880       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 881       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 882     }
 883   } else {
 884     assert(opcode == Op_UMaxV, "required");
 885     switch(elem_bt) {
 886       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 887       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 888       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 889       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 890       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 891     }
 892   }
 893 }
 894 
 895 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 896   // For optimality, leverage a full vector width of 512 bits
 897   // for operations over smaller vector sizes on AVX512 targets.
 898   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 899     if (opcode == Op_UMaxV) {
 900       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 901     } else {
 902       assert(opcode == Op_UMinV, "required");
 903       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 904     }
 905   } else {
 906     // T1 = -1
 907     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 908     // T1 = -1 << 63
 909     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
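    // Adding T1 (1 << 63) flips each element's sign bit (mod 2^64), mapping the
    // unsigned order onto the signed order (0 becomes LONG_MIN, 2^64-1 becomes
    // LONG_MAX), so the signed vpcmpgtq below computes the unsigned comparison.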
 910     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 911     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 912     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 913     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 914     // Mask = T2 > T1
 915     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 916     if (opcode == Op_UMaxV) {
 917       // Res = Mask ? Src2 : Src1
 918       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 919     } else {
 920       // Res = Mask ? Src1 : Src2
 921       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 922     }
 923   }
 924 }
 925 
 926 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 927                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 928   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 929   if (opcode == Op_UMinV) {
 930     switch(elem_bt) {
 931       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 932       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 933       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 934       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 935       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 936     }
 937   } else {
 938     assert(opcode == Op_UMaxV, "required");
 939     switch(elem_bt) {
 940       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 941       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 942       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 943       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 944       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 945     }
 946   }
 947 }
 948 
 949 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 950                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 951                                  int vlen_enc) {
 952   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 953 
 954   if (opcode == Op_MinV) {
 955     if (elem_bt == T_BYTE) {
 956       vpminsb(dst, src1, src2, vlen_enc);
 957     } else if (elem_bt == T_SHORT) {
 958       vpminsw(dst, src1, src2, vlen_enc);
 959     } else if (elem_bt == T_INT) {
 960       vpminsd(dst, src1, src2, vlen_enc);
 961     } else {
 962       assert(elem_bt == T_LONG, "required");
 963       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 964         vpminsq(dst, src1, src2, vlen_enc);
 965       } else {
 966         assert_different_registers(dst, src1, src2);
 967         vpcmpgtq(dst, src1, src2, vlen_enc);
 968         vblendvpd(dst, src1, src2, dst, vlen_enc);
 969       }
 970     }
 971   } else { // opcode == Op_MaxV
 972     if (elem_bt == T_BYTE) {
 973       vpmaxsb(dst, src1, src2, vlen_enc);
 974     } else if (elem_bt == T_SHORT) {
 975       vpmaxsw(dst, src1, src2, vlen_enc);
 976     } else if (elem_bt == T_INT) {
 977       vpmaxsd(dst, src1, src2, vlen_enc);
 978     } else {
 979       assert(elem_bt == T_LONG, "required");
 980       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 981         vpmaxsq(dst, src1, src2, vlen_enc);
 982       } else {
 983         assert_different_registers(dst, src1, src2);
 984         vpcmpgtq(dst, src1, src2, vlen_enc);
 985         vblendvpd(dst, src2, src1, dst, vlen_enc);
 986       }
 987     }
 988   }
 989 }
 990 
 991 // Float/Double min max
 992 
 993 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 994                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 995                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 996                                    int vlen_enc) {
 997   assert(UseAVX > 0, "required");
 998   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 999          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1000   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1001   assert_different_registers(a, tmp, atmp, btmp);
1002   assert_different_registers(b, tmp, atmp, btmp);
1003 
1004   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1005   bool is_double_word = is_double_word_type(elem_bt);
1006 
1007   /* Note on 'non-obvious' assembly sequence:
1008    *
1009    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1010    * and Java on how they handle floats:
1011    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1013    *
1014    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1015    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1016    *                (only useful when signs differ, noop otherwise)
1017    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1020    *   btmp = (b < +0.0) ? a : b
1021    *   atmp = (b < +0.0) ? b : a
1022    *   Tmp  = Max_Float(atmp , btmp)
1023    *   Res  = (atmp == NaN) ? atmp : Tmp
1024    */
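  // Worked example of the mismatch being corrected (illustrative): Java's
  // Math.min(-0.0f, +0.0f) must return -0.0f, but vminps returns its second
  // parameter when the inputs compare equal, so it can hand back +0.0f; and
  // Math.min(NaN, 1.0f) must be NaN, yet vminps(NaN, 1.0f) returns 1.0f.
  // The blend/cmp steps below restore the Java semantics in both cases.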
1025 
1026   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1027   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1028   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1029   XMMRegister mask;
1030 
1031   if (!is_double_word && is_min) {
1032     mask = a;
1033     vblend = &MacroAssembler::vblendvps;
1034     vmaxmin = &MacroAssembler::vminps;
1035     vcmp = &MacroAssembler::vcmpps;
1036   } else if (!is_double_word && !is_min) {
1037     mask = b;
1038     vblend = &MacroAssembler::vblendvps;
1039     vmaxmin = &MacroAssembler::vmaxps;
1040     vcmp = &MacroAssembler::vcmpps;
1041   } else if (is_double_word && is_min) {
1042     mask = a;
1043     vblend = &MacroAssembler::vblendvpd;
1044     vmaxmin = &MacroAssembler::vminpd;
1045     vcmp = &MacroAssembler::vcmppd;
1046   } else {
1047     assert(is_double_word && !is_min, "sanity");
1048     mask = b;
1049     vblend = &MacroAssembler::vblendvpd;
1050     vmaxmin = &MacroAssembler::vmaxpd;
1051     vcmp = &MacroAssembler::vcmppd;
1052   }
1053 
1054   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1055   XMMRegister maxmin, scratch;
1056   if (dst == btmp) {
1057     maxmin = btmp;
1058     scratch = tmp;
1059   } else {
1060     maxmin = tmp;
1061     scratch = btmp;
1062   }
1063 
1064   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1065   if (precompute_mask && !is_double_word) {
1066     vpsrad(tmp, mask, 32, vlen_enc);
1067     mask = tmp;
1068   } else if (precompute_mask && is_double_word) {
1069     vpxor(tmp, tmp, tmp, vlen_enc);
1070     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1071     mask = tmp;
1072   }
1073 
1074   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1075   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1076   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1077   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1078   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1079 }
1080 
1081 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1082                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1083                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1084                                     int vlen_enc) {
1085   assert(UseAVX > 2, "required");
1086   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1087          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1088   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1089   assert_different_registers(dst, a, atmp, btmp);
1090   assert_different_registers(dst, b, atmp, btmp);
1091 
1092   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1093   bool is_double_word = is_double_word_type(elem_bt);
1094   bool merge = true;
1095 
1096   if (!is_double_word && is_min) {
1097     evpmovd2m(ktmp, a, vlen_enc);
1098     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1099     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1100     vminps(dst, atmp, btmp, vlen_enc);
1101     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1102     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1103   } else if (!is_double_word && !is_min) {
1104     evpmovd2m(ktmp, b, vlen_enc);
1105     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1106     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1107     vmaxps(dst, atmp, btmp, vlen_enc);
1108     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1109     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1110   } else if (is_double_word && is_min) {
1111     evpmovq2m(ktmp, a, vlen_enc);
1112     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1113     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1114     vminpd(dst, atmp, btmp, vlen_enc);
1115     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1116     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1117   } else {
1118     assert(is_double_word && !is_min, "sanity");
1119     evpmovq2m(ktmp, b, vlen_enc);
1120     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1121     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1122     vmaxpd(dst, atmp, btmp, vlen_enc);
1123     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1124     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1125   }
1126 }
1127 
1128 // Float/Double signum
1129 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1130   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1131 
1132   Label DONE_LABEL;
1133 
1134   if (opcode == Op_SignumF) {
1135     assert(UseSSE > 0, "required");
1136     ucomiss(dst, zero);
1137     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1138     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1139     movflt(dst, one);
1140     jcc(Assembler::above, DONE_LABEL);
1141     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1142   } else if (opcode == Op_SignumD) {
1143     assert(UseSSE > 1, "required");
1144     ucomisd(dst, zero);
1145     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1146     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1147     movdbl(dst, one);
1148     jcc(Assembler::above, DONE_LABEL);
1149     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1150   }
1151 
1152   bind(DONE_LABEL);
1153 }
1154 
1155 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1156   if (sign) {
1157     pmovsxbw(dst, src);
1158   } else {
1159     pmovzxbw(dst, src);
1160   }
1161 }
1162 
1163 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1164   if (sign) {
1165     vpmovsxbw(dst, src, vector_len);
1166   } else {
1167     vpmovzxbw(dst, src, vector_len);
1168   }
1169 }
1170 
1171 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1172   if (sign) {
1173     vpmovsxbd(dst, src, vector_len);
1174   } else {
1175     vpmovzxbd(dst, src, vector_len);
1176   }
1177 }
1178 
1179 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1180   if (sign) {
1181     vpmovsxwd(dst, src, vector_len);
1182   } else {
1183     vpmovzxwd(dst, src, vector_len);
1184   }
1185 }
1186 
1187 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1188                                      int shift, int vector_len) {
1189   if (opcode == Op_RotateLeftV) {
1190     if (etype == T_INT) {
1191       evprold(dst, src, shift, vector_len);
1192     } else {
1193       assert(etype == T_LONG, "expected type T_LONG");
1194       evprolq(dst, src, shift, vector_len);
1195     }
1196   } else {
1197     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1198     if (etype == T_INT) {
1199       evprord(dst, src, shift, vector_len);
1200     } else {
1201       assert(etype == T_LONG, "expected type T_LONG");
1202       evprorq(dst, src, shift, vector_len);
1203     }
1204   }
1205 }
1206 
1207 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1208                                      XMMRegister shift, int vector_len) {
1209   if (opcode == Op_RotateLeftV) {
1210     if (etype == T_INT) {
1211       evprolvd(dst, src, shift, vector_len);
1212     } else {
1213       assert(etype == T_LONG, "expected type T_LONG");
1214       evprolvq(dst, src, shift, vector_len);
1215     }
1216   } else {
1217     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1218     if (etype == T_INT) {
1219       evprorvd(dst, src, shift, vector_len);
1220     } else {
1221       assert(etype == T_LONG, "expected type T_LONG");
1222       evprorvq(dst, src, shift, vector_len);
1223     }
1224   }
1225 }
1226 
1227 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1228   if (opcode == Op_RShiftVI) {
1229     psrad(dst, shift);
1230   } else if (opcode == Op_LShiftVI) {
1231     pslld(dst, shift);
1232   } else {
1233     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1234     psrld(dst, shift);
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1239   switch (opcode) {
1240     case Op_RShiftVI:  psrad(dst, shift); break;
1241     case Op_LShiftVI:  pslld(dst, shift); break;
1242     case Op_URShiftVI: psrld(dst, shift); break;
1243 
1244     default: assert(false, "%s", NodeClassNames[opcode]);
1245   }
1246 }
1247 
1248 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1249   if (opcode == Op_RShiftVI) {
1250     vpsrad(dst, nds, shift, vector_len);
1251   } else if (opcode == Op_LShiftVI) {
1252     vpslld(dst, nds, shift, vector_len);
1253   } else {
1254     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1255     vpsrld(dst, nds, shift, vector_len);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1260   switch (opcode) {
1261     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1262     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1263     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1264 
1265     default: assert(false, "%s", NodeClassNames[opcode]);
1266   }
1267 }
1268 
1269 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1270   switch (opcode) {
1271     case Op_RShiftVB:  // fall-through
1272     case Op_RShiftVS:  psraw(dst, shift); break;
1273 
1274     case Op_LShiftVB:  // fall-through
1275     case Op_LShiftVS:  psllw(dst, shift);   break;
1276 
1277     case Op_URShiftVS: // fall-through
1278     case Op_URShiftVB: psrlw(dst, shift);  break;
1279 
1280     default: assert(false, "%s", NodeClassNames[opcode]);
1281   }
1282 }
1283 
1284 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1285   switch (opcode) {
1286     case Op_RShiftVB:  // fall-through
1287     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1288 
1289     case Op_LShiftVB:  // fall-through
1290     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1291 
1292     case Op_URShiftVS: // fall-through
1293     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1294 
1295     default: assert(false, "%s", NodeClassNames[opcode]);
1296   }
1297 }
1298 
1299 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1300   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1302     case Op_LShiftVL:  psllq(dst, shift); break;
1303     case Op_URShiftVL: psrlq(dst, shift); break;
1304 
1305     default: assert(false, "%s", NodeClassNames[opcode]);
1306   }
1307 }
1308 
1309 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1310   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1312   } else if (opcode == Op_LShiftVL) {
1313     psllq(dst, shift);
1314   } else {
1315     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1316     psrlq(dst, shift);
1317   }
1318 }
1319 
1320 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1321   switch (opcode) {
1322     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1323     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1324     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1325 
1326     default: assert(false, "%s", NodeClassNames[opcode]);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1331   if (opcode == Op_RShiftVL) {
1332     evpsraq(dst, nds, shift, vector_len);
1333   } else if (opcode == Op_LShiftVL) {
1334     vpsllq(dst, nds, shift, vector_len);
1335   } else {
1336     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1337     vpsrlq(dst, nds, shift, vector_len);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1342   switch (opcode) {
1343     case Op_RShiftVB:  // fall-through
1344     case Op_RShiftVS:  // fall-through
1345     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1346 
1347     case Op_LShiftVB:  // fall-through
1348     case Op_LShiftVS:  // fall-through
1349     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1350 
1351     case Op_URShiftVB: // fall-through
1352     case Op_URShiftVS: // fall-through
1353     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1354 
1355     default: assert(false, "%s", NodeClassNames[opcode]);
1356   }
1357 }
1358 
1359 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1360   switch (opcode) {
1361     case Op_RShiftVB:  // fall-through
1362     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1363 
1364     case Op_LShiftVB:  // fall-through
1365     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1366 
1367     case Op_URShiftVB: // fall-through
1368     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1369 
1370     default: assert(false, "%s", NodeClassNames[opcode]);
1371   }
1372 }
1373 
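// Variable 64-bit shifts. Pre-AVX512 hardware has no variable arithmetic right shift for
// quad-words, so Op_RShiftVL is emulated below with logical shifts using the identity
// (x >>s n) == ((x >>u n) ^ m) - m, where m == (sign_mask >>u n).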
1374 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1375   assert(UseAVX >= 2, "required");
1376   switch (opcode) {
1377     case Op_RShiftVL: {
1378       if (UseAVX > 2) {
1379         assert(tmp == xnoreg, "not used");
1380         if (!VM_Version::supports_avx512vl()) {
1381           vlen_enc = Assembler::AVX_512bit;
1382         }
1383         evpsravq(dst, src, shift, vlen_enc);
1384       } else {
1385         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1386         vpsrlvq(dst, src, shift, vlen_enc);
1387         vpsrlvq(tmp, tmp, shift, vlen_enc);
1388         vpxor(dst, dst, tmp, vlen_enc);
1389         vpsubq(dst, dst, tmp, vlen_enc);
1390       }
1391       break;
1392     }
1393     case Op_LShiftVL: {
1394       assert(tmp == xnoreg, "not used");
1395       vpsllvq(dst, src, shift, vlen_enc);
1396       break;
1397     }
1398     case Op_URShiftVL: {
1399       assert(tmp == xnoreg, "not used");
1400       vpsrlvq(dst, src, shift, vlen_enc);
1401       break;
1402     }
1403     default: assert(false, "%s", NodeClassNames[opcode]);
1404   }
1405 }
1406 
1407 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1408 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1409   assert(opcode == Op_LShiftVB ||
1410          opcode == Op_RShiftVB ||
1411          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1412   bool sign = (opcode != Op_URShiftVB);
1413   assert(vector_len == 0, "required");
1414   vextendbd(sign, dst, src, 1);
1415   vpmovzxbd(vtmp, shift, 1);
1416   varshiftd(opcode, dst, dst, vtmp, 1);
1417   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1418   vextracti128_high(vtmp, dst);
1419   vpackusdw(dst, dst, vtmp, 0);
1420 }
1421 
1422 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1423 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1424   assert(opcode == Op_LShiftVB ||
1425          opcode == Op_RShiftVB ||
1426          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1427   bool sign = (opcode != Op_URShiftVB);
1428   int ext_vector_len = vector_len + 1;
1429   vextendbw(sign, dst, src, ext_vector_len);
1430   vpmovzxbw(vtmp, shift, ext_vector_len);
1431   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1432   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1433   if (vector_len == 0) {
1434     vextracti128_high(vtmp, dst);
1435     vpackuswb(dst, dst, vtmp, vector_len);
1436   } else {
1437     vextracti64x4_high(vtmp, dst);
1438     vpackuswb(dst, dst, vtmp, vector_len);
1439     vpermq(dst, dst, 0xD8, vector_len);
1440   }
1441 }
1442 
1443 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1444   switch(typ) {
1445     case T_BYTE:
1446       pinsrb(dst, val, idx);
1447       break;
1448     case T_SHORT:
1449       pinsrw(dst, val, idx);
1450       break;
1451     case T_INT:
1452       pinsrd(dst, val, idx);
1453       break;
1454     case T_LONG:
1455       pinsrq(dst, val, idx);
1456       break;
1457     default:
1458       assert(false,"Should not reach here.");
1459       break;
1460   }
1461 }
1462 
1463 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1464   switch(typ) {
1465     case T_BYTE:
1466       vpinsrb(dst, src, val, idx);
1467       break;
1468     case T_SHORT:
1469       vpinsrw(dst, src, val, idx);
1470       break;
1471     case T_INT:
1472       vpinsrd(dst, src, val, idx);
1473       break;
1474     case T_LONG:
1475       vpinsrq(dst, src, val, idx);
1476       break;
1477     default:
1478       assert(false,"Should not reach here.");
1479       break;
1480   }
1481 }
1482 
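// Gather eight bytes (or four shorts) one element at a time under the control of a scalar
// bit mask: lanes whose mask bit is clear are left as zero in dst. mask_idx tracks the
// current bit position and is advanced for every lane, whether it is loaded or not.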
1483 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1484                                                 XMMRegister dst, Register base,
1485                                                 Register idx_base,
1486                                                 Register offset, Register mask,
1487                                                 Register mask_idx, Register rtmp,
1488                                                 int vlen_enc) {
1489   vpxor(dst, dst, dst, vlen_enc);
1490   if (elem_bt == T_SHORT) {
1491     for (int i = 0; i < 4; i++) {
1492       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1493       Label skip_load;
1494       btq(mask, mask_idx);
1495       jccb(Assembler::carryClear, skip_load);
1496       movl(rtmp, Address(idx_base, i * 4));
1497       if (offset != noreg) {
1498         addl(rtmp, offset);
1499       }
1500       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1501       bind(skip_load);
1502       incq(mask_idx);
1503     }
1504   } else {
1505     assert(elem_bt == T_BYTE, "");
1506     for (int i = 0; i < 8; i++) {
1507       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1508       Label skip_load;
1509       btq(mask, mask_idx);
1510       jccb(Assembler::carryClear, skip_load);
1511       movl(rtmp, Address(idx_base, i * 4));
1512       if (offset != noreg) {
1513         addl(rtmp, offset);
1514       }
1515       pinsrb(dst, Address(base, rtmp), i);
1516       bind(skip_load);
1517       incq(mask_idx);
1518     }
1519   }
1520 }
1521 
1522 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1523                                          Register base, Register idx_base,
1524                                          Register offset, Register rtmp,
1525                                          int vlen_enc) {
1526   vpxor(dst, dst, dst, vlen_enc);
1527   if (elem_bt == T_SHORT) {
1528     for (int i = 0; i < 4; i++) {
1529       // dst[i] = src[offset + idx_base[i]]
1530       movl(rtmp, Address(idx_base, i * 4));
1531       if (offset != noreg) {
1532         addl(rtmp, offset);
1533       }
1534       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1535     }
1536   } else {
1537     assert(elem_bt == T_BYTE, "");
1538     for (int i = 0; i < 8; i++) {
1539       // dst[i] = src[offset + idx_base[i]]
1540       movl(rtmp, Address(idx_base, i * 4));
1541       if (offset != noreg) {
1542         addl(rtmp, offset);
1543       }
1544       pinsrb(dst, Address(base, rtmp), i);
1545     }
1546   }
1547 }
1548 
1549 /*
1550  * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
1551  * accumulates values from the gather indices into a quad-word (64-bit) slice.
1552  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1553  * permutation to place the slice into the appropriate vector lane
1554  * locations in the destination vector. The following pseudo code describes the
1555  * algorithm in detail:
1556  *
1557  * DST_VEC = ZERO_VEC
1558  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1559  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1560  * FOREACH_ITER:
1561  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1562  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1563  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1564  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1565  *
1566  * With each iteration, the doubleword permute indices (0, 1) corresponding
1567  * to the gathered quad-word are right shifted by two lane positions.
1568  *
1569  */
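// Example: for T_SHORT with vector_len == 16 (a 256-bit destination), each iteration
// gathers four short values into one 64-bit slice, so the loop below runs four times.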
1570 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1571                                         Register base, Register idx_base,
1572                                         Register offset, Register mask,
1573                                         XMMRegister xtmp1, XMMRegister xtmp2,
1574                                         XMMRegister temp_dst, Register rtmp,
1575                                         Register mask_idx, Register length,
1576                                         int vector_len, int vlen_enc) {
1577   Label GATHER8_LOOP;
1578   assert(is_subword_type(elem_ty), "");
1579   movl(length, vector_len);
1580   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1581   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1582   vallones(xtmp2, vlen_enc);
1583   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1584   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1585   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1586 
1587   bind(GATHER8_LOOP);
1588     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1589     if (mask == noreg) {
1590       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1591     } else {
1592       vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc);
1593     }
1594     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1595     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1596     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1597     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1598     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1599     vpor(dst, dst, temp_dst, vlen_enc);
1600     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1601     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1602     jcc(Assembler::notEqual, GATHER8_LOOP);
1603 }
1604 
1605 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1606   switch(typ) {
1607     case T_INT:
1608       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1609       break;
1610     case T_FLOAT:
1611       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1612       break;
1613     case T_LONG:
1614       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1615       break;
1616     case T_DOUBLE:
1617       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1618       break;
1619     default:
1620       assert(false,"Should not reach here.");
1621       break;
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1626   switch(typ) {
1627     case T_INT:
1628       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1629       break;
1630     case T_FLOAT:
1631       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1632       break;
1633     case T_LONG:
1634       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1635       break;
1636     case T_DOUBLE:
1637       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1638       break;
1639     default:
1640       assert(false,"Should not reach here.");
1641       break;
1642   }
1643 }
1644 
1645 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1646   switch(typ) {
1647     case T_INT:
1648       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1649       break;
1650     case T_FLOAT:
1651       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1652       break;
1653     case T_LONG:
1654       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1655       break;
1656     case T_DOUBLE:
1657       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1658       break;
1659     default:
1660       assert(false,"Should not reach here.");
1661       break;
1662   }
1663 }
1664 
1665 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1666   if (vlen_in_bytes <= 16) {
1667     pxor (dst, dst);
1668     psubb(dst, src);
1669     switch (elem_bt) {
1670       case T_BYTE:   /* nothing to do */ break;
1671       case T_SHORT:  pmovsxbw(dst, dst); break;
1672       case T_INT:    pmovsxbd(dst, dst); break;
1673       case T_FLOAT:  pmovsxbd(dst, dst); break;
1674       case T_LONG:   pmovsxbq(dst, dst); break;
1675       case T_DOUBLE: pmovsxbq(dst, dst); break;
1676 
1677       default: assert(false, "%s", type2name(elem_bt));
1678     }
1679   } else {
1680     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1681     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1682 
1683     vpxor (dst, dst, dst, vlen_enc);
1684     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1685 
1686     switch (elem_bt) {
1687       case T_BYTE:   /* nothing to do */            break;
1688       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1689       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1690       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1691       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1692       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1693 
1694       default: assert(false, "%s", type2name(elem_bt));
1695     }
1696   }
1697 }
1698 
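// Convert a vector of byte booleans in src into an opmask register. On targets without the
// needed AVX512BW/VL forms (novlbwdq), the bytes are sign-extended to dwords and compared for
// equality against vector_int_mask_cmp_bits; otherwise the bytes are negated so their sign
// bits reflect the mask and evpmovb2m extracts those sign bits into dst.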
1699 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1700   if (novlbwdq) {
1701     vpmovsxbd(xtmp, src, vlen_enc);
1702     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1703             Assembler::eq, true, vlen_enc, noreg);
1704   } else {
1705     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1706     vpsubb(xtmp, xtmp, src, vlen_enc);
1707     evpmovb2m(dst, xtmp, vlen_enc);
1708   }
1709 }
1710 
1711 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1712   if (is_integral_type(bt)) {
1713     switch (vlen_in_bytes) {
1714       case 4:  movdl(dst, src);   break;
1715       case 8:  movq(dst, src);    break;
1716       case 16: movdqu(dst, src);  break;
1717       case 32: vmovdqu(dst, src); break;
1718       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1719       default: ShouldNotReachHere();
1720     }
1721   } else {
1722     switch (vlen_in_bytes) {
1723       case 4:  movflt(dst, src); break;
1724       case 8:  movdbl(dst, src); break;
1725       case 16: movups(dst, src); break;
1726       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1727       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1728       default: ShouldNotReachHere();
1729     }
1730   }
1731 }
1732 
1733 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1734   assert(rscratch != noreg || always_reachable(src), "missing");
1735 
1736   if (reachable(src)) {
1737     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1738   } else {
1739     lea(rscratch, src);
1740     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1741   }
1742 }
1743 
1744 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1745   int vlen_enc = vector_length_encoding(vlen);
1746   if (VM_Version::supports_avx()) {
1747     if (bt == T_LONG) {
1748       if (VM_Version::supports_avx2()) {
1749         vpbroadcastq(dst, src, vlen_enc);
1750       } else {
1751         vmovddup(dst, src, vlen_enc);
1752       }
1753     } else if (bt == T_DOUBLE) {
1754       if (vlen_enc != Assembler::AVX_128bit) {
1755         vbroadcastsd(dst, src, vlen_enc, noreg);
1756       } else {
1757         vmovddup(dst, src, vlen_enc);
1758       }
1759     } else {
1760       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1761         vpbroadcastd(dst, src, vlen_enc);
1762       } else {
1763         vbroadcastss(dst, src, vlen_enc);
1764       }
1765     }
1766   } else if (VM_Version::supports_sse3()) {
1767     movddup(dst, src);
1768   } else {
1769     load_vector(bt, dst, src, vlen);
1770   }
1771 }
1772 
1773 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1774   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between consecutive types is 64 bytes.
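  // For example, T_INT (4-byte elements) starts at offset 2 << 6 = 128, while T_FLOAT
  // lands at 128 + 128 = 256 because the floating point tables follow the integral ones.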
1775   int offset = exact_log2(type2aelembytes(bt)) << 6;
1776   if (is_floating_point_type(bt)) {
1777     offset += 128;
1778   }
1779   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1780   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1781 }
1782 
1783 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1784 
1785 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1786   int vector_len = Assembler::AVX_128bit;
1787 
1788   switch (opcode) {
1789     case Op_AndReductionV:  pand(dst, src); break;
1790     case Op_OrReductionV:   por (dst, src); break;
1791     case Op_XorReductionV:  pxor(dst, src); break;
1792     case Op_MinReductionV:
1793       switch (typ) {
1794         case T_BYTE:        pminsb(dst, src); break;
1795         case T_SHORT:       pminsw(dst, src); break;
1796         case T_INT:         pminsd(dst, src); break;
1797         case T_LONG:        assert(UseAVX > 2, "required");
1798                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1799         default:            assert(false, "wrong type");
1800       }
1801       break;
1802     case Op_MaxReductionV:
1803       switch (typ) {
1804         case T_BYTE:        pmaxsb(dst, src); break;
1805         case T_SHORT:       pmaxsw(dst, src); break;
1806         case T_INT:         pmaxsd(dst, src); break;
1807         case T_LONG:        assert(UseAVX > 2, "required");
1808                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1809         default:            assert(false, "wrong type");
1810       }
1811       break;
1812     case Op_AddReductionVF: addss(dst, src); break;
1813     case Op_AddReductionVD: addsd(dst, src); break;
1814     case Op_AddReductionVI:
1815       switch (typ) {
1816         case T_BYTE:        paddb(dst, src); break;
1817         case T_SHORT:       paddw(dst, src); break;
1818         case T_INT:         paddd(dst, src); break;
1819         default:            assert(false, "wrong type");
1820       }
1821       break;
1822     case Op_AddReductionVL: paddq(dst, src); break;
1823     case Op_MulReductionVF: mulss(dst, src); break;
1824     case Op_MulReductionVD: mulsd(dst, src); break;
1825     case Op_MulReductionVI:
1826       switch (typ) {
1827         case T_SHORT:       pmullw(dst, src); break;
1828         case T_INT:         pmulld(dst, src); break;
1829         default:            assert(false, "wrong type");
1830       }
1831       break;
1832     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1833                             evpmullq(dst, dst, src, vector_len); break;
1834     default:                assert(false, "wrong opcode");
1835   }
1836 }
1837 
1838 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1839   switch (opcode) {
1840     case Op_AddReductionVF: addps(dst, src); break;
1841     case Op_AddReductionVD: addpd(dst, src); break;
1842     case Op_MulReductionVF: mulps(dst, src); break;
1843     case Op_MulReductionVD: mulpd(dst, src); break;
1844     default:                assert(false, "%s", NodeClassNames[opcode]);
1845   }
1846 }
1847 
1848 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1849   int vector_len = Assembler::AVX_256bit;
1850 
1851   switch (opcode) {
1852     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1853     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1854     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1855     case Op_MinReductionV:
1856       switch (typ) {
1857         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1858         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1859         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1860         case T_LONG:        assert(UseAVX > 2, "required");
1861                             vpminsq(dst, src1, src2, vector_len); break;
1862         default:            assert(false, "wrong type");
1863       }
1864       break;
1865     case Op_MaxReductionV:
1866       switch (typ) {
1867         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1868         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1869         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1870         case T_LONG:        assert(UseAVX > 2, "required");
1871                             vpmaxsq(dst, src1, src2, vector_len); break;
1872         default:            assert(false, "wrong type");
1873       }
1874       break;
1875     case Op_AddReductionVI:
1876       switch (typ) {
1877         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1878         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1879         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1880         default:            assert(false, "wrong type");
1881       }
1882       break;
1883     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1884     case Op_MulReductionVI:
1885       switch (typ) {
1886         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1887         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1888         default:            assert(false, "wrong type");
1889       }
1890       break;
1891     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1892     default:                assert(false, "wrong opcode");
1893   }
1894 }
1895 
1896 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1897   int vector_len = Assembler::AVX_256bit;
1898 
1899   switch (opcode) {
1900     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1901     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1902     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1903     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1904     default:                assert(false, "%s", NodeClassNames[opcode]);
1905   }
1906 }
1907 
1908 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1909                                   XMMRegister dst, XMMRegister src,
1910                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1911   switch (opcode) {
1912     case Op_AddReductionVF:
1913     case Op_MulReductionVF:
1914       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1915       break;
1916 
1917     case Op_AddReductionVD:
1918     case Op_MulReductionVD:
1919       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1920       break;
1921 
1922     default: assert(false, "wrong opcode");
1923   }
1924 }
1925 
1926 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1927                                             XMMRegister dst, XMMRegister src,
1928                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1929   switch (opcode) {
1930     case Op_AddReductionVF:
1931     case Op_MulReductionVF:
1932       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1933       break;
1934 
1935     case Op_AddReductionVD:
1936     case Op_MulReductionVD:
1937       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1938       break;
1939 
1940     default: assert(false, "%s", NodeClassNames[opcode]);
1941   }
1942 }
1943 
1944 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1945                              Register dst, Register src1, XMMRegister src2,
1946                              XMMRegister vtmp1, XMMRegister vtmp2) {
1947   switch (vlen) {
1948     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1950     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952 
1953     default: assert(false, "wrong vector length");
1954   }
1955 }
1956 
1957 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1958                              Register dst, Register src1, XMMRegister src2,
1959                              XMMRegister vtmp1, XMMRegister vtmp2) {
1960   switch (vlen) {
1961     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965 
1966     default: assert(false, "wrong vector length");
1967   }
1968 }
1969 
1970 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1971                              Register dst, Register src1, XMMRegister src2,
1972                              XMMRegister vtmp1, XMMRegister vtmp2) {
1973   switch (vlen) {
1974     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1978 
1979     default: assert(false, "wrong vector length");
1980   }
1981 }
1982 
1983 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1984                              Register dst, Register src1, XMMRegister src2,
1985                              XMMRegister vtmp1, XMMRegister vtmp2) {
1986   switch (vlen) {
1987     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1990     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1991 
1992     default: assert(false, "wrong vector length");
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1997                              Register dst, Register src1, XMMRegister src2,
1998                              XMMRegister vtmp1, XMMRegister vtmp2) {
1999   switch (vlen) {
2000     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003 
2004     default: assert(false, "wrong vector length");
2005   }
2006 }
2007 
2008 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2009   switch (vlen) {
2010     case 2:
2011       assert(vtmp2 == xnoreg, "");
2012       reduce2F(opcode, dst, src, vtmp1);
2013       break;
2014     case 4:
2015       assert(vtmp2 == xnoreg, "");
2016       reduce4F(opcode, dst, src, vtmp1);
2017       break;
2018     case 8:
2019       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2020       break;
2021     case 16:
2022       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2023       break;
2024     default: assert(false, "wrong vector length");
2025   }
2026 }
2027 
2028 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2029   switch (vlen) {
2030     case 2:
2031       assert(vtmp2 == xnoreg, "");
2032       reduce2D(opcode, dst, src, vtmp1);
2033       break;
2034     case 4:
2035       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2036       break;
2037     case 8:
2038       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2039       break;
2040     default: assert(false, "wrong vector length");
2041   }
2042 }
2043 
2044 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   switch (vlen) {
2046     case 2:
2047       assert(vtmp1 == xnoreg, "");
2048       assert(vtmp2 == xnoreg, "");
2049       unorderedReduce2F(opcode, dst, src);
2050       break;
2051     case 4:
2052       assert(vtmp2 == xnoreg, "");
2053       unorderedReduce4F(opcode, dst, src, vtmp1);
2054       break;
2055     case 8:
2056       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2057       break;
2058     case 16:
2059       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2060       break;
2061     default: assert(false, "wrong vector length");
2062   }
2063 }
2064 
2065 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   switch (vlen) {
2067     case 2:
2068       assert(vtmp1 == xnoreg, "");
2069       assert(vtmp2 == xnoreg, "");
2070       unorderedReduce2D(opcode, dst, src);
2071       break;
2072     case 4:
2073       assert(vtmp2 == xnoreg, "");
2074       unorderedReduce4D(opcode, dst, src, vtmp1);
2075       break;
2076     case 8:
2077       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2078       break;
2079     default: assert(false, "wrong vector length");
2080   }
2081 }
2082 
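// Reduce a two-lane int vector: fold the two lanes into one (phaddd for additions,
// shuffle + op otherwise), combine the result with the scalar input src1, and move
// the final value into dst.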
2083 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2084   if (opcode == Op_AddReductionVI) {
2085     if (vtmp1 != src2) {
2086       movdqu(vtmp1, src2);
2087     }
2088     phaddd(vtmp1, vtmp1);
2089   } else {
2090     pshufd(vtmp1, src2, 0x1);
2091     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2092   }
2093   movdl(vtmp2, src1);
2094   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2095   movdl(dst, vtmp1);
2096 }
2097 
2098 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   if (opcode == Op_AddReductionVI) {
2100     if (vtmp1 != src2) {
2101       movdqu(vtmp1, src2);
2102     }
2103     phaddd(vtmp1, src2);
2104     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2105   } else {
2106     pshufd(vtmp2, src2, 0xE);
2107     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2108     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2109   }
2110 }
2111 
2112 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2113   if (opcode == Op_AddReductionVI) {
2114     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2115     vextracti128_high(vtmp2, vtmp1);
2116     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2117     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2118   } else {
2119     vextracti128_high(vtmp1, src2);
2120     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2121     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2122   }
2123 }
2124 
2125 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2126   vextracti64x4_high(vtmp2, src2);
2127   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2128   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2129 }
2130 
2131 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   pshufd(vtmp2, src2, 0x1);
2133   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2134   movdqu(vtmp1, vtmp2);
2135   psrldq(vtmp1, 2);
2136   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2137   movdqu(vtmp2, vtmp1);
2138   psrldq(vtmp2, 1);
2139   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2140   movdl(vtmp2, src1);
2141   pmovsxbd(vtmp1, vtmp1);
2142   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2143   pextrb(dst, vtmp1, 0x0);
2144   movsbl(dst, dst);
2145 }
2146 
2147 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2148   pshufd(vtmp1, src2, 0xE);
2149   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2150   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151 }
2152 
2153 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2154   vextracti128_high(vtmp2, src2);
2155   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2156   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2157 }
2158 
2159 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2160   vextracti64x4_high(vtmp1, src2);
2161   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2162   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2163 }
2164 
2165 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   pmovsxbw(vtmp2, src2);
2167   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2168 }
2169 
2170 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2171   if (UseAVX > 1) {
2172     int vector_len = Assembler::AVX_256bit;
2173     vpmovsxbw(vtmp1, src2, vector_len);
2174     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175   } else {
2176     pmovsxbw(vtmp2, src2);
2177     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2178     pshufd(vtmp2, src2, 0xE); // bring the upper eight bytes to the low half
2179     pmovsxbw(vtmp2, vtmp2);
2180     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2181   }
2182 }
2183 
2184 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2185   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2186     int vector_len = Assembler::AVX_512bit;
2187     vpmovsxbw(vtmp1, src2, vector_len);
2188     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2189   } else {
2190     assert(UseAVX >= 2,"Should not reach here.");
2191     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2192     vextracti128_high(vtmp2, src2);
2193     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2194   }
2195 }
2196 
2197 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2198   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2199   vextracti64x4_high(vtmp2, src2);
2200   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2201 }
2202 
2203 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2204   if (opcode == Op_AddReductionVI) {
2205     if (vtmp1 != src2) {
2206       movdqu(vtmp1, src2);
2207     }
2208     phaddw(vtmp1, vtmp1);
2209     phaddw(vtmp1, vtmp1);
2210   } else {
2211     pshufd(vtmp2, src2, 0x1);
2212     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2213     movdqu(vtmp1, vtmp2);
2214     psrldq(vtmp1, 2);
2215     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2216   }
2217   movdl(vtmp2, src1);
2218   pmovsxwd(vtmp1, vtmp1);
2219   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2220   pextrw(dst, vtmp1, 0x0);
2221   movswl(dst, dst);
2222 }
2223 
2224 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2225   if (opcode == Op_AddReductionVI) {
2226     if (vtmp1 != src2) {
2227       movdqu(vtmp1, src2);
2228     }
2229     phaddw(vtmp1, src2);
2230   } else {
2231     pshufd(vtmp1, src2, 0xE);
2232     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2233   }
2234   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2235 }
2236 
2237 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2238   if (opcode == Op_AddReductionVI) {
2239     int vector_len = Assembler::AVX_256bit;
2240     vphaddw(vtmp2, src2, src2, vector_len);
2241     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2242   } else {
2243     vextracti128_high(vtmp2, src2);
2244     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2245   }
2246   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2247 }
2248 
2249 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2250   int vector_len = Assembler::AVX_256bit;
2251   vextracti64x4_high(vtmp1, src2);
2252   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2253   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2254 }
2255 
2256 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2257   pshufd(vtmp2, src2, 0xE);
2258   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2259   movdq(vtmp1, src1);
2260   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2261   movdq(dst, vtmp1);
2262 }
2263 
2264 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2265   vextracti128_high(vtmp1, src2);
2266   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2267   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2268 }
2269 
2270 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2271   vextracti64x4_high(vtmp2, src2);
2272   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2273   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2274 }
2275 
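// Build an opmask with the low 'len' bits set: start from all ones and clear the
// bits at and above index 'len' with bzhi.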
2276 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2277   mov64(temp, -1L);
2278   bzhiq(temp, temp, len);
2279   kmovql(dst, temp);
2280 }
2281 
2282 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2283   reduce_operation_128(T_FLOAT, opcode, dst, src);
2284   pshufd(vtmp, src, 0x1);
2285   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2286 }
2287 
2288 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2289   reduce2F(opcode, dst, src, vtmp);
2290   pshufd(vtmp, src, 0x2);
2291   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2292   pshufd(vtmp, src, 0x3);
2293   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2294 }
2295 
2296 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2297   reduce4F(opcode, dst, src, vtmp2);
2298   vextractf128_high(vtmp2, src);
2299   reduce4F(opcode, dst, vtmp2, vtmp1);
2300 }
2301 
2302 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2303   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2304   vextracti64x4_high(vtmp1, src);
2305   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2306 }
2307 
2308 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2309   pshufd(dst, src, 0x1);
2310   reduce_operation_128(T_FLOAT, opcode, dst, src);
2311 }
2312 
2313 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2314   pshufd(vtmp, src, 0xE);
2315   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2316   unorderedReduce2F(opcode, dst, vtmp);
2317 }
2318 
2319 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   vextractf128_high(vtmp1, src);
2321   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2322   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2323 }
2324 
2325 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2326   vextractf64x4_high(vtmp2, src);
2327   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2328   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2329 }
2330 
2331 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2332   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2333   pshufd(vtmp, src, 0xE);
2334   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2335 }
2336 
2337 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2338   reduce2D(opcode, dst, src, vtmp2);
2339   vextractf128_high(vtmp2, src);
2340   reduce2D(opcode, dst, vtmp2, vtmp1);
2341 }
2342 
2343 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2344   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2345   vextracti64x4_high(vtmp1, src);
2346   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2347 }
2348 
2349 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2350   pshufd(dst, src, 0xE);
2351   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2352 }
2353 
2354 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2355   vextractf128_high(vtmp, src);
2356   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2357   unorderedReduce2D(opcode, dst, vtmp);
2358 }
2359 
2360 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2361   vextractf64x4_high(vtmp2, src);
2362   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2363   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2364 }
2365 
2366 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2367   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2368 }
2369 
2370 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2371   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2372 }
2373 
2374 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2375   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2376 }
2377 
2378 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2379                                  int vec_enc) {
2380   switch(elem_bt) {
2381     case T_INT:
2382     case T_FLOAT:
2383       vmaskmovps(dst, src, mask, vec_enc);
2384       break;
2385     case T_LONG:
2386     case T_DOUBLE:
2387       vmaskmovpd(dst, src, mask, vec_enc);
2388       break;
2389     default:
2390       fatal("Unsupported type %s", type2name(elem_bt));
2391       break;
2392   }
2393 }
2394 
2395 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2396                                  int vec_enc) {
2397   switch(elem_bt) {
2398     case T_INT:
2399     case T_FLOAT:
2400       vmaskmovps(dst, src, mask, vec_enc);
2401       break;
2402     case T_LONG:
2403     case T_DOUBLE:
2404       vmaskmovpd(dst, src, mask, vec_enc);
2405       break;
2406     default:
2407       fatal("Unsupported type %s", type2name(elem_bt));
2408       break;
2409   }
2410 }
2411 
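// Float min/max reduction: repeatedly halve the live width of the vector, extracting the
// upper 256/128-bit half (or permuting within a 128-bit lane) and combining it with the
// lower half via vminmax_fp, until a single lane remains. If dst already holds a valid
// partial result (is_dst_valid), it is folded in with one final vminmax_fp step.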
2412 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2413                                           XMMRegister dst, XMMRegister src,
2414                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2415                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2416   const int permconst[] = {1, 14};
2417   XMMRegister wsrc = src;
2418   XMMRegister wdst = xmm_0;
2419   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2420 
2421   int vlen_enc = Assembler::AVX_128bit;
2422   if (vlen == 16) {
2423     vlen_enc = Assembler::AVX_256bit;
2424   }
2425 
2426   for (int i = log2(vlen) - 1; i >=0; i--) {
2427     if (i == 0 && !is_dst_valid) {
2428       wdst = dst;
2429     }
2430     if (i == 3) {
2431       vextracti64x4_high(wtmp, wsrc);
2432     } else if (i == 2) {
2433       vextracti128_high(wtmp, wsrc);
2434     } else { // i = [0,1]
2435       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2436     }
2437     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2438     wsrc = wdst;
2439     vlen_enc = Assembler::AVX_128bit;
2440   }
2441   if (is_dst_valid) {
2442     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2443   }
2444 }
2445 
2446 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2447                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2448                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2449   XMMRegister wsrc = src;
2450   XMMRegister wdst = xmm_0;
2451   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2452   int vlen_enc = Assembler::AVX_128bit;
2453   if (vlen == 8) {
2454     vlen_enc = Assembler::AVX_256bit;
2455   }
2456   for (int i = log2(vlen) - 1; i >=0; i--) {
2457     if (i == 0 && !is_dst_valid) {
2458       wdst = dst;
2459     }
2460     if (i == 1) {
2461       vextracti128_high(wtmp, wsrc);
2462     } else if (i == 2) {
2463       vextracti64x4_high(wtmp, wsrc);
2464     } else {
2465       assert(i == 0, "%d", i);
2466       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2467     }
2468     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2469     wsrc = wdst;
2470     vlen_enc = Assembler::AVX_128bit;
2471   }
2472   if (is_dst_valid) {
2473     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2474   }
2475 }
2476 
2477 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2478   switch (bt) {
2479     case T_BYTE:  pextrb(dst, src, idx); break;
2480     case T_SHORT: pextrw(dst, src, idx); break;
2481     case T_INT:   pextrd(dst, src, idx); break;
2482     case T_LONG:  pextrq(dst, src, idx); break;
2483 
2484     default:
2485       assert(false,"Should not reach here.");
2486       break;
2487   }
2488 }
2489 
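// Return the XMM register holding the 128-bit lane that contains 'elemindex': lane 0 is
// returned as src itself, while higher lanes are first extracted into dst.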
2490 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2491   int esize =  type2aelembytes(typ);
2492   int elem_per_lane = 16/esize;
2493   int lane = elemindex / elem_per_lane;
2494   int eindex = elemindex % elem_per_lane;
2495 
2496   if (lane >= 2) {
2497     assert(UseAVX > 2, "required");
2498     vextractf32x4(dst, src, lane & 3);
2499     return dst;
2500   } else if (lane > 0) {
2501     assert(UseAVX > 0, "required");
2502     vextractf128(dst, src, lane);
2503     return dst;
2504   } else {
2505     return src;
2506   }
2507 }
2508 
2509 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2510   if (typ == T_BYTE) {
2511     movsbl(dst, dst);
2512   } else if (typ == T_SHORT) {
2513     movswl(dst, dst);
2514   }
2515 }
2516 
2517 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2518   int esize =  type2aelembytes(typ);
2519   int elem_per_lane = 16/esize;
2520   int eindex = elemindex % elem_per_lane;
2521   assert(is_integral_type(typ),"required");
2522 
2523   if (eindex == 0) {
2524     if (typ == T_LONG) {
2525       movq(dst, src);
2526     } else {
2527       movdl(dst, src);
2528       movsxl(typ, dst);
2529     }
2530   } else {
2531     extract(typ, dst, src, eindex);
2532     movsxl(typ, dst);
2533   }
2534 }
2535 
2536 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2537   int esize =  type2aelembytes(typ);
2538   int elem_per_lane = 16/esize;
2539   int eindex = elemindex % elem_per_lane;
2540   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2541 
2542   if (eindex == 0) {
2543     movq(dst, src);
2544   } else {
2545     if (typ == T_FLOAT) {
2546       if (UseAVX == 0) {
2547         movdqu(dst, src);
2548         shufps(dst, dst, eindex);
2549       } else {
2550         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2551       }
2552     } else {
2553       if (UseAVX == 0) {
2554         movdqu(dst, src);
2555         psrldq(dst, eindex*esize);
2556       } else {
2557         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2558       }
2559       movq(dst, dst);
2560     }
2561   }
2562   // Zero upper bits
2563   if (typ == T_FLOAT) {
2564     if (UseAVX == 0) {
2565       assert(vtmp != xnoreg, "required.");
2566       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2567       pand(dst, vtmp);
2568     } else {
2569       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2570     }
2571   }
2572 }
2573 
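// AVX-512 masked compare dispatched on element type. Note that T_FLOAT and T_DOUBLE lanes
// are compared with the integer compare of matching width, i.e. on their raw bit patterns.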
2574 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2575   switch(typ) {
2576     case T_BYTE:
2577     case T_BOOLEAN:
2578       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2579       break;
2580     case T_SHORT:
2581     case T_CHAR:
2582       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2583       break;
2584     case T_INT:
2585     case T_FLOAT:
2586       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2587       break;
2588     case T_LONG:
2589     case T_DOUBLE:
2590       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2591       break;
2592     default:
2593       assert(false,"Should not reach here.");
2594       break;
2595   }
2596 }
2597 
2598 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2599   assert(rscratch != noreg || always_reachable(src2), "missing");
2600 
2601   switch(typ) {
2602     case T_BOOLEAN:
2603     case T_BYTE:
2604       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2605       break;
2606     case T_CHAR:
2607     case T_SHORT:
2608       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2609       break;
2610     case T_INT:
2611     case T_FLOAT:
2612       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2613       break;
2614     case T_LONG:
2615     case T_DOUBLE:
2616       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2617       break;
2618     default:
2619       assert(false,"Should not reach here.");
2620       break;
2621   }
2622 }
2623 
2624 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2625   switch(typ) {
2626     case T_BYTE:
2627       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2628       break;
2629     case T_SHORT:
2630       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2631       break;
2632     case T_INT:
2633     case T_FLOAT:
2634       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2635       break;
2636     case T_LONG:
2637     case T_DOUBLE:
2638       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2639       break;
2640     default:
2641       assert(false,"Should not reach here.");
2642       break;
2643   }
2644 }
2645 
2646 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2647   assert(vlen_in_bytes <= 32, "");
2648   int esize = type2aelembytes(bt);
2649   if (vlen_in_bytes == 32) {
2650     assert(vtmp == xnoreg, "required.");
2651     if (esize >= 4) {
2652       vtestps(src1, src2, AVX_256bit);
2653     } else {
2654       vptest(src1, src2, AVX_256bit);
2655     }
2656     return;
2657   }
2658   if (vlen_in_bytes < 16) {
2659     // Duplicate the lower part to fill the whole register;
2660     // there is no need to do so for src2.
2661     assert(vtmp != xnoreg, "required");
2662     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2663     pshufd(vtmp, src1, shuffle_imm);
2664   } else {
2665     assert(vtmp == xnoreg, "required");
2666     vtmp = src1;
2667   }
2668   if (esize >= 4 && VM_Version::supports_avx()) {
2669     vtestps(vtmp, src2, AVX_128bit);
2670   } else {
2671     ptest(vtmp, src2);
2672   }
2673 }
2674 
2675 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2676 #ifdef ASSERT
2677   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2678   bool is_bw_supported = VM_Version::supports_avx512bw();
2679   if (is_bw && !is_bw_supported) {
2680     assert(vlen_enc != Assembler::AVX_512bit, "required");
2681     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2682            "XMM register should be 0-15");
2683   }
2684 #endif // ASSERT
2685   switch (elem_bt) {
2686     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2687     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2688     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2689     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2690     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2691     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2692     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2693   }
2694 }
2695 
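// Broadcast a general purpose register value to every lane of dst. When the EVEX GPR-source
// broadcasts are usable (AVX-512, plus BW/VL where the element type and vector length require
// them), they are emitted directly; otherwise the value is first moved into dst via
// movdl/movdq and then broadcast with the AVX2 register-source forms.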
2696 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2697   assert(UseAVX >= 2, "required");
2698   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2699   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2700   if ((UseAVX > 2) &&
2701       (!is_bw || VM_Version::supports_avx512bw()) &&
2702       (!is_vl || VM_Version::supports_avx512vl())) {
2703     switch (elem_bt) {
2704       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2705       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2706       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2707       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2708       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2709     }
2710   } else {
2711     assert(vlen_enc != Assembler::AVX_512bit, "required");
2712     assert((dst->encoding() < 16),"XMM register should be 0-15");
2713     switch (elem_bt) {
2714       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2715       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2716       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2717       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2718       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2719       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2720       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2721     }
2722   }
2723 }
2724 
2725 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
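       // Sign-extends byte elements of src to the requested element type. For T_FLOAT
       // and T_DOUBLE the bytes are first widened to ints and then converted with
       // vcvtdq2ps/vcvtdq2pd.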
2726   switch (to_elem_bt) {
2727     case T_SHORT:
2728       vpmovsxbw(dst, src, vlen_enc);
2729       break;
2730     case T_INT:
2731       vpmovsxbd(dst, src, vlen_enc);
2732       break;
2733     case T_FLOAT:
2734       vpmovsxbd(dst, src, vlen_enc);
2735       vcvtdq2ps(dst, dst, vlen_enc);
2736       break;
2737     case T_LONG:
2738       vpmovsxbq(dst, src, vlen_enc);
2739       break;
2740     case T_DOUBLE: {
2741       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2742       vpmovsxbd(dst, src, mid_vlen_enc);
2743       vcvtdq2pd(dst, dst, vlen_enc);
2744       break;
2745     }
2746     default:
2747       fatal("Unsupported type %s", type2name(to_elem_bt));
2748       break;
2749   }
2750 }
2751 
2752 //-------------------------------------------------------------------------------------------
2753 
2754 // IndexOf for constant substrings with size >= 8 chars
2755 // which don't need to be loaded through the stack.
2756 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2757                                          Register cnt1, Register cnt2,
2758                                          int int_cnt2,  Register result,
2759                                          XMMRegister vec, Register tmp,
2760                                          int ae) {
2761   ShortBranchVerifier sbv(this);
2762   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2763   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2764 
2765   // This method uses the pcmpestri instruction with bound registers
2766   //   inputs:
2767   //     xmm - substring
2768   //     rax - substring length (elements count)
2769   //     mem - scanned string
2770   //     rdx - string length (elements count)
2771   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2772   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2773   //   outputs:
2774   //     rcx - matched index in string
2775   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2776   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2777   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2778   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2779   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
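       // pcmpestri runs here in "equal ordered" (substring search) mode: CF is set when
       // the 16-byte chunk contains a candidate match (rcx = its offset within the chunk),
       // and OF is set when the match starts at offset 0, i.e. the chunk begins with the
       // loaded part of the substring.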
2780 
2781   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2782         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2783         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2784 
2785   // Note, inline_string_indexOf() generates checks:
2786   // if (substr.count > string.count) return -1;
2787   // if (substr.count == 0) return 0;
2788   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2789 
2790   // Load substring.
2791   if (ae == StrIntrinsicNode::UL) {
2792     pmovzxbw(vec, Address(str2, 0));
2793   } else {
2794     movdqu(vec, Address(str2, 0));
2795   }
2796   movl(cnt2, int_cnt2);
2797   movptr(result, str1); // string addr
2798 
2799   if (int_cnt2 > stride) {
2800     jmpb(SCAN_TO_SUBSTR);
2801 
2802     // Reload substr for rescan; this code
2803     // is executed only for large substrings (> 8 chars).
2804     bind(RELOAD_SUBSTR);
2805     if (ae == StrIntrinsicNode::UL) {
2806       pmovzxbw(vec, Address(str2, 0));
2807     } else {
2808       movdqu(vec, Address(str2, 0));
2809     }
2810     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2811 
2812     bind(RELOAD_STR);
2813     // We came here after the beginning of the substring was
2814     // matched but the rest of it was not, so we need to search
2815     // again. Start from the next element after the previous match.
2816 
2817     // cnt2 is the number of remaining substring elements and
2818     // cnt1 is the number of remaining string elements when the compare failed.
2819     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2820     subl(cnt1, cnt2);
2821     addl(cnt1, int_cnt2);
2822     movl(cnt2, int_cnt2); // Now restore cnt2
2823 
2824     decrementl(cnt1);     // Shift to next element
2825     cmpl(cnt1, cnt2);
2826     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2827 
2828     addptr(result, (1<<scale1));
2829 
2830   } // (int_cnt2 > 8)
2831 
2832   // Scan string for start of substr in 16-byte vectors
2833   bind(SCAN_TO_SUBSTR);
2834   pcmpestri(vec, Address(result, 0), mode);
2835   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2836   subl(cnt1, stride);
2837   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2838   cmpl(cnt1, cnt2);
2839   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2840   addptr(result, 16);
2841   jmpb(SCAN_TO_SUBSTR);
2842 
2843   // Found a potential substr
2844   bind(FOUND_CANDIDATE);
2845   // Matched whole vector if first element matched (tmp(rcx) == 0).
2846   if (int_cnt2 == stride) {
2847     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2848   } else { // int_cnt2 > 8
2849     jccb(Assembler::overflow, FOUND_SUBSTR);
2850   }
2851   // After pcmpestri tmp(rcx) contains matched element index
2852   // Compute start addr of substr
2853   lea(result, Address(result, tmp, scale1));
2854 
2855   // Make sure string is still long enough
2856   subl(cnt1, tmp);
2857   cmpl(cnt1, cnt2);
2858   if (int_cnt2 == stride) {
2859     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2860   } else { // int_cnt2 > 8
2861     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2862   }
2863   // Fewer elements left than the substring.
2864 
2865   bind(RET_NOT_FOUND);
2866   movl(result, -1);
2867   jmp(EXIT);
2868 
2869   if (int_cnt2 > stride) {
2870     // This code is optimized for the case when whole substring
2871     // is matched if its head is matched.
2872     bind(MATCH_SUBSTR_HEAD);
2873     pcmpestri(vec, Address(result, 0), mode);
2874     // Reload only the string if it does not match
2875     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2876 
2877     Label CONT_SCAN_SUBSTR;
2878     // Compare the rest of substring (> 8 chars).
2879     bind(FOUND_SUBSTR);
2880     // First 8 chars are already matched.
2881     negptr(cnt2);
2882     addptr(cnt2, stride);
2883 
2884     bind(SCAN_SUBSTR);
2885     subl(cnt1, stride);
2886     cmpl(cnt2, -stride); // Do not read beyond substring
2887     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2888     // Back-up strings to avoid reading beyond substring:
2889     // cnt1 = cnt1 - cnt2 + 8
2890     addl(cnt1, cnt2); // cnt2 is negative
2891     addl(cnt1, stride);
2892     movl(cnt2, stride); negptr(cnt2);
2893     bind(CONT_SCAN_SUBSTR);
2894     if (int_cnt2 < (int)G) {
2895       int tail_off1 = int_cnt2<<scale1;
2896       int tail_off2 = int_cnt2<<scale2;
2897       if (ae == StrIntrinsicNode::UL) {
2898         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2899       } else {
2900         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2901       }
2902       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2903     } else {
2904       // calculate index in register to avoid integer overflow (int_cnt2*2)
2905       movl(tmp, int_cnt2);
2906       addptr(tmp, cnt2);
2907       if (ae == StrIntrinsicNode::UL) {
2908         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2909       } else {
2910         movdqu(vec, Address(str2, tmp, scale2, 0));
2911       }
2912       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2913     }
2914     // Need to reload the string pointers if the whole vector did not match
2915     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2916     addptr(cnt2, stride);
2917     jcc(Assembler::negative, SCAN_SUBSTR);
2918     // Fall through if found full substring
2919 
2920   } // (int_cnt2 > 8)
2921 
2922   bind(RET_FOUND);
2923   // Found result if we matched full small substring.
2924   // Compute substr offset
2925   subptr(result, str1);
2926   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2927     shrl(result, 1); // index
2928   }
2929   bind(EXIT);
2930 
2931 } // string_indexofC8
2932 
2933 // Small strings are loaded through the stack if they cross a page boundary.
2934 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2935                                        Register cnt1, Register cnt2,
2936                                        int int_cnt2,  Register result,
2937                                        XMMRegister vec, Register tmp,
2938                                        int ae) {
2939   ShortBranchVerifier sbv(this);
2940   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2941   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2942 
2943   //
2944   // int_cnt2 is the length of a small (< 8 chars) constant substring
2945   // or (-1) for a non-constant substring, in which case its length
2946   // is in the cnt2 register.
2947   //
2948   // Note, inline_string_indexOf() generates checks:
2949   // if (substr.count > string.count) return -1;
2950   // if (substr.count == 0) return 0;
2951   //
2952   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2953   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2954   // This method uses the pcmpestri instruction with bound registers
2955   //   inputs:
2956   //     xmm - substring
2957   //     rax - substring length (elements count)
2958   //     mem - scanned string
2959   //     rdx - string length (elements count)
2960   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2961   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2962   //   outputs:
2963   //     rcx - matched index in string
2964   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2965   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2966   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2967   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2968 
2969   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2970         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2971         FOUND_CANDIDATE;
2972 
2973   { //========================================================
2974     // We don't know where these strings are located
2975     // and we can't read beyond them. Load them through stack.
2976     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
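         // Strings shorter than one 16-byte vector that lie too close to the end of a page
         // for a full-width load are copied onto the stack below; a small constant substring
         // is instead assembled with narrower loads and shifts.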
2977 
2978     movptr(tmp, rsp); // save old SP
2979 
2980     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2981       if (int_cnt2 == (1>>scale2)) { // One byte
2982         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2983         load_unsigned_byte(result, Address(str2, 0));
2984         movdl(vec, result); // move 32 bits
2985       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2986         // Not enough header space in 32-bit VM: 12+3 = 15.
2987         movl(result, Address(str2, -1));
2988         shrl(result, 8);
2989         movdl(vec, result); // move 32 bits
2990       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2991         load_unsigned_short(result, Address(str2, 0));
2992         movdl(vec, result); // move 32 bits
2993       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2994         movdl(vec, Address(str2, 0)); // move 32 bits
2995       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2996         movq(vec, Address(str2, 0));  // move 64 bits
2997       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2998         // Array header size is 12 bytes in 32-bit VM
2999         // + 6 bytes for 3 chars == 18 bytes,
3000         // enough space to load vec and shift.
3001         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3002         if (ae == StrIntrinsicNode::UL) {
3003           int tail_off = int_cnt2-8;
3004           pmovzxbw(vec, Address(str2, tail_off));
3005           psrldq(vec, -2*tail_off);
3006         }
3007         else {
3008           int tail_off = int_cnt2*(1<<scale2);
3009           movdqu(vec, Address(str2, tail_off-16));
3010           psrldq(vec, 16-tail_off);
3011         }
3012       }
3013     } else { // not constant substring
3014       cmpl(cnt2, stride);
3015       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3016 
3017       // We can read beyond the string if str2+16 does not cross a page boundary,
3018       // since heaps are aligned and mapped by pages.
3019       assert(os::vm_page_size() < (int)G, "default page should be small");
3020       movl(result, str2); // We need only low 32 bits
3021       andl(result, ((int)os::vm_page_size()-1));
3022       cmpl(result, ((int)os::vm_page_size()-16));
3023       jccb(Assembler::belowEqual, CHECK_STR);
3024 
3025       // Move small strings to the stack to allow loading 16 bytes into vec.
3026       subptr(rsp, 16);
3027       int stk_offset = wordSize-(1<<scale2);
3028       push(cnt2);
3029 
3030       bind(COPY_SUBSTR);
3031       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3032         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3033         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3034       } else if (ae == StrIntrinsicNode::UU) {
3035         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3036         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3037       }
3038       decrement(cnt2);
3039       jccb(Assembler::notZero, COPY_SUBSTR);
3040 
3041       pop(cnt2);
3042       movptr(str2, rsp);  // New substring address
3043     } // non constant
3044 
3045     bind(CHECK_STR);
3046     cmpl(cnt1, stride);
3047     jccb(Assembler::aboveEqual, BIG_STRINGS);
3048 
3049     // Check cross page boundary.
3050     movl(result, str1); // We need only low 32 bits
3051     andl(result, ((int)os::vm_page_size()-1));
3052     cmpl(result, ((int)os::vm_page_size()-16));
3053     jccb(Assembler::belowEqual, BIG_STRINGS);
3054 
3055     subptr(rsp, 16);
3056     int stk_offset = -(1<<scale1);
3057     if (int_cnt2 < 0) { // not constant
3058       push(cnt2);
3059       stk_offset += wordSize;
3060     }
3061     movl(cnt2, cnt1);
3062 
3063     bind(COPY_STR);
3064     if (ae == StrIntrinsicNode::LL) {
3065       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3066       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3067     } else {
3068       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3069       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3070     }
3071     decrement(cnt2);
3072     jccb(Assembler::notZero, COPY_STR);
3073 
3074     if (int_cnt2 < 0) { // not constant
3075       pop(cnt2);
3076     }
3077     movptr(str1, rsp);  // New string address
3078 
3079     bind(BIG_STRINGS);
3080     // Load substring.
3081     if (int_cnt2 < 0) { // -1
3082       if (ae == StrIntrinsicNode::UL) {
3083         pmovzxbw(vec, Address(str2, 0));
3084       } else {
3085         movdqu(vec, Address(str2, 0));
3086       }
3087       push(cnt2);       // substr count
3088       push(str2);       // substr addr
3089       push(str1);       // string addr
3090     } else {
3091       // Small (< 8 chars) constant substrings are loaded already.
3092       movl(cnt2, int_cnt2);
3093     }
3094     push(tmp);  // original SP
3095 
3096   } // Finished loading
3097 
3098   //========================================================
3099   // Start search
3100   //
3101 
3102   movptr(result, str1); // string addr
3103 
3104   if (int_cnt2  < 0) {  // Only for non constant substring
3105     jmpb(SCAN_TO_SUBSTR);
3106 
3107     // SP saved at sp+0
3108     // String saved at sp+1*wordSize
3109     // Substr saved at sp+2*wordSize
3110     // Substr count saved at sp+3*wordSize
3111 
3112     // Reload substr for rescan; this code
3113     // is executed only for large substrings (> 8 chars).
3114     bind(RELOAD_SUBSTR);
3115     movptr(str2, Address(rsp, 2*wordSize));
3116     movl(cnt2, Address(rsp, 3*wordSize));
3117     if (ae == StrIntrinsicNode::UL) {
3118       pmovzxbw(vec, Address(str2, 0));
3119     } else {
3120       movdqu(vec, Address(str2, 0));
3121     }
3122     // We came here after the beginning of the substring was
3123     // matched but the rest of it was not, so we need to search
3124     // again. Start from the next element after the previous match.
3125     subptr(str1, result); // Restore counter
3126     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3127       shrl(str1, 1);
3128     }
3129     addl(cnt1, str1);
3130     decrementl(cnt1);   // Shift to next element
3131     cmpl(cnt1, cnt2);
3132     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3133 
3134     addptr(result, (1<<scale1));
3135   } // non constant
3136 
3137   // Scan string for start of substr in 16-byte vectors
3138   bind(SCAN_TO_SUBSTR);
3139   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3140   pcmpestri(vec, Address(result, 0), mode);
3141   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3142   subl(cnt1, stride);
3143   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3144   cmpl(cnt1, cnt2);
3145   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3146   addptr(result, 16);
3147 
3148   bind(ADJUST_STR);
3149   cmpl(cnt1, stride); // Do not read beyond string
3150   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3151   // Back-up string to avoid reading beyond string.
3152   lea(result, Address(result, cnt1, scale1, -16));
3153   movl(cnt1, stride);
3154   jmpb(SCAN_TO_SUBSTR);
3155 
3156   // Found a potential substr
3157   bind(FOUND_CANDIDATE);
3158   // After pcmpestri tmp(rcx) contains matched element index
3159 
3160   // Make sure string is still long enough
3161   subl(cnt1, tmp);
3162   cmpl(cnt1, cnt2);
3163   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3164   // Fewer elements left than the substring.
3165 
3166   bind(RET_NOT_FOUND);
3167   movl(result, -1);
3168   jmp(CLEANUP);
3169 
3170   bind(FOUND_SUBSTR);
3171   // Compute start addr of substr
3172   lea(result, Address(result, tmp, scale1));
3173   if (int_cnt2 > 0) { // Constant substring
3174     // Repeat search for small substring (< 8 chars)
3175     // from new point without reloading substring.
3176     // Have to check that we don't read beyond string.
3177     cmpl(tmp, stride-int_cnt2);
3178     jccb(Assembler::greater, ADJUST_STR);
3179     // Fall through if matched whole substring.
3180   } else { // non constant
3181     assert(int_cnt2 == -1, "should be != 0");
3182 
3183     addl(tmp, cnt2);
3184     // Found result if we matched whole substring.
3185     cmpl(tmp, stride);
3186     jcc(Assembler::lessEqual, RET_FOUND);
3187 
3188     // Repeat search for small substring (<= 8 chars)
3189     // from new point 'str1' without reloading substring.
3190     cmpl(cnt2, stride);
3191     // Have to check that we don't read beyond string.
3192     jccb(Assembler::lessEqual, ADJUST_STR);
3193 
3194     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3195     // Compare the rest of substring (> 8 chars).
3196     movptr(str1, result);
3197 
3198     cmpl(tmp, cnt2);
3199     // First 8 chars are already matched.
3200     jccb(Assembler::equal, CHECK_NEXT);
3201 
3202     bind(SCAN_SUBSTR);
3203     pcmpestri(vec, Address(str1, 0), mode);
3204     // Need to reload the string pointers if the whole vector did not match
3205     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3206 
3207     bind(CHECK_NEXT);
3208     subl(cnt2, stride);
3209     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3210     addptr(str1, 16);
3211     if (ae == StrIntrinsicNode::UL) {
3212       addptr(str2, 8);
3213     } else {
3214       addptr(str2, 16);
3215     }
3216     subl(cnt1, stride);
3217     cmpl(cnt2, stride); // Do not read beyond substring
3218     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3219     // Back-up strings to avoid reading beyond substring.
3220 
3221     if (ae == StrIntrinsicNode::UL) {
3222       lea(str2, Address(str2, cnt2, scale2, -8));
3223       lea(str1, Address(str1, cnt2, scale1, -16));
3224     } else {
3225       lea(str2, Address(str2, cnt2, scale2, -16));
3226       lea(str1, Address(str1, cnt2, scale1, -16));
3227     }
3228     subl(cnt1, cnt2);
3229     movl(cnt2, stride);
3230     addl(cnt1, stride);
3231     bind(CONT_SCAN_SUBSTR);
3232     if (ae == StrIntrinsicNode::UL) {
3233       pmovzxbw(vec, Address(str2, 0));
3234     } else {
3235       movdqu(vec, Address(str2, 0));
3236     }
3237     jmp(SCAN_SUBSTR);
3238 
3239     bind(RET_FOUND_LONG);
3240     movptr(str1, Address(rsp, wordSize));
3241   } // non constant
3242 
3243   bind(RET_FOUND);
3244   // Compute substr offset
3245   subptr(result, str1);
3246   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3247     shrl(result, 1); // index
3248   }
3249   bind(CLEANUP);
3250   pop(rsp); // restore SP
3251 
3252 } // string_indexof
3253 
3254 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3255                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3256   ShortBranchVerifier sbv(this);
3257   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3258 
3259   int stride = 8;
3260 
3261   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3262         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3263         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3264         FOUND_SEQ_CHAR, DONE_LABEL;
3265 
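       // Strategy: with AVX2 and at least 16 remaining chars, scan 16 chars (32 bytes)
       // per iteration with vpcmpeqw/vptest, then 8 chars per iteration with the SSE
       // pcmpeqw path, and finish with a scalar loop over the remaining tail.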
3266   movptr(result, str1);
3267   if (UseAVX >= 2) {
3268     cmpl(cnt1, stride);
3269     jcc(Assembler::less, SCAN_TO_CHAR);
3270     cmpl(cnt1, 2*stride);
3271     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3272     movdl(vec1, ch);
3273     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3274     vpxor(vec2, vec2);
3275     movl(tmp, cnt1);
3276     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3277     andl(cnt1,0x0000000F);  //tail count (in chars)
3278 
3279     bind(SCAN_TO_16_CHAR_LOOP);
3280     vmovdqu(vec3, Address(result, 0));
3281     vpcmpeqw(vec3, vec3, vec1, 1);
3282     vptest(vec2, vec3);
3283     jcc(Assembler::carryClear, FOUND_CHAR);
3284     addptr(result, 32);
3285     subl(tmp, 2*stride);
3286     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3287     jmp(SCAN_TO_8_CHAR);
3288     bind(SCAN_TO_8_CHAR_INIT);
3289     movdl(vec1, ch);
3290     pshuflw(vec1, vec1, 0x00);
3291     pshufd(vec1, vec1, 0);
3292     pxor(vec2, vec2);
3293   }
3294   bind(SCAN_TO_8_CHAR);
3295   cmpl(cnt1, stride);
3296   jcc(Assembler::less, SCAN_TO_CHAR);
3297   if (UseAVX < 2) {
3298     movdl(vec1, ch);
3299     pshuflw(vec1, vec1, 0x00);
3300     pshufd(vec1, vec1, 0);
3301     pxor(vec2, vec2);
3302   }
3303   movl(tmp, cnt1);
3304   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3305   andl(cnt1,0x00000007);  //tail count (in chars)
3306 
3307   bind(SCAN_TO_8_CHAR_LOOP);
3308   movdqu(vec3, Address(result, 0));
3309   pcmpeqw(vec3, vec1);
3310   ptest(vec2, vec3);
3311   jcc(Assembler::carryClear, FOUND_CHAR);
3312   addptr(result, 16);
3313   subl(tmp, stride);
3314   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3315   bind(SCAN_TO_CHAR);
3316   testl(cnt1, cnt1);
3317   jcc(Assembler::zero, RET_NOT_FOUND);
3318   bind(SCAN_TO_CHAR_LOOP);
3319   load_unsigned_short(tmp, Address(result, 0));
3320   cmpl(ch, tmp);
3321   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3322   addptr(result, 2);
3323   subl(cnt1, 1);
3324   jccb(Assembler::zero, RET_NOT_FOUND);
3325   jmp(SCAN_TO_CHAR_LOOP);
3326 
3327   bind(RET_NOT_FOUND);
3328   movl(result, -1);
3329   jmpb(DONE_LABEL);
3330 
3331   bind(FOUND_CHAR);
3332   if (UseAVX >= 2) {
3333     vpmovmskb(tmp, vec3);
3334   } else {
3335     pmovmskb(tmp, vec3);
3336   }
3337   bsfl(ch, tmp);
3338   addptr(result, ch);
3339 
3340   bind(FOUND_SEQ_CHAR);
3341   subptr(result, str1);
3342   shrl(result, 1);
3343 
3344   bind(DONE_LABEL);
3345 } // string_indexof_char
3346 
3347 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3348                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3349   ShortBranchVerifier sbv(this);
3350   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3351 
3352   int stride = 16;
3353 
3354   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3355         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3356         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3357         FOUND_SEQ_CHAR, DONE_LABEL;
3358 
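       // Latin-1 variant of the search above: with AVX2 scan 32 bytes per iteration with
       // vpcmpeqb/vptest, then 16 bytes per iteration with pcmpeqb, and finish with a
       // scalar byte loop over the tail.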
3359   movptr(result, str1);
3360   if (UseAVX >= 2) {
3361     cmpl(cnt1, stride);
3362     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3363     cmpl(cnt1, stride*2);
3364     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3365     movdl(vec1, ch);
3366     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3367     vpxor(vec2, vec2);
3368     movl(tmp, cnt1);
3369     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3370     andl(cnt1,0x0000001F);  //tail count (in chars)
3371 
3372     bind(SCAN_TO_32_CHAR_LOOP);
3373     vmovdqu(vec3, Address(result, 0));
3374     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3375     vptest(vec2, vec3);
3376     jcc(Assembler::carryClear, FOUND_CHAR);
3377     addptr(result, 32);
3378     subl(tmp, stride*2);
3379     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3380     jmp(SCAN_TO_16_CHAR);
3381 
3382     bind(SCAN_TO_16_CHAR_INIT);
3383     movdl(vec1, ch);
3384     pxor(vec2, vec2);
3385     pshufb(vec1, vec2);
3386   }
3387 
3388   bind(SCAN_TO_16_CHAR);
3389   cmpl(cnt1, stride);
3390   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3391   if (UseAVX < 2) {
3392     movdl(vec1, ch);
3393     pxor(vec2, vec2);
3394     pshufb(vec1, vec2);
3395   }
3396   movl(tmp, cnt1);
3397   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3398   andl(cnt1,0x0000000F);  //tail count (in bytes)
3399 
3400   bind(SCAN_TO_16_CHAR_LOOP);
3401   movdqu(vec3, Address(result, 0));
3402   pcmpeqb(vec3, vec1);
3403   ptest(vec2, vec3);
3404   jcc(Assembler::carryClear, FOUND_CHAR);
3405   addptr(result, 16);
3406   subl(tmp, stride);
3407   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3408 
3409   bind(SCAN_TO_CHAR_INIT);
3410   testl(cnt1, cnt1);
3411   jcc(Assembler::zero, RET_NOT_FOUND);
3412   bind(SCAN_TO_CHAR_LOOP);
3413   load_unsigned_byte(tmp, Address(result, 0));
3414   cmpl(ch, tmp);
3415   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3416   addptr(result, 1);
3417   subl(cnt1, 1);
3418   jccb(Assembler::zero, RET_NOT_FOUND);
3419   jmp(SCAN_TO_CHAR_LOOP);
3420 
3421   bind(RET_NOT_FOUND);
3422   movl(result, -1);
3423   jmpb(DONE_LABEL);
3424 
3425   bind(FOUND_CHAR);
3426   if (UseAVX >= 2) {
3427     vpmovmskb(tmp, vec3);
3428   } else {
3429     pmovmskb(tmp, vec3);
3430   }
3431   bsfl(ch, tmp);
3432   addptr(result, ch);
3433 
3434   bind(FOUND_SEQ_CHAR);
3435   subptr(result, str1);
3436 
3437   bind(DONE_LABEL);
3438 } // stringL_indexof_char
3439 
3440 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3441   switch (eltype) {
3442   case T_BOOLEAN: return sizeof(jboolean);
3443   case T_BYTE:  return sizeof(jbyte);
3444   case T_SHORT: return sizeof(jshort);
3445   case T_CHAR:  return sizeof(jchar);
3446   case T_INT:   return sizeof(jint);
3447   default:
3448     ShouldNotReachHere();
3449     return -1;
3450   }
3451 }
3452 
3453 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3454   switch (eltype) {
3455   // T_BOOLEAN used as surrogate for unsigned byte
3456   case T_BOOLEAN: movzbl(dst, src);   break;
3457   case T_BYTE:    movsbl(dst, src);   break;
3458   case T_SHORT:   movswl(dst, src);   break;
3459   case T_CHAR:    movzwl(dst, src);   break;
3460   case T_INT:     movl(dst, src);     break;
3461   default:
3462     ShouldNotReachHere();
3463   }
3464 }
3465 
3466 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3467   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3468 }
3469 
3470 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3471   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3472 }
3473 
3474 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
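       // Widens the eight loaded elements in dst to 32-bit ints within a 256-bit vector:
       // zero-extending for T_BOOLEAN and T_CHAR, sign-extending for T_BYTE and T_SHORT;
       // T_INT needs no conversion.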
3475   const int vlen = Assembler::AVX_256bit;
3476   switch (eltype) {
3477   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3478   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3479   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3480   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3481   case T_INT:
3482     // do nothing
3483     break;
3484   default:
3485     ShouldNotReachHere();
3486   }
3487 }
3488 
3489 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3490                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3491                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3492                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3493                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3494                                         BasicType eltype) {
3495   ShortBranchVerifier sbv(this);
3496   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3497   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3498   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3499 
3500   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3501         SHORT_UNROLLED_LOOP_EXIT,
3502         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3503         UNROLLED_VECTOR_LOOP_BEGIN,
3504         END;
3505   switch (eltype) {
3506   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3507   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3508   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3509   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3510   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3511   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3512   }
3513 
3514   // "Renaming" the registers for readability of the code
3515   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3516                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3517                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3518 
3519   const int elsize = arrays_hashcode_elsize(eltype);
3520 
3521   /*
3522     if (cnt1 >= 2) {
3523       if (cnt1 >= 32) {
3524         UNROLLED VECTOR LOOP
3525       }
3526       UNROLLED SCALAR LOOP
3527     }
3528     SINGLE SCALAR
3529    */
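       /*
         The scalar recurrence is h = 31*h + a[i]. For one block of 32 elements this
         unrolls to
             h' = 31^32 * h + sum_{k=0}^{31} 31^(31-k) * a[k]
         which is what the vectorized loop below computes: 'result' is scaled by 31^32
         per block (vnext), the four 8-lane accumulators collect the per-lane partial
         sums, and after the loop each lane is multiplied by its 31^(31-k) coefficient
         (the powers-of-31 table is assumed to hold 31^32 down to 31^0) before being
         reduced into 'result'.
        */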
3530 
3531   cmpl(cnt1, 32);
3532   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3533 
3534   // cnt1 >= 32 && generate_vectorized_loop
3535   xorl(index, index);
3536 
3537   // vresult = IntVector.zero(I256);
3538   for (int idx = 0; idx < 4; idx++) {
3539     vpxor(vresult[idx], vresult[idx]);
3540   }
3541   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3542   Register bound = tmp2;
3543   Register next = tmp3;
3544   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3545   movl(next, Address(tmp2, 0));
3546   movdl(vnext, next);
3547   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3548 
3549   // index = 0;
3550   // bound = cnt1 & ~(32 - 1);
3551   movl(bound, cnt1);
3552   andl(bound, ~(32 - 1));
3553   // for (; index < bound; index += 32) {
3554   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3555   // result *= next;
3556   imull(result, next);
3557   // Loop fission to front-load the cost of fetching from memory; OOO execution
3558   // can then hopefully do a better job of prefetching.
3559   for (int idx = 0; idx < 4; idx++) {
3560     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3561   }
3562   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3563   for (int idx = 0; idx < 4; idx++) {
3564     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3565     arrays_hashcode_elvcast(vtmp[idx], eltype);
3566     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3567   }
3568   // index += 32;
3569   addl(index, 32);
3570   // index < bound;
3571   cmpl(index, bound);
3572   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3573   // }
3574 
3575   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3576   subl(cnt1, bound);
3577   // release bound
3578 
3579   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3580   for (int idx = 0; idx < 4; idx++) {
3581     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3582     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3583     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3584   }
3585   // result += vresult.reduceLanes(ADD);
3586   for (int idx = 0; idx < 4; idx++) {
3587     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3588   }
3589 
3590   // } else if (cnt1 < 32) {
3591 
3592   bind(SHORT_UNROLLED_BEGIN);
3593   // int i = 1;
3594   movl(index, 1);
3595   cmpl(index, cnt1);
3596   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3597 
3598   // for (; i < cnt1 ; i += 2) {
3599   bind(SHORT_UNROLLED_LOOP_BEGIN);
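       // Two elements per iteration: result = 31*31*result + 31*ary1[i-1] + ary1[i],
       // using 961 == 31*31 and computing 31*x as (x << 5) - x.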
3600   movl(tmp3, 961);
3601   imull(result, tmp3);
3602   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3603   movl(tmp3, tmp2);
3604   shll(tmp3, 5);
3605   subl(tmp3, tmp2);
3606   addl(result, tmp3);
3607   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3608   addl(result, tmp3);
3609   addl(index, 2);
3610   cmpl(index, cnt1);
3611   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3612 
3613   // }
3614   // if (i >= cnt1) {
3615   bind(SHORT_UNROLLED_LOOP_EXIT);
3616   jccb(Assembler::greater, END);
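       // One trailing element: result = 31*result + ary1[i-1],
       // with 31*result computed as (result << 5) - result.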
3617   movl(tmp2, result);
3618   shll(result, 5);
3619   subl(result, tmp2);
3620   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3621   addl(result, tmp3);
3622   // }
3623   bind(END);
3624 
3625   BLOCK_COMMENT("} // arrays_hashcode");
3626 
3627 } // arrays_hashcode
3628 
3629 // helper function for string_compare
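     // Loads the element at 'index' from each string, zero-extended to 32 bits, with the
     // width implied by the encoding: bytes for LL, chars for UU, and a byte from str1
     // with a char from str2 for the mixed encodings.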
3630 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3631                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3632                                            Address::ScaleFactor scale2, Register index, int ae) {
3633   if (ae == StrIntrinsicNode::LL) {
3634     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3635     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3636   } else if (ae == StrIntrinsicNode::UU) {
3637     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3638     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3639   } else {
3640     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3641     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3642   }
3643 }
3644 
3645 // Compare strings, used for char[] and byte[].
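     // On return 'result' is negative, zero or positive, following String.compareTo
     // semantics: the difference of the first mismatching elements, or the difference
     // of the lengths if one string is a prefix of the other.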
3646 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3647                                        Register cnt1, Register cnt2, Register result,
3648                                        XMMRegister vec1, int ae, KRegister mask) {
3649   ShortBranchVerifier sbv(this);
3650   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3651   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3652   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3653   int stride2x2 = 0x40;
3654   Address::ScaleFactor scale = Address::no_scale;
3655   Address::ScaleFactor scale1 = Address::no_scale;
3656   Address::ScaleFactor scale2 = Address::no_scale;
3657 
3658   if (ae != StrIntrinsicNode::LL) {
3659     stride2x2 = 0x20;
3660   }
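       // stride2x2 is the element count consumed per 64-byte AVX-512 compare iteration:
       // 64 Latin-1 bytes, or 32 chars when at least one string is UTF-16.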
3661 
3662   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3663     shrl(cnt2, 1);
3664   }
3665   // Compute the minimum of the string lengths and push the
3666   // difference of the string lengths onto the stack.
3667   // The minimum is computed with a conditional move.
3668   movl(result, cnt1);
3669   subl(cnt1, cnt2);
3670   push(cnt1);
3671   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3672 
3673   // Is the minimum length zero?
3674   testl(cnt2, cnt2);
3675   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3676   if (ae == StrIntrinsicNode::LL) {
3677     // Load first bytes
3678     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3679     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3680   } else if (ae == StrIntrinsicNode::UU) {
3681     // Load first characters
3682     load_unsigned_short(result, Address(str1, 0));
3683     load_unsigned_short(cnt1, Address(str2, 0));
3684   } else {
3685     load_unsigned_byte(result, Address(str1, 0));
3686     load_unsigned_short(cnt1, Address(str2, 0));
3687   }
3688   subl(result, cnt1);
3689   jcc(Assembler::notZero,  POP_LABEL);
3690 
3691   if (ae == StrIntrinsicNode::UU) {
3692     // Divide length by 2 to get number of chars
3693     shrl(cnt2, 1);
3694   }
3695   cmpl(cnt2, 1);
3696   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3697 
3698   // Check if the strings start at the same location and setup scale and stride
3699   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3700     cmpptr(str1, str2);
3701     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3702     if (ae == StrIntrinsicNode::LL) {
3703       scale = Address::times_1;
3704       stride = 16;
3705     } else {
3706       scale = Address::times_2;
3707       stride = 8;
3708     }
3709   } else {
3710     scale1 = Address::times_1;
3711     scale2 = Address::times_2;
3712     // scale not used
3713     stride = 8;
3714   }
3715 
3716   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3717     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3718     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3719     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3720     Label COMPARE_TAIL_LONG;
3721     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3722 
3723     int pcmpmask = 0x19;
3724     if (ae == StrIntrinsicNode::LL) {
3725       pcmpmask &= ~0x01;
3726     }
3727 
3728     // Set up to compare 16-char (32-byte) vectors,
3729     // starting from the first character again because it has an aligned address.
3730     if (ae == StrIntrinsicNode::LL) {
3731       stride2 = 32;
3732     } else {
3733       stride2 = 16;
3734     }
3735     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3736       adr_stride = stride << scale;
3737     } else {
3738       adr_stride1 = 8;  //stride << scale1;
3739       adr_stride2 = 16; //stride << scale2;
3740     }
3741 
3742     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3743     // rax and rdx are used by pcmpestri as elements counters
3744     movl(result, cnt2);
3745     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3746     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3747 
3748     // fast path : compare first 2 8-char vectors.
3749     bind(COMPARE_16_CHARS);
3750     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3751       movdqu(vec1, Address(str1, 0));
3752     } else {
3753       pmovzxbw(vec1, Address(str1, 0));
3754     }
3755     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3756     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3757 
3758     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3759       movdqu(vec1, Address(str1, adr_stride));
3760       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3761     } else {
3762       pmovzxbw(vec1, Address(str1, adr_stride1));
3763       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3764     }
3765     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3766     addl(cnt1, stride);
3767 
3768     // Compare the characters at index in cnt1
3769     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3770     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3771     subl(result, cnt2);
3772     jmp(POP_LABEL);
3773     // Set up the registers to start the vector comparison loop
3774     // Setup the registers to start vector comparison loop
3775     bind(COMPARE_WIDE_VECTORS);
3776     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3777       lea(str1, Address(str1, result, scale));
3778       lea(str2, Address(str2, result, scale));
3779     } else {
3780       lea(str1, Address(str1, result, scale1));
3781       lea(str2, Address(str2, result, scale2));
3782     }
3783     subl(result, stride2);
3784     subl(cnt2, stride2);
3785     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3786     negptr(result);
3787 
3788     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3789     bind(COMPARE_WIDE_VECTORS_LOOP);
3790 
3791     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3792       cmpl(cnt2, stride2x2);
3793       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3794       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3795       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3796 
3797       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3798       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3799         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3800         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3801       } else {
3802         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3803         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3804       }
3805       kortestql(mask, mask);
3806       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3807       addptr(result, stride2x2);  // update since we already compared at this addr
3808       subl(cnt2, stride2x2);      // and sub the size too
3809       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3810 
3811       vpxor(vec1, vec1);
3812       jmpb(COMPARE_WIDE_TAIL);
3813     }//if (VM_Version::supports_avx512vlbw())
3814 
3815     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3816     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3817       vmovdqu(vec1, Address(str1, result, scale));
3818       vpxor(vec1, Address(str2, result, scale));
3819     } else {
3820       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3821       vpxor(vec1, Address(str2, result, scale2));
3822     }
3823     vptest(vec1, vec1);
3824     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3825     addptr(result, stride2);
3826     subl(cnt2, stride2);
3827     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3828     // clean upper bits of YMM registers
3829     vpxor(vec1, vec1);
3830 
3831     // compare wide vectors tail
3832     bind(COMPARE_WIDE_TAIL);
3833     testptr(result, result);
3834     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3835 
3836     movl(result, stride2);
3837     movl(cnt2, result);
3838     negptr(result);
3839     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3840 
3841     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3842     bind(VECTOR_NOT_EQUAL);
3843     // clean upper bits of YMM registers
3844     vpxor(vec1, vec1);
3845     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3846       lea(str1, Address(str1, result, scale));
3847       lea(str2, Address(str2, result, scale));
3848     } else {
3849       lea(str1, Address(str1, result, scale1));
3850       lea(str2, Address(str2, result, scale2));
3851     }
3852     jmp(COMPARE_16_CHARS);
3853 
3854     // Compare tail chars, length between 1 and 15 chars
3855     bind(COMPARE_TAIL_LONG);
3856     movl(cnt2, result);
3857     cmpl(cnt2, stride);
3858     jcc(Assembler::less, COMPARE_SMALL_STR);
3859 
3860     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3861       movdqu(vec1, Address(str1, 0));
3862     } else {
3863       pmovzxbw(vec1, Address(str1, 0));
3864     }
3865     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3866     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3867     subptr(cnt2, stride);
3868     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3869     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870       lea(str1, Address(str1, result, scale));
3871       lea(str2, Address(str2, result, scale));
3872     } else {
3873       lea(str1, Address(str1, result, scale1));
3874       lea(str2, Address(str2, result, scale2));
3875     }
3876     negptr(cnt2);
3877     jmpb(WHILE_HEAD_LABEL);
3878 
3879     bind(COMPARE_SMALL_STR);
3880   } else if (UseSSE42Intrinsics) {
3881     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3882     int pcmpmask = 0x19;
3883     // Set up to compare 8-char (16-byte) vectors,
3884     // starting from the first character again because it has an aligned address.
3885     movl(result, cnt2);
3886     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3887     if (ae == StrIntrinsicNode::LL) {
3888       pcmpmask &= ~0x01;
3889     }
3890     jcc(Assembler::zero, COMPARE_TAIL);
3891     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3892       lea(str1, Address(str1, result, scale));
3893       lea(str2, Address(str2, result, scale));
3894     } else {
3895       lea(str1, Address(str1, result, scale1));
3896       lea(str2, Address(str2, result, scale2));
3897     }
3898     negptr(result);
3899 
3900     // pcmpestri
3901     //   inputs:
3902     //     vec1- substring
3903     //     rax - negative string length (elements count)
3904     //     mem - scanned string
3905     //     rdx - string length (elements count)
3906     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3907     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3908     //   outputs:
3909     //     rcx - first mismatched element index
3910     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3911 
3912     bind(COMPARE_WIDE_VECTORS);
3913     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3914       movdqu(vec1, Address(str1, result, scale));
3915       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3916     } else {
3917       pmovzxbw(vec1, Address(str1, result, scale1));
3918       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3919     }
3920     // After pcmpestri cnt1(rcx) contains mismatched element index
3921 
3922     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3923     addptr(result, stride);
3924     subptr(cnt2, stride);
3925     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3926 
3927     // compare wide vectors tail
3928     testptr(result, result);
3929     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3930 
3931     movl(cnt2, stride);
3932     movl(result, stride);
3933     negptr(result);
3934     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3935       movdqu(vec1, Address(str1, result, scale));
3936       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3937     } else {
3938       pmovzxbw(vec1, Address(str1, result, scale1));
3939       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3940     }
3941     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3942 
3943     // Mismatched characters in the vectors
3944     bind(VECTOR_NOT_EQUAL);
3945     addptr(cnt1, result);
3946     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3947     subl(result, cnt2);
3948     jmpb(POP_LABEL);
3949 
3950     bind(COMPARE_TAIL); // limit is zero
3951     movl(cnt2, result);
3952     // Fallthru to tail compare
3953   }
3954   // Shift str2 and str1 to the end of the arrays, negate min
3955   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3956     lea(str1, Address(str1, cnt2, scale));
3957     lea(str2, Address(str2, cnt2, scale));
3958   } else {
3959     lea(str1, Address(str1, cnt2, scale1));
3960     lea(str2, Address(str2, cnt2, scale2));
3961   }
3962   decrementl(cnt2);  // first character was compared already
3963   negptr(cnt2);
3964 
3965   // Compare the rest of the elements
3966   bind(WHILE_HEAD_LABEL);
3967   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3968   subl(result, cnt1);
3969   jccb(Assembler::notZero, POP_LABEL);
3970   increment(cnt2);
3971   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3972 
3973   // Strings are equal up to min length.  Return the length difference.
3974   bind(LENGTH_DIFF_LABEL);
3975   pop(result);
3976   if (ae == StrIntrinsicNode::UU) {
3977     // Divide diff by 2 to get number of chars
3978     sarl(result, 1);
3979   }
3980   jmpb(DONE_LABEL);
3981 
3982   if (VM_Version::supports_avx512vlbw()) {
3983 
3984     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3985 
3986     kmovql(cnt1, mask);
3987     notq(cnt1);
3988     bsfq(cnt2, cnt1);
3989     if (ae != StrIntrinsicNode::LL) {
3990       // Divide diff by 2 to get number of chars
3991       sarl(cnt2, 1);
3992     }
3993     addq(result, cnt2);
3994     if (ae == StrIntrinsicNode::LL) {
3995       load_unsigned_byte(cnt1, Address(str2, result));
3996       load_unsigned_byte(result, Address(str1, result));
3997     } else if (ae == StrIntrinsicNode::UU) {
3998       load_unsigned_short(cnt1, Address(str2, result, scale));
3999       load_unsigned_short(result, Address(str1, result, scale));
4000     } else {
4001       load_unsigned_short(cnt1, Address(str2, result, scale2));
4002       load_unsigned_byte(result, Address(str1, result, scale1));
4003     }
4004     subl(result, cnt1);
4005     jmpb(POP_LABEL);
4006   }//if (VM_Version::supports_avx512vlbw())
4007 
4008   // Discard the stored length difference
4009   bind(POP_LABEL);
4010   pop(cnt1);
4011 
4012   // That's it
4013   bind(DONE_LABEL);
4014   if(ae == StrIntrinsicNode::UL) {
4015     negl(result);
4016   }
4017 
4018 }
4019 
4020 // Search for Non-ASCII character (Negative byte value) in a byte array,
4021 // return the index of the first such character, otherwise the length
4022 // of the array segment searched.
4023 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4024 //   @IntrinsicCandidate
4025 //   public static int countPositives(byte[] ba, int off, int len) {
4026 //     for (int i = off; i < off + len; i++) {
4027 //       if (ba[i] < 0) {
4028 //         return i - off;
4029 //       }
4030 //     }
4031 //     return len;
4032 //   }
4033 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4034   Register result, Register tmp1,
4035   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4036   // rsi: byte array
4037   // rcx: len
4038   // rax: result
4039   ShortBranchVerifier sbv(this);
4040   assert_different_registers(ary1, len, result, tmp1);
4041   assert_different_registers(vec1, vec2);
4042   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4043 
4044   movl(result, len); // copy
4045   // len == 0
4046   testl(len, len);
4047   jcc(Assembler::zero, DONE);
4048 
4049   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4050     VM_Version::supports_avx512vlbw() &&
4051     VM_Version::supports_bmi2()) {
4052 
4053     Label test_64_loop, test_tail, BREAK_LOOP;
4054     movl(tmp1, len);
4055     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4056 
4057     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4058     andl(len,  0xffffffc0); // vector count (in chars)
4059     jccb(Assembler::zero, test_tail);
4060 
4061     lea(ary1, Address(ary1, len, Address::times_1));
4062     negptr(len);
4063 
4064     bind(test_64_loop);
4065     // Check whether our 64 elements of size byte contain negatives
4066     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4067     kortestql(mask1, mask1);
4068     jcc(Assembler::notZero, BREAK_LOOP);
4069 
4070     addptr(len, 64);
4071     jccb(Assembler::notZero, test_64_loop);
4072 
4073     bind(test_tail);
4074     // bail out when there is nothing to be done
4075     testl(tmp1, -1);
4076     jcc(Assembler::zero, DONE);
4077 
4078 
4079     // Check the tail for the absence of negatives.
4080     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4081     {
4082       Register tmp3_aliased = len;
4083       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4084       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4085       notq(tmp3_aliased);
4086       kmovql(mask2, tmp3_aliased);
4087     }
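         // mask2 now has exactly its low tmp1 bits set, so the masked compare below
         // inspects only the tail bytes and ignores the rest of the 64-byte vector.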
4088 
4089     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4090     ktestq(mask1, mask2);
4091     jcc(Assembler::zero, DONE);
4092 
4093     // do a full check for negative registers in the tail
4094     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4095                      // ary1 already pointing to the right place
4096     jmpb(TAIL_START);
4097 
4098     bind(BREAK_LOOP);
4099     // At least one byte in the last 64 byte block was negative.
4100     // Set up to look at the last 64 bytes as if they were a tail
4101     lea(ary1, Address(ary1, len, Address::times_1));
4102     addptr(result, len);
4103     // Ignore the very last byte: if all others are positive,
4104     // it must be negative, so we can skip right to the 2+1 byte
4105     // end comparison at this point
4106     orl(result, 63);
4107     movl(len, 63);
4108     // Fallthru to tail compare
4109   } else {
4110 
4111     if (UseAVX >= 2 && UseSSE >= 2) {
4112       // With AVX2, use 32-byte vector compare
4113       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4114 
4115       // Compare 32-byte vectors
4116       testl(len, 0xffffffe0);   // vector count (in bytes)
4117       jccb(Assembler::zero, TAIL_START);
4118 
4119       andl(len, 0xffffffe0);
4120       lea(ary1, Address(ary1, len, Address::times_1));
4121       negptr(len);
4122 
4123       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4124       movdl(vec2, tmp1);
4125       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4126 
4127       bind(COMPARE_WIDE_VECTORS);
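           // vptest sets ZF only when none of the 32 loaded bytes has its sign bit set
           // (data AND 0x80808080... == 0); taking BREAK_LOOP therefore means a negative
           // byte was found in this chunk.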
4128       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4129       vptest(vec1, vec2);
4130       jccb(Assembler::notZero, BREAK_LOOP);
4131       addptr(len, 32);
4132       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4133 
4134       testl(result, 0x0000001f);   // any bytes remaining?
4135       jcc(Assembler::zero, DONE);
4136 
4137       // Quick test using the already prepared vector mask
4138       movl(len, result);
4139       andl(len, 0x0000001f);
4140       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4141       vptest(vec1, vec2);
4142       jcc(Assembler::zero, DONE);
4143       // Negative bytes are present; jump to the tail to determine exactly where
4144       jmpb(TAIL_START);
4145 
4146       bind(BREAK_LOOP);
4147       // At least one byte in the last 32-byte vector is negative.
4148       // Set up to look at the last 32 bytes as if they were a tail
4149       lea(ary1, Address(ary1, len, Address::times_1));
4150       addptr(result, len);
4151       // Ignore the very last byte: if all others are positive,
4152       // it must be negative, so we can skip right to the 2+1 byte
4153       // end comparison at this point
4154       orl(result, 31);
4155       movl(len, 31);
4156       // Fallthru to tail compare
4157     } else if (UseSSE42Intrinsics) {
4158       // With SSE4.2, use double quad vector compare
4159       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4160 
4161       // Compare 16-byte vectors
4162       testl(len, 0xfffffff0);   // vector count (in bytes)
4163       jcc(Assembler::zero, TAIL_START);
4164 
4165       andl(len, 0xfffffff0);
4166       lea(ary1, Address(ary1, len, Address::times_1));
4167       negptr(len);
4168 
4169       movl(tmp1, 0x80808080);
4170       movdl(vec2, tmp1);
4171       pshufd(vec2, vec2, 0);
4172 
4173       bind(COMPARE_WIDE_VECTORS);
4174       movdqu(vec1, Address(ary1, len, Address::times_1));
4175       ptest(vec1, vec2);
4176       jccb(Assembler::notZero, BREAK_LOOP);
4177       addptr(len, 16);
4178       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4179 
4180       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4181       jcc(Assembler::zero, DONE);
4182 
4183       // Quick test using the already prepared vector mask
4184       movl(len, result);
4185       andl(len, 0x0000000f);   // tail count (in bytes)
4186       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4187       ptest(vec1, vec2);
4188       jcc(Assembler::zero, DONE);
4189       jmpb(TAIL_START);
4190 
4191       bind(BREAK_LOOP);
4192       // At least one byte in the last 16-byte vector is negative.
4193       // Set up and look at the last 16 bytes as if they were a tail
4194       lea(ary1, Address(ary1, len, Address::times_1));
4195       addptr(result, len);
4196       // Ignore the very last byte: if all others are positive,
4197       // it must be negative, so we can skip right to the 2+1 byte
4198       // end comparison at this point
4199       orl(result, 15);
4200       movl(len, 15);
4201       // Fallthru to tail compare
4202     }
4203   }
4204 
4205   bind(TAIL_START);
4206   // Compare 4-byte vectors
4207   andl(len, 0xfffffffc); // vector count (in bytes)
4208   jccb(Assembler::zero, COMPARE_CHAR);
4209 
4210   lea(ary1, Address(ary1, len, Address::times_1));
4211   negptr(len);
4212 
4213   bind(COMPARE_VECTORS);
4214   movl(tmp1, Address(ary1, len, Address::times_1));
4215   andl(tmp1, 0x80808080);
4216   jccb(Assembler::notZero, TAIL_ADJUST);
4217   addptr(len, 4);
4218   jccb(Assembler::notZero, COMPARE_VECTORS);
4219 
4220   // Compare trailing char (final 2-3 bytes), if any
4221   bind(COMPARE_CHAR);
4222 
4223   testl(result, 0x2);   // tail  char
4224   jccb(Assembler::zero, COMPARE_BYTE);
4225   load_unsigned_short(tmp1, Address(ary1, 0));
4226   andl(tmp1, 0x00008080);
4227   jccb(Assembler::notZero, CHAR_ADJUST);
4228   lea(ary1, Address(ary1, 2));
4229 
4230   bind(COMPARE_BYTE);
4231   testl(result, 0x1);   // tail  byte
4232   jccb(Assembler::zero, DONE);
4233   load_unsigned_byte(tmp1, Address(ary1, 0));
4234   testl(tmp1, 0x00000080);
4235   jccb(Assembler::zero, DONE);
4236   subptr(result, 1);
4237   jmpb(DONE);
4238 
4239   bind(TAIL_ADJUST);
4240   // there are negative bits in the last 4 byte block.
4241   // Adjust result and check the next three bytes
4242   addptr(result, len);
4243   orl(result, 3);
4244   lea(ary1, Address(ary1, len, Address::times_1));
4245   jmpb(COMPARE_CHAR);
4246 
4247   bind(CHAR_ADJUST);
4248   // We are looking at a char + optional byte tail, and found that one
4249   // of the bytes in the char is negative. Adjust the result, check the
4250   // first byte and readjust if needed.
4251   andl(result, 0xfffffffc);
4252   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4253   jccb(Assembler::notZero, DONE);
4254   addptr(result, 1);
4255 
4256   // That's it
4257   bind(DONE);
4258   if (UseAVX >= 2 && UseSSE >= 2) {
4259     // clean upper bits of YMM registers
4260     vpxor(vec1, vec1);
4261     vpxor(vec2, vec2);
4262   }
4263 }
4264 
4265 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4266 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4267                                       Register limit, Register result, Register chr,
4268                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4269                                       KRegister mask, bool expand_ary2) {
4270   // for expand_ary2, limit is the (smaller) size of the second array.
4271   ShortBranchVerifier sbv(this);
4272   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4273 
4274   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4275          "Expansion only implemented for AVX2");
4276 
4277   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4278   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4279 
4280   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4281   int scaleIncr = expand_ary2 ? 8 : 16;
4282 
4283   if (is_array_equ) {
4284     // Check the input args
4285     cmpoop(ary1, ary2);
4286     jcc(Assembler::equal, TRUE_LABEL);
4287 
4288     // Need additional checks for arrays_equals.
4289     testptr(ary1, ary1);
4290     jcc(Assembler::zero, FALSE_LABEL);
4291     testptr(ary2, ary2);
4292     jcc(Assembler::zero, FALSE_LABEL);
4293 
4294     // Check the lengths
4295     movl(limit, Address(ary1, length_offset));
4296     cmpl(limit, Address(ary2, length_offset));
4297     jcc(Assembler::notEqual, FALSE_LABEL);
4298   }
4299 
4300   // count == 0
4301   testl(limit, limit);
4302   jcc(Assembler::zero, TRUE_LABEL);
4303 
4304   if (is_array_equ) {
4305     // Load array address
4306     lea(ary1, Address(ary1, base_offset));
4307     lea(ary2, Address(ary2, base_offset));
4308   }
4309 
4310   if (is_array_equ && is_char) {
4311     // arrays_equals when used for char[].
4312     shll(limit, 1);      // byte count != 0
4313   }
4314   movl(result, limit); // copy
4315 
4316   if (UseAVX >= 2) {
4317     // With AVX2, use 32-byte vector compare
4318     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4319 
4320     // Compare 32-byte vectors
4321     if (expand_ary2) {
4322       andl(result, 0x0000000f);  //   tail count (in bytes)
4323       andl(limit, 0xfffffff0);   // vector count (in bytes)
4324       jcc(Assembler::zero, COMPARE_TAIL);
4325     } else {
4326       andl(result, 0x0000001f);  //   tail count (in bytes)
4327       andl(limit, 0xffffffe0);   // vector count (in bytes)
4328       jcc(Assembler::zero, COMPARE_TAIL_16);
4329     }
4330 
4331     lea(ary1, Address(ary1, limit, scaleFactor));
4332     lea(ary2, Address(ary2, limit, Address::times_1));
4333     negptr(limit);
4334 
4335     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4336       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4337 
4338       cmpl(limit, -64);
4339       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4340 
4341       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4342 
4343       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4344       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4345       kortestql(mask, mask);
4346       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4347       addptr(limit, 64);  // update since we already compared at this addr
4348       cmpl(limit, -64);
4349       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4350 
4351       // At this point we may still need to compare -limit+result bytes.
4352       // We could execute the next two instructions and just continue via the non-wide path:
4353       //  cmpl(limit, 0);
4354       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4355       // But since we stopped at the points ary{1,2}+limit which are
4356       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4357       // (|limit| <= 32 and result < 32),
4358       // we may just compare the last 64 bytes.
4359       //
4360       addptr(result, -64);   // it is safe, because we just came from this area
4361       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4362       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4363       kortestql(mask, mask);
4364       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4365 
4366       jmp(TRUE_LABEL);
4367 
4368       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4369 
4370     } // if (VM_Version::supports_avx512vlbw())
4371 
4372     bind(COMPARE_WIDE_VECTORS);
4373     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4374     if (expand_ary2) {
4375       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4376     } else {
4377       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4378     }
4379     vpxor(vec1, vec2);
4380 
4381     vptest(vec1, vec1);
4382     jcc(Assembler::notZero, FALSE_LABEL);
4383     addptr(limit, scaleIncr * 2);
4384     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4385 
4386     testl(result, result);
4387     jcc(Assembler::zero, TRUE_LABEL);
4388 
4389     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4390     if (expand_ary2) {
4391       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4392     } else {
4393       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4394     }
4395     vpxor(vec1, vec2);
4396 
4397     vptest(vec1, vec1);
4398     jcc(Assembler::notZero, FALSE_LABEL);
4399     jmp(TRUE_LABEL);
4400 
4401     bind(COMPARE_TAIL_16); // limit is zero
4402     movl(limit, result);
4403 
4404     // Compare 16-byte chunks
4405     andl(result, 0x0000000f);  //   tail count (in bytes)
4406     andl(limit, 0xfffffff0);   // vector count (in bytes)
4407     jcc(Assembler::zero, COMPARE_TAIL);
4408 
4409     lea(ary1, Address(ary1, limit, scaleFactor));
4410     lea(ary2, Address(ary2, limit, Address::times_1));
4411     negptr(limit);
4412 
4413     bind(COMPARE_WIDE_VECTORS_16);
4414     movdqu(vec1, Address(ary1, limit, scaleFactor));
4415     if (expand_ary2) {
4416       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4417     } else {
4418       movdqu(vec2, Address(ary2, limit, Address::times_1));
4419     }
4420     pxor(vec1, vec2);
4421 
4422     ptest(vec1, vec1);
4423     jcc(Assembler::notZero, FALSE_LABEL);
4424     addptr(limit, scaleIncr);
4425     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4426 
4427     bind(COMPARE_TAIL); // limit is zero
4428     movl(limit, result);
4429     // Fallthru to tail compare
4430   } else if (UseSSE42Intrinsics) {
4431     // With SSE4.2, use double quad vector compare
4432     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4433 
4434     // Compare 16-byte vectors
4435     andl(result, 0x0000000f);  //   tail count (in bytes)
4436     andl(limit, 0xfffffff0);   // vector count (in bytes)
4437     jcc(Assembler::zero, COMPARE_TAIL);
4438 
4439     lea(ary1, Address(ary1, limit, Address::times_1));
4440     lea(ary2, Address(ary2, limit, Address::times_1));
4441     negptr(limit);
4442 
4443     bind(COMPARE_WIDE_VECTORS);
4444     movdqu(vec1, Address(ary1, limit, Address::times_1));
4445     movdqu(vec2, Address(ary2, limit, Address::times_1));
4446     pxor(vec1, vec2);
4447 
4448     ptest(vec1, vec1);
4449     jcc(Assembler::notZero, FALSE_LABEL);
4450     addptr(limit, 16);
4451     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4452 
4453     testl(result, result);
4454     jcc(Assembler::zero, TRUE_LABEL);
4455 
4456     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4457     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4458     pxor(vec1, vec2);
4459 
4460     ptest(vec1, vec1);
4461     jccb(Assembler::notZero, FALSE_LABEL);
4462     jmpb(TRUE_LABEL);
4463 
4464     bind(COMPARE_TAIL); // limit is zero
4465     movl(limit, result);
4466     // Fallthru to tail compare
4467   }
4468 
4469   // Compare 4-byte vectors
4470   if (expand_ary2) {
4471     testl(result, result);
4472     jccb(Assembler::zero, TRUE_LABEL);
4473   } else {
4474     andl(limit, 0xfffffffc); // vector count (in bytes)
4475     jccb(Assembler::zero, COMPARE_CHAR);
4476   }
4477 
4478   lea(ary1, Address(ary1, limit, scaleFactor));
4479   lea(ary2, Address(ary2, limit, Address::times_1));
4480   negptr(limit);
4481 
4482   bind(COMPARE_VECTORS);
4483   if (expand_ary2) {
4484     // There are no "vector" operations for bytes to shorts
4485     movzbl(chr, Address(ary2, limit, Address::times_1));
4486     cmpw(Address(ary1, limit, Address::times_2), chr);
4487     jccb(Assembler::notEqual, FALSE_LABEL);
4488     addptr(limit, 1);
4489     jcc(Assembler::notZero, COMPARE_VECTORS);
4490     jmp(TRUE_LABEL);
4491   } else {
4492     movl(chr, Address(ary1, limit, Address::times_1));
4493     cmpl(chr, Address(ary2, limit, Address::times_1));
4494     jccb(Assembler::notEqual, FALSE_LABEL);
4495     addptr(limit, 4);
4496     jcc(Assembler::notZero, COMPARE_VECTORS);
4497   }
4498 
4499   // Compare trailing char (final 2 bytes), if any
4500   bind(COMPARE_CHAR);
4501   testl(result, 0x2);   // tail  char
4502   jccb(Assembler::zero, COMPARE_BYTE);
4503   load_unsigned_short(chr, Address(ary1, 0));
4504   load_unsigned_short(limit, Address(ary2, 0));
4505   cmpl(chr, limit);
4506   jccb(Assembler::notEqual, FALSE_LABEL);
4507 
4508   if (is_array_equ && is_char) {
4509     bind(COMPARE_BYTE);
4510   } else {
4511     lea(ary1, Address(ary1, 2));
4512     lea(ary2, Address(ary2, 2));
4513 
4514     bind(COMPARE_BYTE);
4515     testl(result, 0x1);   // tail  byte
4516     jccb(Assembler::zero, TRUE_LABEL);
4517     load_unsigned_byte(chr, Address(ary1, 0));
4518     load_unsigned_byte(limit, Address(ary2, 0));
4519     cmpl(chr, limit);
4520     jccb(Assembler::notEqual, FALSE_LABEL);
4521   }
4522   bind(TRUE_LABEL);
4523   movl(result, 1);   // return true
4524   jmpb(DONE);
4525 
4526   bind(FALSE_LABEL);
4527   xorl(result, result); // return false
4528 
4529   // That's it
4530   bind(DONE);
4531   if (UseAVX >= 2) {
4532     // clean upper bits of YMM registers
4533     vpxor(vec1, vec1);
4534     vpxor(vec2, vec2);
4535   }
4536 }
4537 
4538 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4539 #define __ masm.
4540   Register dst = stub.data<0>();
4541   XMMRegister src = stub.data<1>();
4542   address target = stub.data<2>();
4543   __ bind(stub.entry());
4544   __ subptr(rsp, 8);
4545   __ movdbl(Address(rsp), src);
4546   __ call(RuntimeAddress(target));
4547   __ pop(dst);
4548   __ jmp(stub.continuation());
4549 #undef __
4550 }
4551 
4552 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4553   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4554   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4555 
4556   address slowpath_target;
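       // cvttss2si/cvttsd2si return the 'integer indefinite' value (0x80000000,
       // or 0x8000000000000000 for 64-bit results) for NaN and out-of-range inputs;
       // the compares below detect that value and divert to the matching fixup stub,
       // which implements the Java cast semantics.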
4557   if (dst_bt == T_INT) {
4558     if (src_bt == T_FLOAT) {
4559       cvttss2sil(dst, src);
4560       cmpl(dst, 0x80000000);
4561       slowpath_target = StubRoutines::x86::f2i_fixup();
4562     } else {
4563       cvttsd2sil(dst, src);
4564       cmpl(dst, 0x80000000);
4565       slowpath_target = StubRoutines::x86::d2i_fixup();
4566     }
4567   } else {
4568     if (src_bt == T_FLOAT) {
4569       cvttss2siq(dst, src);
4570       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4571       slowpath_target = StubRoutines::x86::f2l_fixup();
4572     } else {
4573       cvttsd2siq(dst, src);
4574       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4575       slowpath_target = StubRoutines::x86::d2l_fixup();
4576     }
4577   }
4578 
4579   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4580   jcc(Assembler::equal, stub->entry());
4581   bind(stub->continuation());
4582 }
4583 
4584 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4585                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4586   switch(ideal_opc) {
4587     case Op_LShiftVS:
4588       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4589     case Op_LShiftVI:
4590       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4591     case Op_LShiftVL:
4592       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4593     case Op_RShiftVS:
4594       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4595     case Op_RShiftVI:
4596       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4597     case Op_RShiftVL:
4598       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4599     case Op_URShiftVS:
4600       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4601     case Op_URShiftVI:
4602       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4603     case Op_URShiftVL:
4604       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4605     case Op_RotateRightV:
4606       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4607     case Op_RotateLeftV:
4608       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4609     default:
4610       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4611       break;
4612   }
4613 }
4614 
4615 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4616                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4617   if (is_unsigned) {
4618     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4619   } else {
4620     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4621   }
4622 }
4623 
4624 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4625                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4626   switch (elem_bt) {
4627     case T_BYTE:
4628       if (ideal_opc == Op_SaturatingAddV) {
4629         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4630       } else {
4631         assert(ideal_opc == Op_SaturatingSubV, "");
4632         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4633       }
4634       break;
4635     case T_SHORT:
4636       if (ideal_opc == Op_SaturatingAddV) {
4637         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4638       } else {
4639         assert(ideal_opc == Op_SaturatingSubV, "");
4640         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4641       }
4642       break;
4643     default:
4644       fatal("Unsupported type %s", type2name(elem_bt));
4645       break;
4646   }
4647 }
4648 
4649 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4650                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4651   switch (elem_bt) {
4652     case T_BYTE:
4653       if (ideal_opc == Op_SaturatingAddV) {
4654         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4655       } else {
4656         assert(ideal_opc == Op_SaturatingSubV, "");
4657         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4658       }
4659       break;
4660     case T_SHORT:
4661       if (ideal_opc == Op_SaturatingAddV) {
4662         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4663       } else {
4664         assert(ideal_opc == Op_SaturatingSubV, "");
4665         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4666       }
4667       break;
4668     default:
4669       fatal("Unsupported type %s", type2name(elem_bt));
4670       break;
4671   }
4672 }
4673 
4674 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4675                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4676   if (is_unsigned) {
4677     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4678   } else {
4679     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4680   }
4681 }
4682 
4683 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4684                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4685   switch (elem_bt) {
4686     case T_BYTE:
4687       if (ideal_opc == Op_SaturatingAddV) {
4688         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4689       } else {
4690         assert(ideal_opc == Op_SaturatingSubV, "");
4691         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4692       }
4693       break;
4694     case T_SHORT:
4695       if (ideal_opc == Op_SaturatingAddV) {
4696         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4697       } else {
4698         assert(ideal_opc == Op_SaturatingSubV, "");
4699         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4700       }
4701       break;
4702     default:
4703       fatal("Unsupported type %s", type2name(elem_bt));
4704       break;
4705   }
4706 }
4707 
4708 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4709                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4710   switch (elem_bt) {
4711     case T_BYTE:
4712       if (ideal_opc == Op_SaturatingAddV) {
4713         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4714       } else {
4715         assert(ideal_opc == Op_SaturatingSubV, "");
4716         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4717       }
4718       break;
4719     case T_SHORT:
4720       if (ideal_opc == Op_SaturatingAddV) {
4721         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4722       } else {
4723         assert(ideal_opc == Op_SaturatingSubV, "");
4724         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4725       }
4726       break;
4727     default:
4728       fatal("Unsupported type %s", type2name(elem_bt));
4729       break;
4730   }
4731 }
4732 
4733 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4734                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4735                                     bool is_varshift) {
4736   switch (ideal_opc) {
4737     case Op_AddVB:
4738       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4739     case Op_AddVS:
4740       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4741     case Op_AddVI:
4742       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4743     case Op_AddVL:
4744       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4745     case Op_AddVF:
4746       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_AddVD:
4748       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_SubVB:
4750       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_SubVS:
4752       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_SubVI:
4754       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_SubVL:
4756       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_SubVF:
4758       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_SubVD:
4760       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_MulVS:
4762       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_MulVI:
4764       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_MulVL:
4766       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_MulVF:
4768       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_MulVD:
4770       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_DivVF:
4772       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_DivVD:
4774       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_SqrtVF:
4776       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_SqrtVD:
4778       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_AbsVB:
4780       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4781     case Op_AbsVS:
4782       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4783     case Op_AbsVI:
4784       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4785     case Op_AbsVL:
4786       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4787     case Op_FmaVF:
4788       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_FmaVD:
4790       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_VectorRearrange:
4792       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4793     case Op_LShiftVS:
4794       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4795     case Op_LShiftVI:
4796       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4797     case Op_LShiftVL:
4798       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4799     case Op_RShiftVS:
4800       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4801     case Op_RShiftVI:
4802       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4803     case Op_RShiftVL:
4804       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4805     case Op_URShiftVS:
4806       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4807     case Op_URShiftVI:
4808       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4809     case Op_URShiftVL:
4810       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4811     case Op_RotateLeftV:
4812       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_RotateRightV:
4814       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_MaxV:
4816       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_MinV:
4818       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_UMinV:
4820       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_UMaxV:
4822       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_XorV:
4824       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_OrV:
4826       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4827     case Op_AndV:
4828       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4829     default:
4830       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4831       break;
4832   }
4833 }
4834 
4835 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4836                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4837   switch (ideal_opc) {
4838     case Op_AddVB:
4839       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_AddVS:
4841       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_AddVI:
4843       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_AddVL:
4845       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_AddVF:
4847       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_AddVD:
4849       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_SubVB:
4851       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_SubVS:
4853       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_SubVI:
4855       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_SubVL:
4857       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_SubVF:
4859       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_SubVD:
4861       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_MulVS:
4863       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_MulVI:
4865       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_MulVL:
4867       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_MulVF:
4869       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_MulVD:
4871       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_DivVF:
4873       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_DivVD:
4875       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_FmaVF:
4877       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_FmaVD:
4879       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_MaxV:
4881       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_MinV:
4883       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_UMaxV:
4885       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_UMinV:
4887       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_XorV:
4889       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_OrV:
4891       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_AndV:
4893       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4894     default:
4895       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4896       break;
4897   }
4898 }
4899 
4900 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4901                                   KRegister src1, KRegister src2) {
4902   BasicType etype = T_ILLEGAL;
4903   switch(mask_len) {
4904     case 2:
4905     case 4:
4906     case 8:  etype = T_BYTE; break;
4907     case 16: etype = T_SHORT; break;
4908     case 32: etype = T_INT; break;
4909     case 64: etype = T_LONG; break;
4910     default: fatal("Unsupported type"); break;
4911   }
4912   assert(etype != T_ILLEGAL, "");
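       // The element type encodes the mask width so that kand/kor/kxor below pick
       // the matching 8/16/32/64-bit k-register instruction forms.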
4913   switch(ideal_opc) {
4914     case Op_AndVMask:
4915       kand(etype, dst, src1, src2); break;
4916     case Op_OrVMask:
4917       kor(etype, dst, src1, src2); break;
4918     case Op_XorVMask:
4919       kxor(etype, dst, src1, src2); break;
4920     default:
4921       fatal("Unsupported masked operation"); break;
4922   }
4923 }
4924 
4925 /*
4926  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4927  * If src is NaN, the result is 0.
4928  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4929  * the result is equal to the value of Integer.MIN_VALUE.
4930  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4931  * the result is equal to the value of Integer.MAX_VALUE.
4932  */
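     // For example, under Java semantics (int)Float.NaN == 0,
     // (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE, and
     // (int)Float.POSITIVE_INFINITY == (int)1e20f == Integer.MAX_VALUE.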
4933 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4934                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4935                                                                    Register rscratch, AddressLiteral float_sign_flip,
4936                                                                    int vec_enc) {
4937   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4938   Label done;
4939   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4940   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4941   vptest(xtmp2, xtmp2, vec_enc);
4942   jccb(Assembler::equal, done);
4943 
4944   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4945   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4946 
4947   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4948   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4949   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4950 
4951   // Recompute the mask for remaining special value.
4952   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4953   // Extract SRC values corresponding to TRUE mask lanes.
4954   vpand(xtmp4, xtmp2, src, vec_enc);
4955   // Flip mask bits so that the MSB of the mask lanes corresponding to positive
4956   // special values is set.
4957   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4958 
4959   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4960   bind(done);
4961 }
4962 
4963 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4964                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4965                                                                     Register rscratch, AddressLiteral float_sign_flip,
4966                                                                     int vec_enc) {
4967   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4968   Label done;
4969   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4970   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4971   kortestwl(ktmp1, ktmp1);
4972   jccb(Assembler::equal, done);
4973 
4974   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4975   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4976   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4977 
4978   kxorwl(ktmp1, ktmp1, ktmp2);
4979   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4980   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4981   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4982   bind(done);
4983 }
4984 
4985 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4986                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4987                                                                      Register rscratch, AddressLiteral double_sign_flip,
4988                                                                      int vec_enc) {
4989   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4990 
4991   Label done;
4992   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4993   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4994   kortestwl(ktmp1, ktmp1);
4995   jccb(Assembler::equal, done);
4996 
4997   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4998   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4999   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5000 
5001   kxorwl(ktmp1, ktmp1, ktmp2);
5002   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5003   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5004   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5005   bind(done);
5006 }
5007 
5008 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5009                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5010                                                                      Register rscratch, AddressLiteral float_sign_flip,
5011                                                                      int vec_enc) {
5012   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5013   Label done;
5014   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5015   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5016   kortestwl(ktmp1, ktmp1);
5017   jccb(Assembler::equal, done);
5018 
5019   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5020   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5021   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5022 
5023   kxorwl(ktmp1, ktmp1, ktmp2);
5024   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5025   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5026   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5027   bind(done);
5028 }
5029 
5030 /*
5031  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5032  * If src is NaN, the result is 0.
5033  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5034  * the result is equal to the value of Long.MIN_VALUE.
5035  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5036  * the result is equal to the value of Long.MAX_VALUE.
5037  */
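     // For example, under Java semantics (long)Double.NaN == 0L,
     // (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE, and
     // (long)Double.POSITIVE_INFINITY == Long.MAX_VALUE.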
5038 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5039                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5040                                                                       Register rscratch, AddressLiteral double_sign_flip,
5041                                                                       int vec_enc) {
5042   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5043 
5044   Label done;
5045   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5046   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5047   kortestwl(ktmp1, ktmp1);
5048   jccb(Assembler::equal, done);
5049 
5050   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5051   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5052   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5053 
5054   kxorwl(ktmp1, ktmp1, ktmp2);
5055   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5056   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5057   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5058   bind(done);
5059 }
5060 
5061 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5062                                                              XMMRegister xtmp, int index, int vec_enc) {
5063   assert(vec_enc < Assembler::AVX_512bit, "");
5064   if (vec_enc == Assembler::AVX_256bit) {
5065     vextractf128_high(xtmp, src);
5066     vshufps(dst, src, xtmp, index, vec_enc);
5067   } else {
5068     vshufps(dst, src, zero, index, vec_enc);
5069   }
5070 }
5071 
5072 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5073                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5074                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5075   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5076 
5077   Label done;
5078   // Compare the destination lanes with float_sign_flip
5079   // value to get mask for all special values.
5080   movdqu(xtmp1, float_sign_flip, rscratch);
5081   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5082   ptest(xtmp2, xtmp2);
5083   jccb(Assembler::equal, done);
5084 
5085   // Flip float_sign_flip to get max integer value.
5086   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5087   pxor(xtmp1, xtmp4);
5088 
5089   // Set destination lanes corresponding to unordered source lanes to zero.
5090   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5091   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5092 
5093   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5094   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5095   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5096 
5097   // Recompute the mask for remaining special value.
5098   pxor(xtmp2, xtmp3);
5099   // Extract mask corresponding to non-negative source lanes.
5100   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5101 
5102   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5103   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5104   pand(xtmp3, xtmp2);
5105 
5106   // Replace destination lanes holding the special value (0x80000000) with max int
5107   // if the corresponding source lane holds a positive value.
5108   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5109   bind(done);
5110 }
5111 
5112 
5113 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5114                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
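       // The vpand with the int_to_short/int_to_byte mask is expected to clear the
       // upper bits of each int lane so that the unsigned saturating packs below
       // never saturate and act as a plain truncation to the subword type.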
5115   switch(to_elem_bt) {
5116     case T_SHORT:
5117       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5118       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5119       vpackusdw(dst, dst, zero, vec_enc);
5120       if (vec_enc == Assembler::AVX_256bit) {
5121         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5122       }
5123       break;
5124     case  T_BYTE:
5125       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5126       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5127       vpackusdw(dst, dst, zero, vec_enc);
5128       if (vec_enc == Assembler::AVX_256bit) {
5129         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5130       }
5131       vpackuswb(dst, dst, zero, vec_enc);
5132       break;
5133     default: assert(false, "%s", type2name(to_elem_bt));
5134   }
5135 }
5136 
5137 /*
5138  * Algorithm for vector D2L and F2I conversions:
5139  * a) Perform the vector D2L/F2I cast.
5140  * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000,
5141  *    which signifies that the source value could be one of the special floating point
5142  *    values (NaN, -Inf, Inf, Max, -Min).
5143  * c) Set the destination lane to zero if the source lane is NaN.
5144  * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
5145  */
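     // Note: 0x80000000 is the 'integer indefinite' value that vcvttps2dq/vcvttpd2dq
     // produce for NaN and out-of-range inputs, which is why comparing the result
     // against the sign-flip constant identifies the lanes that need fixing up.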
5146 
5147 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5148                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5149                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5150   int to_elem_sz = type2aelembytes(to_elem_bt);
5151   assert(to_elem_sz <= 4, "");
5152   vcvttps2dq(dst, src, vec_enc);
5153   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5154   if (to_elem_sz < 4) {
5155     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5156     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5157   }
5158 }
5159 
5160 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5161                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5162                                             Register rscratch, int vec_enc) {
5163   int to_elem_sz = type2aelembytes(to_elem_bt);
5164   assert(to_elem_sz <= 4, "");
5165   vcvttps2dq(dst, src, vec_enc);
5166   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5167   switch(to_elem_bt) {
5168     case T_INT:
5169       break;
5170     case T_SHORT:
5171       evpmovdw(dst, dst, vec_enc);
5172       break;
5173     case T_BYTE:
5174       evpmovdb(dst, dst, vec_enc);
5175       break;
5176     default: assert(false, "%s", type2name(to_elem_bt));
5177   }
5178 }
5179 
5180 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5181                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5182                                             Register rscratch, int vec_enc) {
5183   evcvttps2qq(dst, src, vec_enc);
5184   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5185 }
5186 
5187 // Handling for downcasting from double to integer or sub-word types on AVX2.
5188 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5189                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5190                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5191   int to_elem_sz = type2aelembytes(to_elem_bt);
5192   assert(to_elem_sz < 8, "");
5193   vcvttpd2dq(dst, src, vec_enc);
5194   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5195                                               float_sign_flip, vec_enc);
5196   if (to_elem_sz < 4) {
5197     // xtmp4 holds all zero lanes.
5198     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5199   }
5200 }
5201 
5202 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5203                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5204                                             KRegister ktmp2, AddressLiteral sign_flip,
5205                                             Register rscratch, int vec_enc) {
5206   if (VM_Version::supports_avx512dq()) {
5207     evcvttpd2qq(dst, src, vec_enc);
5208     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5209     switch(to_elem_bt) {
5210       case T_LONG:
5211         break;
5212       case T_INT:
5213         evpmovsqd(dst, dst, vec_enc);
5214         break;
5215       case T_SHORT:
5216         evpmovsqd(dst, dst, vec_enc);
5217         evpmovdw(dst, dst, vec_enc);
5218         break;
5219       case T_BYTE:
5220         evpmovsqd(dst, dst, vec_enc);
5221         evpmovdb(dst, dst, vec_enc);
5222         break;
5223       default: assert(false, "%s", type2name(to_elem_bt));
5224     }
5225   } else {
5226     assert(type2aelembytes(to_elem_bt) <= 4, "");
5227     vcvttpd2dq(dst, src, vec_enc);
5228     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5229     switch(to_elem_bt) {
5230       case T_INT:
5231         break;
5232       case T_SHORT:
5233         evpmovdw(dst, dst, vec_enc);
5234         break;
5235       case T_BYTE:
5236         evpmovdb(dst, dst, vec_enc);
5237         break;
5238       default: assert(false, "%s", type2name(to_elem_bt));
5239     }
5240   }
5241 }
5242 
5243 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5244                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5245                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5246   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5247   // and restore the original MXCSR.RC mode afterwards.
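       // For example, Math.round(2.5) == 3 and Math.round(-2.5) == -2: ties round
       // towards positive infinity, which floor(val + 0.5) provides.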
5248   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5249 
5250   mov64(tmp, julong_cast(0.5L));
5251   evpbroadcastq(xtmp1, tmp, vec_enc);
5252   vaddpd(xtmp1, src , xtmp1, vec_enc);
5253   evcvtpd2qq(dst, xtmp1, vec_enc);
5254   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5255                                                 double_sign_flip, vec_enc);
5256 
5257   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5258 }
5259 
5260 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5261                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5262                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5263   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5264   // and restore the original MXCSR.RC mode afterwards.
5265   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5266 
5267   movl(tmp, jint_cast(0.5));
5268   movq(xtmp1, tmp);
5269   vbroadcastss(xtmp1, xtmp1, vec_enc);
5270   vaddps(xtmp1, src , xtmp1, vec_enc);
5271   vcvtps2dq(dst, xtmp1, vec_enc);
5272   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5273                                               float_sign_flip, vec_enc);
5274 
5275   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5276 }
5277 
5278 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5279                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5280                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5281   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5282   // and restore the original MXCSR.RC mode afterwards.
5283   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5284 
5285   movl(tmp, jint_cast(0.5));
5286   movq(xtmp1, tmp);
5287   vbroadcastss(xtmp1, xtmp1, vec_enc);
5288   vaddps(xtmp1, src , xtmp1, vec_enc);
5289   vcvtps2dq(dst, xtmp1, vec_enc);
5290   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5291 
5292   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5293 }
5294 
5295 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5296                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5297   switch (from_elem_bt) {
5298     case T_BYTE:
5299       switch (to_elem_bt) {
5300         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5301         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5302         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5303         default: ShouldNotReachHere();
5304       }
5305       break;
5306     case T_SHORT:
5307       switch (to_elem_bt) {
5308         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5309         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5310         default: ShouldNotReachHere();
5311       }
5312       break;
5313     case T_INT:
5314       assert(to_elem_bt == T_LONG, "");
5315       vpmovzxdq(dst, src, vlen_enc);
5316       break;
5317     default:
5318       ShouldNotReachHere();
5319   }
5320 }
5321 
5322 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5323                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5324   switch (from_elem_bt) {
5325     case T_BYTE:
5326       switch (to_elem_bt) {
5327         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5328         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5329         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5330         default: ShouldNotReachHere();
5331       }
5332       break;
5333     case T_SHORT:
5334       switch (to_elem_bt) {
5335         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5336         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5337         default: ShouldNotReachHere();
5338       }
5339       break;
5340     case T_INT:
5341       assert(to_elem_bt == T_LONG, "");
5342       vpmovsxdq(dst, src, vlen_enc);
5343       break;
5344     default:
5345       ShouldNotReachHere();
5346   }
5347 }
5348 
5349 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5350                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5351   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5352   assert(vlen_enc != AVX_512bit, "");
5353 
5354   int dst_bt_size = type2aelembytes(dst_bt);
5355   int src_bt_size = type2aelembytes(src_bt);
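       // Mask lanes are all-zeros or all-ones, so sign extension widens them and the
       // signed saturating packs (which map 0 -> 0 and -1 -> -1) narrow them without
       // losing information; vpermq(0x08) gathers the per-128-bit-lane pack results
       // into the low half.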
5356   if (dst_bt_size > src_bt_size) {
5357     switch (dst_bt_size / src_bt_size) {
5358       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5359       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5360       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5361       default: ShouldNotReachHere();
5362     }
5363   } else {
5364     assert(dst_bt_size < src_bt_size, "");
5365     switch (src_bt_size / dst_bt_size) {
5366       case 2: {
5367         if (vlen_enc == AVX_128bit) {
5368           vpacksswb(dst, src, src, vlen_enc);
5369         } else {
5370           vpacksswb(dst, src, src, vlen_enc);
5371           vpermq(dst, dst, 0x08, vlen_enc);
5372         }
5373         break;
5374       }
5375       case 4: {
5376         if (vlen_enc == AVX_128bit) {
5377           vpackssdw(dst, src, src, vlen_enc);
5378           vpacksswb(dst, dst, dst, vlen_enc);
5379         } else {
5380           vpackssdw(dst, src, src, vlen_enc);
5381           vpermq(dst, dst, 0x08, vlen_enc);
5382           vpacksswb(dst, dst, dst, AVX_128bit);
5383         }
5384         break;
5385       }
5386       case 8: {
5387         if (vlen_enc == AVX_128bit) {
5388           vpshufd(dst, src, 0x08, vlen_enc);
5389           vpackssdw(dst, dst, dst, vlen_enc);
5390           vpacksswb(dst, dst, dst, vlen_enc);
5391         } else {
5392           vpshufd(dst, src, 0x08, vlen_enc);
5393           vpermq(dst, dst, 0x08, vlen_enc);
5394           vpackssdw(dst, dst, dst, AVX_128bit);
5395           vpacksswb(dst, dst, dst, AVX_128bit);
5396         }
5397         break;
5398       }
5399       default: ShouldNotReachHere();
5400     }
5401   }
5402 }
5403 
5404 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5405                                    bool merge, BasicType bt, int vlen_enc) {
5406   if (bt == T_INT) {
5407     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5408   } else {
5409     assert(bt == T_LONG, "");
5410     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5411   }
5412 }
5413 
5414 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5415                                    bool merge, BasicType bt, int vlen_enc) {
5416   if (bt == T_INT) {
5417     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5418   } else {
5419     assert(bt == T_LONG, "");
5420     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5421   }
5422 }
5423 
5424 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5425                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5426                                                int vec_enc) {
5427   int index = 0;
5428   int vindex = 0;
5429   mov64(rtmp1, 0x0101010101010101L);
5430   pdepq(rtmp1, src, rtmp1);
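       // The pdep above spreads the low eight mask bits of src across the byte lanes
       // of rtmp1: bit i of src becomes byte value 0x00 or 0x01 in byte i.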
5431   if (mask_len > 8) {
5432     movq(rtmp2, src);
5433     vpxor(xtmp, xtmp, xtmp, vec_enc);
5434     movq(xtmp, rtmp1);
5435   }
5436   movq(dst, rtmp1);
5437 
5438   mask_len -= 8;
5439   while (mask_len > 0) {
5440     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5441     index++;
5442     if ((index % 2) == 0) {
5443       pxor(xtmp, xtmp);
5444     }
5445     mov64(rtmp1, 0x0101010101010101L);
5446     shrq(rtmp2, 8);
5447     pdepq(rtmp1, rtmp2, rtmp1);
5448     pinsrq(xtmp, rtmp1, index % 2);
5449     vindex = index / 2;
5450     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5453       if (index % 2) {
5454         vinsertf128(dst, dst, xtmp, vindex);
5455       }
5456     } else {
5457       vmovdqu(dst, xtmp);
5458     }
5459     mask_len -= 8;
5460   }
5461 }
5462 
5463 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5464   switch(opc) {
5465     case Op_VectorMaskTrueCount:
5466       popcntq(dst, tmp);
5467       break;
5468     case Op_VectorMaskLastTrue:
5469       if (VM_Version::supports_lzcnt()) {
5470         lzcntq(tmp, tmp);
5471         movl(dst, 63);
5472         subl(dst, tmp);
5473       } else {
5474         movl(dst, -1);
5475         bsrq(tmp, tmp);
5476         cmov32(Assembler::notZero, dst, tmp);
5477       }
5478       break;
5479     case Op_VectorMaskFirstTrue:
5480       if (VM_Version::supports_bmi1()) {
5481         if (masklen < 32) {
5482           orl(tmp, 1 << masklen);
5483           tzcntl(dst, tmp);
5484         } else if (masklen == 32) {
5485           tzcntl(dst, tmp);
5486         } else {
5487           assert(masklen == 64, "");
5488           tzcntq(dst, tmp);
5489         }
5490       } else {
5491         if (masklen < 32) {
5492           orl(tmp, 1 << masklen);
5493           bsfl(dst, tmp);
5494         } else {
5495           assert(masklen == 32 || masklen == 64, "");
5496           movl(dst, masklen);
5497           if (masklen == 32)  {
5498             bsfl(tmp, tmp);
5499           } else {
5500             bsfq(tmp, tmp);
5501           }
5502           cmov32(Assembler::notZero, dst, tmp);
5503         }
5504       }
5505       break;
5506     case Op_VectorMaskToLong:
5507       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5508       break;
5509     default: assert(false, "Unhandled mask operation");
5510   }
5511 }
5512 
5513 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5514                                               int masklen, int masksize, int vec_enc) {
5515   assert(VM_Version::supports_popcnt(), "");
5516 
5517   if(VM_Version::supports_avx512bw()) {
5518     kmovql(tmp, mask);
5519   } else {
5520     assert(masklen <= 16, "");
5521     kmovwl(tmp, mask);
5522   }
5523 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5526   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5527     andq(tmp, (1 << masklen) - 1);
5528   }
5529 
5530   vector_mask_operation_helper(opc, dst, tmp, masklen);
5531 }
5532 
5533 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5534                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5535   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5536          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5537   assert(VM_Version::supports_popcnt(), "");
5538 
5539   bool need_clip = false;
5540   switch(bt) {
5541     case T_BOOLEAN:
      // While masks of other types contain lane values 0 or -1, boolean masks contain 0 or 1
5543       vpxor(xtmp, xtmp, xtmp, vec_enc);
5544       vpsubb(xtmp, xtmp, mask, vec_enc);
5545       vpmovmskb(tmp, xtmp, vec_enc);
5546       need_clip = masklen < 16;
5547       break;
5548     case T_BYTE:
5549       vpmovmskb(tmp, mask, vec_enc);
5550       need_clip = masklen < 16;
5551       break;
5552     case T_SHORT:
5553       vpacksswb(xtmp, mask, mask, vec_enc);
5554       if (masklen >= 16) {
5555         vpermpd(xtmp, xtmp, 8, vec_enc);
5556       }
5557       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5558       need_clip = masklen < 16;
5559       break;
5560     case T_INT:
5561     case T_FLOAT:
5562       vmovmskps(tmp, mask, vec_enc);
5563       need_clip = masklen < 4;
5564       break;
5565     case T_LONG:
5566     case T_DOUBLE:
5567       vmovmskpd(tmp, mask, vec_enc);
5568       need_clip = masklen < 2;
5569       break;
5570     default: assert(false, "Unhandled type, %s", type2name(bt));
5571   }
5572 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5575   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5576     // need_clip implies masklen < 32
5577     andq(tmp, (1 << masklen) - 1);
5578   }
5579 
5580   vector_mask_operation_helper(opc, dst, tmp, masklen);
5581 }
5582 
5583 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5584                                              Register rtmp2, int mask_len) {
5585   kmov(rtmp1, src);
5586   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5587   mov64(rtmp2, -1L);
5588   pextq(rtmp2, rtmp2, rtmp1);
5589   kmov(dst, rtmp2);
5590 }
5591 
5592 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5593                                                     XMMRegister mask, Register rtmp, Register rscratch,
5594                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5595                                                     int vec_enc) {
5596   assert(type2aelembytes(bt) >= 4, "");
5597   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5598   address compress_perm_table = nullptr;
5599   address expand_perm_table = nullptr;
5600   if (type2aelembytes(bt) == 8) {
5601     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5602     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5603     vmovmskpd(rtmp, mask, vec_enc);
5604   } else {
5605     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5606     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5607     vmovmskps(rtmp, mask, vec_enc);
5608   }
5609   shlq(rtmp, 5); // for 32 byte permute row.
5610   if (opcode == Op_CompressV) {
5611     lea(rscratch, ExternalAddress(compress_perm_table));
5612   } else {
5613     lea(rscratch, ExternalAddress(expand_perm_table));
5614   }
5615   addptr(rtmp, rscratch);
5616   vmovdqu(permv, Address(rtmp));
5617   vpermps(dst, permv, src, Assembler::AVX_256bit);
5618   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each entry in
  // a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
5623   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5624 }
5625 
5626 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5627                                                bool merge, BasicType bt, int vec_enc) {
5628   if (opcode == Op_CompressV) {
5629     switch(bt) {
5630     case T_BYTE:
5631       evpcompressb(dst, mask, src, merge, vec_enc);
5632       break;
5633     case T_CHAR:
5634     case T_SHORT:
5635       evpcompressw(dst, mask, src, merge, vec_enc);
5636       break;
5637     case T_INT:
5638       evpcompressd(dst, mask, src, merge, vec_enc);
5639       break;
5640     case T_FLOAT:
5641       evcompressps(dst, mask, src, merge, vec_enc);
5642       break;
5643     case T_LONG:
5644       evpcompressq(dst, mask, src, merge, vec_enc);
5645       break;
5646     case T_DOUBLE:
5647       evcompresspd(dst, mask, src, merge, vec_enc);
5648       break;
5649     default:
5650       fatal("Unsupported type %s", type2name(bt));
5651       break;
5652     }
5653   } else {
5654     assert(opcode == Op_ExpandV, "");
5655     switch(bt) {
5656     case T_BYTE:
5657       evpexpandb(dst, mask, src, merge, vec_enc);
5658       break;
5659     case T_CHAR:
5660     case T_SHORT:
5661       evpexpandw(dst, mask, src, merge, vec_enc);
5662       break;
5663     case T_INT:
5664       evpexpandd(dst, mask, src, merge, vec_enc);
5665       break;
5666     case T_FLOAT:
5667       evexpandps(dst, mask, src, merge, vec_enc);
5668       break;
5669     case T_LONG:
5670       evpexpandq(dst, mask, src, merge, vec_enc);
5671       break;
5672     case T_DOUBLE:
5673       evexpandpd(dst, mask, src, merge, vec_enc);
5674       break;
5675     default:
5676       fatal("Unsupported type %s", type2name(bt));
5677       break;
5678     }
5679   }
5680 }
5681 
5682 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5683                                            KRegister ktmp1, int vec_enc) {
5684   if (opcode == Op_SignumVD) {
5685     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5687     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5688     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5689     // if src == NaN, -0.0 or 0.0 return src.
5690     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5691     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5692   } else {
5693     assert(opcode == Op_SignumVF, "");
5694     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5696     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5697     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5698     // if src == NaN, -0.0 or 0.0 return src.
5699     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5700     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5701   }
5702 }
5703 
5704 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5705                                           XMMRegister xtmp1, int vec_enc) {
5706   if (opcode == Op_SignumVD) {
5707     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5709     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5710     // if src == NaN, -0.0 or 0.0 return src.
5711     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5712     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5713   } else {
5714     assert(opcode == Op_SignumVF, "");
5715     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5717     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5718     // if src == NaN, -0.0 or 0.0 return src.
5719     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5720     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5721   }
5722 }
5723 
5724 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5725   if (VM_Version::supports_avx512bw()) {
5726     if (mask_len > 32) {
5727       kmovql(dst, src);
5728     } else {
5729       kmovdl(dst, src);
5730       if (mask_len != 32) {
5731         kshiftrdl(dst, dst, 32 - mask_len);
5732       }
5733     }
5734   } else {
5735     assert(mask_len <= 16, "");
5736     kmovwl(dst, src);
5737     if (mask_len != 16) {
5738       kshiftrwl(dst, dst, 16 - mask_len);
5739     }
5740   }
5741 }
5742 
5743 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5744   int lane_size = type2aelembytes(bt);
5745   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5746       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5747     movptr(rtmp, imm32);
5748     switch(lane_size) {
5749       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5750       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5751       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5752       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5755     }
5756   } else {
5757     movptr(rtmp, imm32);
5758     movq(dst, rtmp);
5759     switch(lane_size) {
5760       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5761       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5762       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5763       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5766     }
5767   }
5768 }
5769 
5770 //
// The following is a lookup table based popcount computation algorithm:
5772 //       Index   Bit set count
5773 //     [ 0000 ->   0,
5774 //       0001 ->   1,
5775 //       0010 ->   1,
5776 //       0011 ->   2,
5777 //       0100 ->   1,
5778 //       0101 ->   2,
5779 //       0110 ->   2,
5780 //       0111 ->   3,
5781 //       1000 ->   1,
5782 //       1001 ->   2,
5783 //       1010 ->   3,
5784 //       1011 ->   3,
5785 //       1100 ->   2,
5786 //       1101 ->   3,
5787 //       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5792 //     shuffle indices for lookup table access.
5793 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5794 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5795 //     count of all the bytes of a quadword.
5796 //  f. Perform step e. for upper 128bit vector lane.
5797 //  g. Pack the bitset count of quadwords back to double word.
5798 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5799 
5800 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5801                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5802   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5803   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5804   vpsrlw(dst, src, 4, vec_enc);
5805   vpand(dst, dst, xtmp1, vec_enc);
5806   vpand(xtmp1, src, xtmp1, vec_enc);
5807   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5808   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5809   vpshufb(dst, xtmp2, dst, vec_enc);
5810   vpaddb(dst, dst, xtmp1, vec_enc);
5811 }
5812 
5813 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5814                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5815   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5817   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5818   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5819   vpsadbw(dst, dst, xtmp2, vec_enc);
5820   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5821   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5822   vpackuswb(dst, xtmp1, dst, vec_enc);
5823 }
5824 
5825 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5826                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5827   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5828   // Add the popcount of upper and lower bytes of word.
5829   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5830   vpsrlw(dst, xtmp1, 8, vec_enc);
5831   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5832   vpaddw(dst, dst, xtmp1, vec_enc);
5833 }
5834 
5835 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5836                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5837   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5838   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5839   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5840 }
5841 
5842 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5843                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5844   switch(bt) {
5845     case T_LONG:
5846       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5847       break;
5848     case T_INT:
5849       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5850       break;
5851     case T_CHAR:
5852     case T_SHORT:
5853       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5854       break;
5855     case T_BYTE:
5856     case T_BOOLEAN:
5857       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5858       break;
5859     default:
5860       fatal("Unsupported type %s", type2name(bt));
5861       break;
5862   }
5863 }
5864 
5865 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5866                                                       KRegister mask, bool merge, int vec_enc) {
5867   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5868   switch(bt) {
5869     case T_LONG:
5870       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5871       evpopcntq(dst, mask, src, merge, vec_enc);
5872       break;
5873     case T_INT:
5874       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5875       evpopcntd(dst, mask, src, merge, vec_enc);
5876       break;
5877     case T_CHAR:
5878     case T_SHORT:
5879       assert(VM_Version::supports_avx512_bitalg(), "");
5880       evpopcntw(dst, mask, src, merge, vec_enc);
5881       break;
5882     case T_BYTE:
5883     case T_BOOLEAN:
5884       assert(VM_Version::supports_avx512_bitalg(), "");
5885       evpopcntb(dst, mask, src, merge, vec_enc);
5886       break;
5887     default:
5888       fatal("Unsupported type %s", type2name(bt));
5889       break;
5890   }
5891 }
5892 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value. Thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibbles of the byte.
5899 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5900                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5901   if (VM_Version::supports_avx512vlbw()) {
5902 
5903     // Get the reverse bit sequence of lower nibble of each byte.
5904     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5905     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5906     evpandq(dst, xtmp2, src, vec_enc);
5907     vpshufb(dst, xtmp1, dst, vec_enc);
5908     vpsllq(dst, dst, 4, vec_enc);
5909 
5910     // Get the reverse bit sequence of upper nibble of each byte.
5911     vpandn(xtmp2, xtmp2, src, vec_enc);
5912     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5913     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5914 
    // Perform a logical OR between the left shifted reversed bit sequence of the lower nibble and
    // the right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5917     evporq(xtmp2, dst, xtmp2, vec_enc);
5918     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5919 
5920   } else if(vec_enc == Assembler::AVX_512bit) {
5921     // Shift based bit reversal.
5922     assert(bt == T_LONG || bt == T_INT, "");
5923 
5924     // Swap lower and upper nibble of each byte.
5925     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5926 
5927     // Swap two least and most significant bits of each nibble.
5928     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5929 
5930     // Swap adjacent pair of bits.
5931     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5932     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5933 
5934     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5935     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5936   } else {
5937     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5938     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5939 
5940     // Get the reverse bit sequence of lower nibble of each byte.
5941     vpand(dst, xtmp2, src, vec_enc);
5942     vpshufb(dst, xtmp1, dst, vec_enc);
5943     vpsllq(dst, dst, 4, vec_enc);
5944 
5945     // Get the reverse bit sequence of upper nibble of each byte.
5946     vpandn(xtmp2, xtmp2, src, vec_enc);
5947     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5948     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5949 
    // Perform a logical OR between the left shifted reversed bit sequence of the lower nibble and
    // the right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5952     vpor(xtmp2, dst, xtmp2, vec_enc);
5953     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5954   }
5955 }
5956 
5957 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5958                                                 XMMRegister xtmp, Register rscratch) {
5959   assert(VM_Version::supports_gfni(), "");
5960   assert(rscratch != noreg || always_reachable(mask), "missing");
5961 
5962   // Galois field instruction based bit reversal based on following algorithm.
5963   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5964   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5965   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5966   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5967 }
5968 
5969 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5970                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5971   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5972   evpandq(dst, xtmp1, src, vec_enc);
5973   vpsllq(dst, dst, nbits, vec_enc);
5974   vpandn(xtmp1, xtmp1, src, vec_enc);
5975   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5976   evporq(dst, dst, xtmp1, vec_enc);
5977 }
5978 
5979 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5980                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5981   // Shift based bit reversal.
5982   assert(VM_Version::supports_evex(), "");
5983   switch(bt) {
5984     case T_LONG:
5985       // Swap upper and lower double word of each quad word.
5986       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5987       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5988       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5989       break;
5990     case T_INT:
5991       // Swap upper and lower word of each double word.
5992       evprord(xtmp1, k0, src, 16, true, vec_enc);
5993       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5994       break;
5995     case T_CHAR:
5996     case T_SHORT:
5997       // Swap upper and lower byte of each word.
5998       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5999       break;
6000     case T_BYTE:
6001       evmovdquq(dst, k0, src, true, vec_enc);
6002       break;
6003     default:
6004       fatal("Unsupported type %s", type2name(bt));
6005       break;
6006   }
6007 }
6008 
6009 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6010   if (bt == T_BYTE) {
6011     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6012       evmovdquq(dst, k0, src, true, vec_enc);
6013     } else {
6014       vmovdqu(dst, src);
6015     }
6016     return;
6017   }
6018   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6019   // pre-computed shuffle indices.
6020   switch(bt) {
6021     case T_LONG:
6022       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6023       break;
6024     case T_INT:
6025       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6026       break;
6027     case T_CHAR:
6028     case T_SHORT:
6029       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6030       break;
6031     default:
6032       fatal("Unsupported type %s", type2name(bt));
6033       break;
6034   }
6035   vpshufb(dst, src, dst, vec_enc);
6036 }
6037 
6038 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6039                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6040                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6041   assert(is_integral_type(bt), "");
6042   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6043   assert(VM_Version::supports_avx512cd(), "");
6044   switch(bt) {
6045     case T_LONG:
6046       evplzcntq(dst, ktmp, src, merge, vec_enc);
6047       break;
6048     case T_INT:
6049       evplzcntd(dst, ktmp, src, merge, vec_enc);
6050       break;
6051     case T_SHORT:
6052       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6053       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6054       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6055       vpunpckhwd(dst, xtmp1, src, vec_enc);
6056       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6057       vpackusdw(dst, xtmp2, dst, vec_enc);
6058       break;
6059     case T_BYTE:
6060       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6061       // accessing the lookup table.
6062       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6063       // accessing the lookup table.
6064       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6065       assert(VM_Version::supports_avx512bw(), "");
6066       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6067       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6068       vpand(xtmp2, dst, src, vec_enc);
6069       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6070       vpsrlw(xtmp3, src, 4, vec_enc);
6071       vpand(xtmp3, dst, xtmp3, vec_enc);
6072       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6073       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6074       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6075       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6076       break;
6077     default:
6078       fatal("Unsupported type %s", type2name(bt));
6079       break;
6080   }
6081 }
6082 
6083 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6084                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6085   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6086   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6087   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6088   // accessing the lookup table.
6089   vpand(dst, xtmp2, src, vec_enc);
6090   vpshufb(dst, xtmp1, dst, vec_enc);
6091   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6092   // accessing the lookup table.
6093   vpsrlw(xtmp3, src, 4, vec_enc);
6094   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6095   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6096   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6097   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6098   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6099   vpaddb(dst, dst, xtmp2, vec_enc);
6100   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6101 }
6102 
6103 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6104                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6105   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6106   // Add zero counts of lower byte and upper byte of a word if
6107   // upper byte holds a zero value.
6108   vpsrlw(xtmp3, src, 8, vec_enc);
6109   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6110   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6111   vpsllw(xtmp2, dst, 8, vec_enc);
6112   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6113   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6114   vpsrlw(dst, dst, 8, vec_enc);
6115 }
6116 
6117 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6118                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x normalized form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, max_int and negative source values.
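  //
  // Worked example (for reference): src = 0x00010000 converts to 65536.0f, whose
  // biased exponent is 143, giving LZCNT = 31 - (143 - 127) = 15.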
6124 
6125   // Broadcast 0xFF
6126   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6127   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6128 
6129   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6130   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6131   // contributes to the leading number of zeros.
6132   vpsrld(xtmp2, src, 1, vec_enc);
6133   vpandn(xtmp3, xtmp2, src, vec_enc);
6134 
6135   // Extract biased exponent.
6136   vcvtdq2ps(dst, xtmp3, vec_enc);
6137   vpsrld(dst, dst, 23, vec_enc);
6138   vpand(dst, dst, xtmp1, vec_enc);
6139 
6140   // Broadcast 127.
6141   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6142   // Exponent = biased_exp - 127
6143   vpsubd(dst, dst, xtmp1, vec_enc);
6144 
6145   // Exponent_plus_one = Exponent + 1
6146   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6147   vpaddd(dst, dst, xtmp3, vec_enc);
6148 
  // Replace a negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
6151   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6152   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6153 
6154   // Rematerialize broadcast 32.
6155   vpslld(xtmp1, xtmp3, 5, vec_enc);
6156   // Exponent is 32 if corresponding source lane contains max_int value.
6157   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6158   // LZCNT = 32 - exponent_plus_one
6159   vpsubd(dst, xtmp1, dst, vec_enc);
6160 
6161   // Replace LZCNT with a value 1 if corresponding source lane
6162   // contains max_int value.
6163   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6164 
6165   // Replace biased_exp with 0 if source lane value is less than zero.
6166   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6167   vblendvps(dst, dst, xtmp2, src, vec_enc);
6168 }
6169 
6170 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6171                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6172   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6173   // Add zero counts of lower word and upper word of a double word if
6174   // upper word holds a zero value.
6175   vpsrld(xtmp3, src, 16, vec_enc);
6176   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6177   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6178   vpslld(xtmp2, dst, 16, vec_enc);
6179   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6180   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6181   vpsrld(dst, dst, 16, vec_enc);
6182   // Add zero counts of lower doubleword and upper doubleword of a
6183   // quadword if upper doubleword holds a zero value.
6184   vpsrlq(xtmp3, src, 32, vec_enc);
6185   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6186   vpsllq(xtmp2, dst, 32, vec_enc);
6187   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6188   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6189   vpsrlq(dst, dst, 32, vec_enc);
6190 }
6191 
6192 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6193                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6194                                                        Register rtmp, int vec_enc) {
6195   assert(is_integral_type(bt), "unexpected type");
6196   assert(vec_enc < Assembler::AVX_512bit, "");
6197   switch(bt) {
6198     case T_LONG:
6199       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6200       break;
6201     case T_INT:
6202       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6203       break;
6204     case T_SHORT:
6205       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6206       break;
6207     case T_BYTE:
6208       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6209       break;
6210     default:
6211       fatal("Unsupported type %s", type2name(bt));
6212       break;
6213   }
6214 }
6215 
6216 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6217   switch(bt) {
6218     case T_BYTE:
6219       vpsubb(dst, src1, src2, vec_enc);
6220       break;
6221     case T_SHORT:
6222       vpsubw(dst, src1, src2, vec_enc);
6223       break;
6224     case T_INT:
6225       vpsubd(dst, src1, src2, vec_enc);
6226       break;
6227     case T_LONG:
6228       vpsubq(dst, src1, src2, vec_enc);
6229       break;
6230     default:
6231       fatal("Unsupported type %s", type2name(bt));
6232       break;
6233   }
6234 }
6235 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6240 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6241                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6242                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6243   assert(is_integral_type(bt), "");
6244   // xtmp = -1
6245   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6246   // xtmp = xtmp + src
6247   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6248   // xtmp = xtmp & ~src
6249   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6250   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6251   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6252   vpsub(bt, dst, xtmp4, dst, vec_enc);
6253 }
6254 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6257 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6258                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6259   assert(is_integral_type(bt), "");
6260   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6262   // xtmp = 0 - src
6263   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6264   // xtmp = xtmp | src
6265   vpor(xtmp3, xtmp3, src, vec_enc);
6266   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6267   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6268   vpsub(bt, dst, xtmp1, dst, vec_enc);
6269 }
6270 
6271 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6272   Label done;
6273   Label neg_divisor_fastpath;
6274   cmpl(divisor, 0);
6275   jccb(Assembler::less, neg_divisor_fastpath);
6276   xorl(rdx, rdx);
6277   divl(divisor);
6278   jmpb(done);
6279   bind(neg_divisor_fastpath);
6280   // Fastpath for divisor < 0:
6281   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6282   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6283   movl(rdx, rax);
6284   subl(rdx, divisor);
6285   if (VM_Version::supports_bmi1()) {
6286     andnl(rax, rdx, rax);
6287   } else {
6288     notl(rdx);
6289     andl(rax, rdx);
6290   }
6291   shrl(rax, 31);
6292   bind(done);
6293 }
6294 
6295 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6296   Label done;
6297   Label neg_divisor_fastpath;
6298   cmpl(divisor, 0);
6299   jccb(Assembler::less, neg_divisor_fastpath);
6300   xorl(rdx, rdx);
6301   divl(divisor);
6302   jmpb(done);
6303   bind(neg_divisor_fastpath);
6304   // Fastpath when divisor < 0:
6305   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6306   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6307   movl(rdx, rax);
6308   subl(rax, divisor);
6309   if (VM_Version::supports_bmi1()) {
6310     andnl(rax, rax, rdx);
6311   } else {
6312     notl(rax);
6313     andl(rax, rdx);
6314   }
6315   sarl(rax, 31);
6316   andl(rax, divisor);
6317   subl(rdx, rax);
6318   bind(done);
6319 }
6320 
6321 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6322   Label done;
6323   Label neg_divisor_fastpath;
6324 
6325   cmpl(divisor, 0);
6326   jccb(Assembler::less, neg_divisor_fastpath);
6327   xorl(rdx, rdx);
6328   divl(divisor);
6329   jmpb(done);
6330   bind(neg_divisor_fastpath);
6331   // Fastpath for divisor < 0:
6332   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6333   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6334   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6335   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6336   movl(rdx, rax);
6337   subl(rax, divisor);
6338   if (VM_Version::supports_bmi1()) {
6339     andnl(rax, rax, rdx);
6340   } else {
6341     notl(rax);
6342     andl(rax, rdx);
6343   }
6344   movl(tmp, rax);
6345   shrl(rax, 31); // quotient
6346   sarl(tmp, 31);
6347   andl(tmp, divisor);
6348   subl(rdx, tmp); // remainder
6349   bind(done);
6350 }
6351 
6352 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6353                                  XMMRegister xtmp2, Register rtmp) {
6354   if(VM_Version::supports_gfni()) {
6355     // Galois field instruction based bit reversal based on following algorithm.
6356     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6357     mov64(rtmp, 0x8040201008040201L);
6358     movq(xtmp1, src);
6359     movq(xtmp2, rtmp);
6360     gf2p8affineqb(xtmp1, xtmp2, 0);
6361     movq(dst, xtmp1);
6362   } else {
6363     // Swap even and odd numbered bits.
6364     movl(rtmp, src);
6365     andl(rtmp, 0x55555555);
6366     shll(rtmp, 1);
6367     movl(dst, src);
6368     andl(dst, 0xAAAAAAAA);
6369     shrl(dst, 1);
6370     orl(dst, rtmp);
6371 
6372     // Swap LSB and MSB 2 bits of each nibble.
6373     movl(rtmp, dst);
6374     andl(rtmp, 0x33333333);
6375     shll(rtmp, 2);
6376     andl(dst, 0xCCCCCCCC);
6377     shrl(dst, 2);
6378     orl(dst, rtmp);
6379 
6380     // Swap LSB and MSB 4 bits of each byte.
6381     movl(rtmp, dst);
6382     andl(rtmp, 0x0F0F0F0F);
6383     shll(rtmp, 4);
6384     andl(dst, 0xF0F0F0F0);
6385     shrl(dst, 4);
6386     orl(dst, rtmp);
6387   }
6388   bswapl(dst);
6389 }
6390 
6391 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6392                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6393   if(VM_Version::supports_gfni()) {
6394     // Galois field instruction based bit reversal based on following algorithm.
6395     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6396     mov64(rtmp1, 0x8040201008040201L);
6397     movq(xtmp1, src);
6398     movq(xtmp2, rtmp1);
6399     gf2p8affineqb(xtmp1, xtmp2, 0);
6400     movq(dst, xtmp1);
6401   } else {
6402     // Swap even and odd numbered bits.
6403     movq(rtmp1, src);
6404     mov64(rtmp2, 0x5555555555555555L);
6405     andq(rtmp1, rtmp2);
6406     shlq(rtmp1, 1);
6407     movq(dst, src);
6408     notq(rtmp2);
6409     andq(dst, rtmp2);
6410     shrq(dst, 1);
6411     orq(dst, rtmp1);
6412 
6413     // Swap LSB and MSB 2 bits of each nibble.
6414     movq(rtmp1, dst);
6415     mov64(rtmp2, 0x3333333333333333L);
6416     andq(rtmp1, rtmp2);
6417     shlq(rtmp1, 2);
6418     notq(rtmp2);
6419     andq(dst, rtmp2);
6420     shrq(dst, 2);
6421     orq(dst, rtmp1);
6422 
6423     // Swap LSB and MSB 4 bits of each byte.
6424     movq(rtmp1, dst);
6425     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6426     andq(rtmp1, rtmp2);
6427     shlq(rtmp1, 4);
6428     notq(rtmp2);
6429     andq(dst, rtmp2);
6430     shrq(dst, 4);
6431     orq(dst, rtmp1);
6432   }
6433   bswapq(dst);
6434 }
6435 
6436 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6437   Label done;
6438   Label neg_divisor_fastpath;
6439   cmpq(divisor, 0);
6440   jccb(Assembler::less, neg_divisor_fastpath);
6441   xorl(rdx, rdx);
6442   divq(divisor);
6443   jmpb(done);
6444   bind(neg_divisor_fastpath);
6445   // Fastpath for divisor < 0:
6446   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6447   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6448   movq(rdx, rax);
6449   subq(rdx, divisor);
6450   if (VM_Version::supports_bmi1()) {
6451     andnq(rax, rdx, rax);
6452   } else {
6453     notq(rdx);
6454     andq(rax, rdx);
6455   }
6456   shrq(rax, 63);
6457   bind(done);
6458 }
6459 
6460 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6461   Label done;
6462   Label neg_divisor_fastpath;
6463   cmpq(divisor, 0);
6464   jccb(Assembler::less, neg_divisor_fastpath);
6465   xorq(rdx, rdx);
6466   divq(divisor);
6467   jmp(done);
6468   bind(neg_divisor_fastpath);
6469   // Fastpath when divisor < 0:
6470   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6471   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6472   movq(rdx, rax);
6473   subq(rax, divisor);
6474   if (VM_Version::supports_bmi1()) {
6475     andnq(rax, rax, rdx);
6476   } else {
6477     notq(rax);
6478     andq(rax, rdx);
6479   }
6480   sarq(rax, 63);
6481   andq(rax, divisor);
6482   subq(rdx, rax);
6483   bind(done);
6484 }
6485 
6486 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6487   Label done;
6488   Label neg_divisor_fastpath;
6489   cmpq(divisor, 0);
6490   jccb(Assembler::less, neg_divisor_fastpath);
6491   xorq(rdx, rdx);
6492   divq(divisor);
6493   jmp(done);
6494   bind(neg_divisor_fastpath);
6495   // Fastpath for divisor < 0:
6496   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6497   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6498   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6499   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6500   movq(rdx, rax);
6501   subq(rax, divisor);
6502   if (VM_Version::supports_bmi1()) {
6503     andnq(rax, rax, rdx);
6504   } else {
6505     notq(rax);
6506     andq(rax, rdx);
6507   }
6508   movq(tmp, rax);
6509   shrq(rax, 63); // quotient
6510   sarq(tmp, 63);
6511   andq(tmp, divisor);
6512   subq(rdx, tmp); // remainder
6513   bind(done);
6514 }
6515 
6516 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6517                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6518                                         int vlen_enc) {
6519   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices differing
  // by a multiple of 16 map to the same relative position within a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
  // select the same in-lane position as index 0 in their respective 128 bit lanes.
6526   movl(rtmp, 16);
6527   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6528 
  // Compute a mask for the shuffle vector by comparing indices against the expression INDEX < 16,
  // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices, and move the shuffled lanes corresponding to the true
  // mask into the destination vector.
6533   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6534   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6535   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6536 
6537   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6538   // and broadcasting second 128 bit lane.
6539   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6540   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6541   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6542   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6543   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6544 
6545   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6546   // and broadcasting third 128 bit lane.
6547   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6548   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6549   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6550   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6551   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6552 
6553   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6555   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6556   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6557   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6558   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6559   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6560 }
6561 
6562 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6563                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6564   if (vlen_enc == AVX_128bit) {
6565     vpermilps(dst, src, shuffle, vlen_enc);
6566   } else if (bt == T_INT) {
6567     vpermd(dst, shuffle, src, vlen_enc);
6568   } else {
6569     assert(bt == T_FLOAT, "");
6570     vpermps(dst, shuffle, src, vlen_enc);
6571   }
6572 }
6573 
6574 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6575   switch(opcode) {
6576     case Op_AddHF: vaddsh(dst, src1, src2); break;
6577     case Op_SubHF: vsubsh(dst, src1, src2); break;
6578     case Op_MulHF: vmulsh(dst, src1, src2); break;
6579     case Op_DivHF: vdivsh(dst, src1, src2); break;
6580     default: assert(false, "%s", NodeClassNames[opcode]); break;
6581   }
6582 }
6583 
6584 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6585   switch(elem_bt) {
6586     case T_BYTE:
6587       if (ideal_opc == Op_SaturatingAddV) {
6588         vpaddsb(dst, src1, src2, vlen_enc);
6589       } else {
6590         assert(ideal_opc == Op_SaturatingSubV, "");
6591         vpsubsb(dst, src1, src2, vlen_enc);
6592       }
6593       break;
6594     case T_SHORT:
6595       if (ideal_opc == Op_SaturatingAddV) {
6596         vpaddsw(dst, src1, src2, vlen_enc);
6597       } else {
6598         assert(ideal_opc == Op_SaturatingSubV, "");
6599         vpsubsw(dst, src1, src2, vlen_enc);
6600       }
6601       break;
6602     default:
6603       fatal("Unsupported type %s", type2name(elem_bt));
6604       break;
6605   }
6606 }
6607 
6608 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6609   switch(elem_bt) {
6610     case T_BYTE:
6611       if (ideal_opc == Op_SaturatingAddV) {
6612         vpaddusb(dst, src1, src2, vlen_enc);
6613       } else {
6614         assert(ideal_opc == Op_SaturatingSubV, "");
6615         vpsubusb(dst, src1, src2, vlen_enc);
6616       }
6617       break;
6618     case T_SHORT:
6619       if (ideal_opc == Op_SaturatingAddV) {
6620         vpaddusw(dst, src1, src2, vlen_enc);
6621       } else {
6622         assert(ideal_opc == Op_SaturatingSubV, "");
6623         vpsubusw(dst, src1, src2, vlen_enc);
6624       }
6625       break;
6626     default:
6627       fatal("Unsupported type %s", type2name(elem_bt));
6628       break;
6629   }
6630 }
6631 
6632 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6633                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6635   // overflow_mask = Inp1 <u Inp2
6636   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6637   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6638   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6639 }
6640 
6641 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6642                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6643   // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
6645   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6646   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6647   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6648 
6649   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6650 
6651   // Res = INP1 - INP2 (non-commutative and non-associative)
6652   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6653   // Res = Mask ? Zero : Res
6654   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6655   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6656 }
6657 
6658 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6659                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6661   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6662   // Res = Signed Add INP1, INP2
6663   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6664   // T1 = SRC1 | SRC2
6665   vpor(xtmp1, src1, src2, vlen_enc);
6666   // Max_Unsigned = -1
6667   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6668   // Unsigned compare:  Mask = Res <u T1
6669   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6670   // res  = Mask ? Max_Unsigned : Res
6671   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6672 }
6673 
6674 //
6675 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6676 // unsigned addition operation.
6677 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6678 //
6679 // We empirically determined its semantic equivalence to following reduced expression
6680 //    overflow_mask =  (a + b) <u (a | b)
6681 //
6682 // and also verified it though Alive2 solver.
6683 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6684 //
6685 
6686 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6687                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6688   // Res = Signed Add INP1, INP2
6689   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6690   // Compute T1 = INP1 | INP2
6691   vpor(xtmp3, src1, src2, vlen_enc);
6692   // T1 = Minimum signed value.
6693   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6694   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6695   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6696   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6697   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6698   // Compute overflow detection mask = Res<1> <s T1
6699   if (elem_bt == T_INT) {
6700     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6701   } else {
6702     assert(elem_bt == T_LONG, "");
6703     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6704   }
6705   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6706 }
6707 
6708 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6709                                       int vlen_enc, bool xtmp2_hold_M1) {
6710   if (VM_Version::supports_avx512dq()) {
6711     evpmovq2m(ktmp, src, vlen_enc);
6712   } else {
6713     assert(VM_Version::supports_evex(), "");
6714     if (!xtmp2_hold_M1) {
6715       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6716     }
6717     evpsraq(xtmp1, src, 63, vlen_enc);
6718     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6719   }
6720 }
6721 
6722 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6723                                       int vlen_enc, bool xtmp2_hold_M1) {
6724   if (VM_Version::supports_avx512dq()) {
6725     evpmovd2m(ktmp, src, vlen_enc);
6726   } else {
6727     assert(VM_Version::supports_evex(), "");
6728     if (!xtmp2_hold_M1) {
6729       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6730     }
6731     vpsrad(xtmp1, src, 31, vlen_enc);
6732     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6733   }
6734 }
6735 
6736 
6737 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6738   if (elem_bt == T_LONG) {
6739     if (VM_Version::supports_evex()) {
6740       evpsraq(dst, src, 63, vlen_enc);
6741     } else {
6742       vpsrad(dst, src, 31, vlen_enc);
6743       vpshufd(dst, dst, 0xF5, vlen_enc);
6744     }
6745   } else {
6746     assert(elem_bt == T_INT, "");
6747     vpsrad(dst, src, 31, vlen_enc);
6748   }
6749 }
6750 
6751 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6752   if (compute_allones) {
6753     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6754       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6755     } else {
6756       vpcmpeqq(allones, allones, allones, vlen_enc);
6757     }
6758   }
6759   if (elem_bt == T_LONG) {
6760     vpsrlq(dst, allones, 1, vlen_enc);
6761   } else {
6762     assert(elem_bt == T_INT, "");
6763     vpsrld(dst, allones, 1, vlen_enc);
6764   }
6765 }
6766 
6767 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6768   if (compute_allones) {
6769     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6770       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6771     } else {
6772       vpcmpeqq(allones, allones, allones, vlen_enc);
6773     }
6774   }
6775   if (elem_bt == T_LONG) {
6776     vpsllq(dst, allones, 63, vlen_enc);
6777   } else {
6778     assert(elem_bt == T_INT, "");
6779     vpslld(dst, allones, 31, vlen_enc);
6780   }
6781 }
6782 
6783 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6784                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6785   switch(elem_bt) {
6786     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6787     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6788     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6789     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6790     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6791   }
6792 }
6793 
6794 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6795   switch(elem_bt) {
6796     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6797     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6798     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6799     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6800     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6801   }
6802 }
6803 
6804 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6805                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6806   if (elem_bt == T_LONG) {
6807     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6808   } else {
6809     assert(elem_bt == T_INT, "");
6810     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6811   }
6812 }
6813 
6814 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6815                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6816                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6817   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6818   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6819   // Overflow detection is based on Hacker's Delight, section 2-13.
6820   if (ideal_opc == Op_SaturatingAddV) {
6821     // res = src1 + src2
6822     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6823     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6824     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
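         // e.g. (T_INT, illustrative values): src1 = 0x7FFFFFFF, src2 = 1 gives res = 0x80000000;
         // (res ^ src1) = 0xFFFFFFFF and (res ^ src2) = 0x80000001, so their AND has the sign bit
         // set and the lane is flagged as overflowed.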
6825     vpxor(xtmp1, dst, src1, vlen_enc);
6826     vpxor(xtmp2, dst, src2, vlen_enc);
6827     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6828   } else {
6829     assert(ideal_opc == Op_SaturatingSubV, "");
6830     // res = src1 - src2
6831     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6832     // Overflow occurs when the inputs have opposite signs and
6833     // the result's sign differs from the sign of the first input.
6834     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
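         // e.g. (T_INT, illustrative values): src1 = 0x80000000 (MIN_VALUE), src2 = 1 gives
         // res = 0x7FFFFFFF; (src1 ^ src2) = 0x80000001 and (res ^ src1) = 0xFFFFFFFF, so their
         // AND has the sign bit set and the lane is flagged as overflowed.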
6835     vpxor(xtmp1, src1, src2, vlen_enc);
6836     vpxor(xtmp2, dst, src1, vlen_enc);
6837     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6838   }
6839 
6840   // Compute overflow detection mask.
6841   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6842   // Note: xtmp1 holds -1 in all its lanes after the above call.
6843 
6844   // Compute mask based on first input polarity.
6845   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6846 
6847   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6848   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6849 
6850   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6851   // set bits in the first-input polarity mask hold the MIN value.
6852   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6853   // Blend destination lanes with saturated values using overflow detection mask.
6854   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6855 }
6856 
6857 
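     // AVX (non-EVEX) flavour of the routine above: opmask registers are not available, so the
     // overflow and first-input polarity masks are kept as vectors of 0/-1 lanes (see
     // vpsign_extend_dq) and the final selection is done with vpblendvb, which is safe here
     // because every byte of a mask lane carries the same sign-extended value.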
6858 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6859                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6860                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6861   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6862   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6863   // Overflow detection is based on Hacker's Delight, section 2-13.
6864   if (ideal_opc == Op_SaturatingAddV) {
6865     // res = src1 + src2
6866     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6867     // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6868     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6869     vpxor(xtmp1, dst, src1, vlen_enc);
6870     vpxor(xtmp2, dst, src2, vlen_enc);
6871     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6872   } else {
6873     assert(ideal_opc == Op_SaturatingSubV, "");
6874     // res = src1 - src2
6875     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6876     // Overflow occurs when the inputs have opposite signs and
6877     // the result's sign differs from the sign of the first input.
6878     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6879     vpxor(xtmp1, src1, src2, vlen_enc);
6880     vpxor(xtmp2, dst, src1, vlen_enc);
6881     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6882   }
6883 
6884   // Sign-extend to compute overflow detection mask.
6885   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6886 
6887   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6888   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6889   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6890 
6891   // Compose saturating min/max vector using first input polarity mask.
6892   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6893   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6894 
6895   // Blend result with saturating vector using overflow detection mask.
6896   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6897 }
6898 
6899 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6900   switch(elem_bt) {
6901     case T_BYTE:
6902       if (ideal_opc == Op_SaturatingAddV) {
6903         vpaddsb(dst, src1, src2, vlen_enc);
6904       } else {
6905         assert(ideal_opc == Op_SaturatingSubV, "");
6906         vpsubsb(dst, src1, src2, vlen_enc);
6907       }
6908       break;
6909     case T_SHORT:
6910       if (ideal_opc == Op_SaturatingAddV) {
6911         vpaddsw(dst, src1, src2, vlen_enc);
6912       } else {
6913         assert(ideal_opc == Op_SaturatingSubV, "");
6914         vpsubsw(dst, src1, src2, vlen_enc);
6915       }
6916       break;
6917     default:
6918       fatal("Unsupported type %s", type2name(elem_bt));
6919       break;
6920   }
6921 }
6922 
6923 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6924   switch(elem_bt) {
6925     case T_BYTE:
6926       if (ideal_opc == Op_SaturatingAddV) {
6927         vpaddusb(dst, src1, src2, vlen_enc);
6928       } else {
6929         assert(ideal_opc == Op_SaturatingSubV, "");
6930         vpsubusb(dst, src1, src2, vlen_enc);
6931       }
6932       break;
6933     case T_SHORT:
6934       if (ideal_opc == Op_SaturatingAddV) {
6935         vpaddusw(dst, src1, src2, vlen_enc);
6936       } else {
6937         assert(ideal_opc == Op_SaturatingSubV, "");
6938         vpsubusw(dst, src1, src2, vlen_enc);
6939       }
6940       break;
6941     default:
6942       fatal("Unsupported type %s", type2name(elem_bt));
6943       break;
6944   }
6945 }
6946 
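     // Two-table permute (VPERMI2* semantics): on entry dst holds the per-lane selection indices
     // and each result lane is taken from the concatenation of src1 and src2 at that index.
     // Illustrative scalar sketch (not emitted code), assuming N lanes per source:
     //   idx = dst[i] & (2 * N - 1);
     //   dst[i] = (idx < N) ? src1[idx] : src2[idx - N];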
6947 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6948                                                      XMMRegister src2, int vlen_enc) {
6949   switch(elem_bt) {
6950     case T_BYTE:
6951       evpermi2b(dst, src1, src2, vlen_enc);
6952       break;
6953     case T_SHORT:
6954       evpermi2w(dst, src1, src2, vlen_enc);
6955       break;
6956     case T_INT:
6957       evpermi2d(dst, src1, src2, vlen_enc);
6958       break;
6959     case T_LONG:
6960       evpermi2q(dst, src1, src2, vlen_enc);
6961       break;
6962     case T_FLOAT:
6963       evpermi2ps(dst, src1, src2, vlen_enc);
6964       break;
6965     case T_DOUBLE:
6966       evpermi2pd(dst, src1, src2, vlen_enc);
6967       break;
6968     default:
6969       fatal("Unsupported type %s", type2name(elem_bt));
6970       break;
6971   }
6972 }
6973 
6974 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6975   if (is_unsigned) {
6976     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6977   } else {
6978     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6979   }
6980 }
6981 
6982 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6983   if (is_unsigned) {
6984     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6985   } else {
6986     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6987   }
6988 }
6989 
6990 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6991                                           KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6992   if (opcode == Op_MaxHF) {
6993     // Move sign bits of src2 to mask register.
6994     evpmovw2m(ktmp, src2, vlen_enc);
6995     // xtmp1 = src2 < 0 ? src2 : src1
6996     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
6997     // xtmp2 = src2 < 0 ? src1 : src2
6998     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6999     // The idea behind the above swapping is to make the second source operand a +ve value.
7000     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7001     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7002     // the second source operand, either a NaN or a valid floating-point value, is returned.
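         // e.g. (illustrative values): src1 = +0.0, src2 = -0.0 gives xtmp1 = -0.0, xtmp2 = +0.0 and
         // vmaxsh returns +0.0, as required; src1 = NaN, src2 = 1.0 gives dst = 1.0 here, and the
         // NaN fix-up below restores dst = NaN from xtmp1.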
7003     // dst = max(xtmp1, xtmp2)
7004     vmaxsh(dst, xtmp1, xtmp2);
7005     // isNaN = is_unordered_quiet(xtmp1)
7006     evcmpsh(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q);
7007     // The final result is the same as the first source if it is a NaN;
7008     // if the second operand holds a NaN then, as per the above semantics,
7009     // the result is already the same as the second operand.
7010     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7011   } else {
7012     assert(opcode == Op_MinHF, "");
7013     // Move sign bits of src1 to mask register.
7014     evpmovw2m(ktmp, src1, vlen_enc);
7015     // xtmp1 = src1 < 0 ? src2 : src1
7016     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7017     // xtmp2 = src1 < 0 ? src1 : src2
7018     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7019     // The idea behind the above swapping is to make the second source operand a -ve value.
7020     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7021     // the second source operand is returned.
7022     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7023     // or a valid floating-point value, is written to the result.
7024     // dst = min(xtmp1, xtmp2)
7025     vminsh(dst, xtmp1, xtmp2);
7026     // isNaN = is_unordered_quiet(xtmp1)
7027     evcmpsh(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q);
7028     // The final result is the same as the first source if it is a NaN;
7029     // if the second operand holds a NaN then, as per the above semantics,
7030     // the result is already the same as the second operand.
7031     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7032   }
7033 }