1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  56   // code safely. The push to verify stack depth is ok at 5 bytes,
  57   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  58   // stack bang then we must use the 6 byte frame allocation even if
  59   // we have no frame. :-(
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  67   // Calls to C2R adapters often do not accept exceptional returns.
  68   // We require that their callers bang for them.  But be careful, because
  69   // some VM calls (such as call site linkage) can use several kilobytes of
  70   // stack.  The stack safety zone should account for that, though.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
  75     // We always push rbp so that on return to the interpreter rbp will be
  76     // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
  82     // Remove word for ebp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
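
       // Sketch of the resulting frame shape (both branches above build the same layout):
       //   [ return address ]
       //   [ saved rbp      ]   <- rbp when PreserveFramePointer is set
       //   [ frame body ... ]
       //   [ ...            ]   <- rsp after the prologue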
 104 
 105   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 108   }
 109 
 110 #ifdef ASSERT
 111   if (VerifyStackAtCalls) {
 112     Label L;
 113     push(rax);
 114     mov(rax, rsp);
 115     andptr(rax, StackAlignmentInBytes-1);
 116     cmpptr(rax, StackAlignmentInBytes-wordSize);
 117     pop(rax);
 118     jcc(Assembler::equal, L);
 119     STOP("Stack is not properly aligned!");
 120     bind(L);
 121   }
 122 #endif
 123 
 124   if (!is_stub) {
 125     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 126     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 127     Label dummy_slow_path;
 128     Label dummy_continuation;
 129     Label* slow_path = &dummy_slow_path;
 130     Label* continuation = &dummy_continuation;
 131     if (!Compile::current()->output()->in_scratch_emit_size()) {
 132       // Use real labels from the actual stub when not emitting code just to measure its size
 133       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 134       Compile::current()->output()->add_stub(stub);
 135       slow_path = &stub->entry();
 136       continuation = &stub->continuation();
 137     }
 138     bs->nmethod_entry_barrier(this, slow_path, continuation);
 139   }
 140 }
 141 
 142 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 143   switch (vlen_in_bytes) {
 144     case  4: // fall-through
 145     case  8: // fall-through
 146     case 16: return Assembler::AVX_128bit;
 147     case 32: return Assembler::AVX_256bit;
 148     case 64: return Assembler::AVX_512bit;
 149 
 150     default: {
 151       ShouldNotReachHere();
 152       return Assembler::AVX_NoVec;
 153     }
 154   }
 155 }
 156 
 157 // fast_lock and fast_unlock used by C2
 158 
 159 // Because the transitions from emitted code to the runtime
 160 // monitorenter/exit helper stubs are so slow it's critical that
 161 // we inline both the stack-locking fast path and the inflated fast path.
 162 //
 163 // See also: cmpFastLock and cmpFastUnlock.
 164 //
 165 // What follows is a specialized inline transliteration of the code
 166 // in enter() and exit(). If we're concerned about I$ bloat another
 167 // option would be to emit TrySlowEnter and TrySlowExit methods
 168 // at startup-time.  These methods would accept arguments as
 169 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 170 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 171 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 172 // In practice, however, the # of lock sites is bounded and is usually small.
 173 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 174 // if the processor uses simple bimodal branch predictors keyed by EIP,
 175 // since the helper routines would be called from multiple synchronization
 176 // sites.
 177 //
 178 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 179 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 180 // to those specialized methods.  That'd give us a mostly platform-independent
 181 // implementation that the JITs could optimize and inline at their pleasure.
 182 // Done correctly, the only time we'd need to cross to native code would be
 183 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 184 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 185 // (b) provide explicit barriers or fence operations.
 186 //
 187 // TODO:
 188 //
 189 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 190 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 191 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 192 //    the lock operators would typically be faster than reifying Self.
 193 //
 194 // *  Ideally I'd define the primitives as:
 195 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 196 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 197 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 198 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 199 //    Furthermore the register assignments are overconstrained, possibly resulting in
 200 //    sub-optimal code near the synchronization site.
 201 //
 202 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 203 //    Alternately, use a better sp-proximity test.
 204 //
 205 // *  Currently ObjectMonitor::_owner can hold either an sp value or a (THREAD *) value.
 206 //    Either one is sufficient to uniquely identify a thread.
 207 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 208 //
 209 // *  Intrinsify notify() and notifyAll() for the common cases where the
 210 //    object is locked by the calling thread but the waitlist is empty, avoiding
 211 //    the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 212 //
 213 // *  Use jccb and jmpb instead of jcc and jmp to improve code density.
 214 //    But beware of excessive branch density on AMD Opterons.
 215 //
 216 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 217 //    or failure of the fast path.  If the fast path fails then we pass
 218 //    control to the slow path, typically in C.  In fast_lock and
 219 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 220 //    will emit a conditional branch immediately after the node.
 221 //    So we have branches to branches and lots of ICC.ZF games.
 222 //    Instead, it might be better to have C2 pass a "FailureLabel"
 223 //    into fast_lock and fast_unlock.  In the case of success, control
 224 //    will drop through the node.  ICC.ZF is undefined at exit.
 225 //    In the case of failure, the node will branch directly to the
 226 //    FailureLabel.
 227 
 228 
 229 // obj: object to lock
 230 // box: on-stack box address (displaced header location) - KILLED
 231 // rax: tmp -- KILLED
 232 // scr: tmp -- KILLED
 233 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 234                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 235                                  Metadata* method_data) {
 236   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 237   // Ensure the register assignments are disjoint
 238   assert(tmpReg == rax, "");
 239   assert(cx1Reg == noreg, "");
 240   assert(cx2Reg == noreg, "");
 241   assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 242 
 243   // Possible cases that we'll encounter in fast_lock
 244   // ------------------------------------------------
 245   // * Inflated
 246   //    -- unlocked
 247   //    -- Locked
 248   //       = by self
 249   //       = by other
 250   // * neutral
 251   // * stack-locked
 252   //    -- by self
 253   //       = sp-proximity test hits
 254   //       = sp-proximity test generates false-negative
 255   //    -- by other
 256   //
 257 
 258   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 259 
 260   if (DiagnoseSyncOnValueBasedClasses != 0) {
 261     load_klass(tmpReg, objReg, scrReg);
 262     testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 263     jcc(Assembler::notZero, DONE_LABEL);
 264   }
 265 
 266   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 267   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 268   jcc(Assembler::notZero, IsInflated);
 269 
 270   if (LockingMode == LM_MONITOR) {
 271     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 272     testptr(objReg, objReg);
 273   } else {
 274     assert(LockingMode == LM_LEGACY, "must be");
 275     // Attempt stack-locking ...
 276     orptr (tmpReg, markWord::unlocked_value);
 277     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 278     lock();
 279     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 280     jcc(Assembler::equal, COUNT);           // Success
 281 
 282     // Recursive locking.
 283     // The object is stack-locked: markword contains stack pointer to BasicLock.
 284     // Locked by current thread if difference with current SP is less than one page.
 285     subptr(tmpReg, rsp);
 286     // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 287     andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
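         // For illustration, with a 4 KiB page the mask above is 0xfffffffffffff007:
         // the AND clears bits 3..11 and keeps bits 0..2 and 12..63, so the result
         // (and ZF) is zero exactly when the fetched mark (a BasicLock address for a
         // stack-locked object) minus rsp is a non-negative, 8-byte-aligned offset
         // smaller than one page, i.e. the lock lives in one of this thread's most
         // recent frames.  That zero, stored into the box below, is what fast_unlock
         // later treats as a recursive stack-lock.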
 288     movptr(Address(boxReg, 0), tmpReg);
 289   }
 290   jmp(DONE_LABEL);
 291 
 292   bind(IsInflated);
 293   // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value
 294 
 295   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 296   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 297   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 298 
 299   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 300   movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
 301   movq(scrReg, tmpReg);
 302   xorq(tmpReg, tmpReg);
 303   lock();
 304   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
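       // rax is zero (the "no owner" value) and boxReg holds this thread's monitor
       // owner id, so the CAS above claims the monitor only if it was unowned.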
 305 
 306   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 307   jccb(Assembler::equal, COUNT);    // CAS above succeeded; propagate ZF = 1 (success)
 308 
 309   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 310   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 311   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 312   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 313   bind(DONE_LABEL);
 314 
 315   // ZFlag == 1 count in fast path
 316   // ZFlag == 0 count in slow path
 317   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 318 
 319   bind(COUNT);
 320   if (LockingMode == LM_LEGACY) {
 321     // Count monitors in fast path
 322     increment(Address(thread, JavaThread::held_monitor_count_offset()));
 323   }
 324   xorl(tmpReg, tmpReg); // Set ZF == 1
 325 
 326   bind(NO_COUNT);
 327 
 328   // At NO_COUNT the icc ZFlag is set as follows ...
 329   // fast_unlock uses the same protocol.
 330   // ZFlag == 1 -> Success
 331   // ZFlag == 0 -> Failure - force control through the slow path
 332 }
 333 
 334 // obj: object to unlock
 335 // box: box address (displaced header location), killed.  Must be EAX.
 336 // tmp: killed, cannot be obj nor box.
 337 //
 338 // Some commentary on balanced locking:
 339 //
 340 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 341 // Methods that don't have provably balanced locking are forced to run in the
 342 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 343 // The interpreter provides two properties:
 344 // I1:  At return-time the interpreter automatically and quietly unlocks any
 345 //      objects acquired by the current activation (frame).  Recall that the
 346 //      interpreter maintains an on-stack list of locks currently held by
 347 //      a frame.
 348 // I2:  If a method attempts to unlock an object that is not held by
 349 //      the frame, the interpreter throws IMSX.
 350 //
 351 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 352 // B() doesn't have provably balanced locking so it runs in the interpreter.
 353 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 354 // is still locked by A().
 355 //
 356 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 357 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 358 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 359 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 360 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 361 // could reasonably *avoid* checking owner in fast_unlock().
 362 // In the interest of performance we elide the m->_owner == Self check in unlock.
 363 // A perfectly viable alternative is to elide the owner check except when
 364 // Xcheck:jni is enabled.
 365 
 366 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
 367   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 368   assert(boxReg == rax, "");
 369   assert_different_registers(objReg, boxReg, tmpReg);
 370 
 371   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 372 
 373   if (LockingMode == LM_LEGACY) {
 374     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 375     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 376   }
 377   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 378   if (LockingMode != LM_MONITOR) {
 379     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 380     jcc(Assembler::zero, Stacked);
 381   }
 382 
 383   // It's inflated.
 384 
 385   // Despite our balanced locking property we still check that m->_owner == Self
 386   // as java routines or native JNI code called by this thread might
 387   // have released the lock.
 388   //
 389   // If there's no contention try a 1-0 exit.  That is, exit without
 390   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 391   // we detect and recover from the race that the 1-0 exit admits.
 392   //
 393   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 394   // before it STs null into _owner, releasing the lock.  Updates
 395   // to data protected by the critical section must be visible before
 396   // we drop the lock (and thus before any other thread could acquire
 397   // the lock and observe the fields protected by the lock).
 398   // IA32's memory-model is SPO, so STs are ordered with respect to
 399   // each other and there's no need for an explicit barrier (fence).
 400   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 401   Label LSuccess, LNotRecursive;
 402 
 403   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 404   jccb(Assembler::equal, LNotRecursive);
 405 
 406   // Recursive inflated unlock
 407   decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 408   jmpb(LSuccess);
 409 
 410   bind(LNotRecursive);
 411 
 412   // Set owner to null.
 413   // Release to satisfy the JMM
 414   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 415   // We need a full fence after clearing owner to avoid stranding.
 416   // StoreLoad achieves this.
 417   membar(StoreLoad);
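       // Roughly: the fence keeps the loads of entry_list and succ below from being
       // satisfied before the store of null to owner is visible to other threads;
       // otherwise we could miss a concurrently enqueued waiter and leave it parked
       // with nobody left to wake it (the "stranding" referred to above).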
 418 
 419   // Check if the entry_list is empty.
 420   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
 421   jccb(Assembler::zero, LSuccess);    // If so we are done.
 422 
 423   // Check if there is a successor.
 424   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 425   jccb(Assembler::notZero, LSuccess); // If so we are done.
 426 
 427   // Save the monitor pointer in the current thread, so we can try to
 428   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 429   andptr(tmpReg, ~(int32_t)markWord::monitor_value);
 430   movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
 431 
 432   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 433   jmpb  (DONE_LABEL);
 434 
 435   bind  (LSuccess);
 436   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 437   jmpb  (DONE_LABEL);
 438 
 439   if (LockingMode == LM_LEGACY) {
 440     bind  (Stacked);
 441     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 442     lock();
 443     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 444     // Intentional fall-thru into DONE_LABEL
 445   }
 446 
 447   bind(DONE_LABEL);
 448 
 449   // ZFlag == 1 count in fast path
 450   // ZFlag == 0 count in slow path
 451   jccb(Assembler::notZero, NO_COUNT);
 452 
 453   bind(COUNT);
 454 
 455   if (LockingMode == LM_LEGACY) {
 456     // Count monitors in fast path
 457     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 458   }
 459 
 460   xorl(tmpReg, tmpReg); // Set ZF == 1
 461 
 462   bind(NO_COUNT);
 463 }
 464 
 465 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 466                                               Register t, Register thread) {
 467   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 468   assert(rax_reg == rax, "Used for CAS");
 469   assert_different_registers(obj, box, rax_reg, t, thread);
 470 
 471   // Handle inflated monitor.
 472   Label inflated;
 473   // Finish fast lock successfully. ZF value is irrelevant.
 474   Label locked;
 475   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 476   Label slow_path;
 477 
 478   if (UseObjectMonitorTable) {
 479     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 480     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 481   }
 482 
 483   if (DiagnoseSyncOnValueBasedClasses != 0) {
 484     load_klass(rax_reg, obj, t);
 485     testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
 486     jcc(Assembler::notZero, slow_path);
 487   }
 488 
 489   const Register mark = t;
 490 
 491   { // Lightweight Lock
 492 
 493     Label push;
 494 
 495     const Register top = UseObjectMonitorTable ? rax_reg : box;
 496 
 497     // Load the mark.
 498     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 499 
 500     // Prefetch top.
 501     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 502 
 503     // Check for monitor (0b10).
 504     testptr(mark, markWord::monitor_value);
 505     jcc(Assembler::notZero, inflated);
 506 
 507     // Check if lock-stack is full.
 508     cmpl(top, LockStack::end_offset() - 1);
 509     jcc(Assembler::greater, slow_path);
 510 
 511     // Check if recursive.
 512     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 513     jccb(Assembler::equal, push);
 514 
 515     // Try to lock. Transition lock bits 0b01 => 0b00
 516     movptr(rax_reg, mark);
 517     orptr(rax_reg, markWord::unlocked_value);
 518     andptr(mark, ~(int32_t)markWord::unlocked_value);
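         // rax now holds the expected unlocked mark (low bits 0b01) and 'mark' the
         // desired locked mark (low bits 0b00); the CAS below succeeds only if the
         // object header still matches the unlocked value in rax.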
 519     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 520     jcc(Assembler::notEqual, slow_path);
 521 
 522     if (UseObjectMonitorTable) {
 523       // Need to reload top, clobbered by CAS.
 524       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 525     }
 526     bind(push);
 527     // After successful lock, push object on lock-stack.
 528     movptr(Address(thread, top), obj);
 529     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 530     jmpb(locked);
 531   }
 532 
 533   { // Handle inflated monitor.
 534     bind(inflated);
 535 
 536     const Register monitor = t;
 537 
 538     if (!UseObjectMonitorTable) {
 539       assert(mark == monitor, "should be the same here");
 540     } else {
 541       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
 542       // Fetch ObjectMonitor* from the cache or take the slow-path.
 543       Label monitor_found;
 544 
 545       // Load cache address
 546       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
 547 
 548       const int num_unrolled = 2;
 549       for (int i = 0; i < num_unrolled; i++) {
 550         cmpptr(obj, Address(t));
 551         jccb(Assembler::equal, monitor_found);
 552         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 553       }
 554 
 555       Label loop;
 556 
 557       // Search for obj in cache.
 558       bind(loop);
 559 
 560       // Check for match.
 561       cmpptr(obj, Address(t));
 562       jccb(Assembler::equal, monitor_found);
 563 
 564       // Search until null encountered, guaranteed _null_sentinel at end.
 565       cmpptr(Address(t), 1);
 566       jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
 567       increment(t, in_bytes(OMCache::oop_to_oop_difference()));
 568       jmpb(loop);
 569 
 570       // Cache hit.
 571       bind(monitor_found);
 572       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
 573     }
 574     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 575     const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 576     const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
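         // Without the table, 'monitor' still carries the markWord::monitor_value tag
         // from the mark word, so the field offsets above are biased by monitor_tag to
         // compensate; with the table the cached pointer is untagged and the bias is 0.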
 577 
 578     Label monitor_locked;
 579     // Lock the monitor.
 580 
 581     if (UseObjectMonitorTable) {
 582       // Cache the monitor for unlock before trashing box. On failure to acquire
 583       // the lock, the slow path will reset the entry accordingly (see CacheSetter).
 584       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
 585     }
 586 
 587     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 588     xorptr(rax_reg, rax_reg);
 589     movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
 590     lock(); cmpxchgptr(box, owner_address);
 591     jccb(Assembler::equal, monitor_locked);
 592 
 593     // Check if recursive.
 594     cmpptr(box, rax_reg);
 595     jccb(Assembler::notEqual, slow_path);
 596 
 597     // Recursive.
 598     increment(recursions_address);
 599 
 600     bind(monitor_locked);
 601   }
 602 
 603   bind(locked);
 604   // Set ZF = 1
 605   xorl(rax_reg, rax_reg);
 606 
 607 #ifdef ASSERT
 608   // Check that locked label is reached with ZF set.
 609   Label zf_correct;
 610   Label zf_bad_zero;
 611   jcc(Assembler::zero, zf_correct);
 612   jmp(zf_bad_zero);
 613 #endif
 614 
 615   bind(slow_path);
 616 #ifdef ASSERT
 617   // Check that slow_path label is reached with ZF not set.
 618   jcc(Assembler::notZero, zf_correct);
 619   stop("Fast Lock ZF != 0");
 620   bind(zf_bad_zero);
 621   stop("Fast Lock ZF != 1");
 622   bind(zf_correct);
 623 #endif
 624   // C2 uses the value of ZF to determine the continuation.
 625 }
 626 
 627 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 628   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 629   assert(reg_rax == rax, "Used for CAS");
 630   assert_different_registers(obj, reg_rax, t);
 631 
 632   // Handle inflated monitor.
 633   Label inflated, inflated_check_lock_stack;
 634   // Finish fast unlock successfully.  MUST jump with ZF == 1
 635   Label unlocked, slow_path;
 636 
 637   const Register mark = t;
 638   const Register monitor = t;
 639   const Register top = UseObjectMonitorTable ? t : reg_rax;
 640   const Register box = reg_rax;
 641 
 642   Label dummy;
 643   C2FastUnlockLightweightStub* stub = nullptr;
 644 
 645   if (!Compile::current()->output()->in_scratch_emit_size()) {
 646     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 647     Compile::current()->output()->add_stub(stub);
 648   }
 649 
 650   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 651 
 652   { // Lightweight Unlock
 653 
 654     // Load top.
 655     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 656 
 657     if (!UseObjectMonitorTable) {
 658       // Prefetch mark.
 659       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 660     }
 661 
 662     // Check if obj is top of lock-stack.
 663     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 664     // Top of lock stack was not obj. Must be monitor.
 665     jcc(Assembler::notEqual, inflated_check_lock_stack);
 666 
 667     // Pop lock-stack.
 668     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 669     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 670 
 671     // Check if recursive.
 672     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 673     jcc(Assembler::equal, unlocked);
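         // If the same object also occupies the next slot down the lock-stack, this
         // was a recursive lightweight lock and the mark word needs no update.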
 674 
 675     // We elide the monitor check, let the CAS fail instead.
 676 
 677     if (UseObjectMonitorTable) {
 678       // Load mark.
 679       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 680     }
 681 
 682     // Try to unlock. Transition lock bits 0b00 => 0b01
 683     movptr(reg_rax, mark);
 684     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
 685     orptr(mark, markWord::unlocked_value);
 686     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 687     jcc(Assembler::notEqual, push_and_slow_path);
 688     jmp(unlocked);
 689   }
 690 
 691 
 692   { // Handle inflated monitor.
 693     bind(inflated_check_lock_stack);
 694 #ifdef ASSERT
 695     Label check_done;
 696     subl(top, oopSize);
 697     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
 698     jcc(Assembler::below, check_done);
 699     cmpptr(obj, Address(thread, top));
 700     jccb(Assembler::notEqual, inflated_check_lock_stack);
 701     stop("Fast Unlock lock on stack");
 702     bind(check_done);
 703     if (UseObjectMonitorTable) {
 704       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 705     }
 706     testptr(mark, markWord::monitor_value);
 707     jccb(Assembler::notZero, inflated);
 708     stop("Fast Unlock not monitor");
 709 #endif
 710 
 711     bind(inflated);
 712 
 713     if (!UseObjectMonitorTable) {
 714       assert(mark == monitor, "should be the same here");
 715     } else {
 716       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
 717       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 718       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
 719       cmpptr(monitor, alignof(ObjectMonitor*));
 720       jcc(Assembler::below, slow_path);
 721     }
 722     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 723     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 724     const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
 725     const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
 726     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
 727 
 728     Label recursive;
 729 
 730     // Check if recursive.
 731     cmpptr(recursions_address, 0);
 732     jccb(Assembler::notZero, recursive);
 733 
 734     // Set owner to null.
 735     // Release to satisfy the JMM
 736     movptr(owner_address, NULL_WORD);
 737     // We need a full fence after clearing owner to avoid stranding.
 738     // StoreLoad achieves this.
 739     membar(StoreLoad);
 740 
 741     // Check if the entry_list is empty.
 742     cmpptr(entry_list_address, NULL_WORD);
 743     jccb(Assembler::zero, unlocked);    // If so we are done.
 744 
 745     // Check if there is a successor.
 746     cmpptr(succ_address, NULL_WORD);
 747     jccb(Assembler::notZero, unlocked); // If so we are done.
 748 
 749     // Save the monitor pointer in the current thread, so we can try to
 750     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 751     if (!UseObjectMonitorTable) {
 752       andptr(monitor, ~(int32_t)markWord::monitor_value);
 753     }
 754     movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
 755 
 756     orl(t, 1); // Fast Unlock ZF = 0
 757     jmpb(slow_path);
 758 
 759     // Recursive unlock.
 760     bind(recursive);
 761     decrement(recursions_address);
 762   }
 763 
 764   bind(unlocked);
 765   xorl(t, t); // Fast Unlock ZF = 1
 766 
 767 #ifdef ASSERT
 768   // Check that unlocked label is reached with ZF set.
 769   Label zf_correct;
 770   Label zf_bad_zero;
 771   jcc(Assembler::zero, zf_correct);
 772   jmp(zf_bad_zero);
 773 #endif
 774 
 775   bind(slow_path);
 776   if (stub != nullptr) {
 777     bind(stub->slow_path_continuation());
 778   }
 779 #ifdef ASSERT
 780   // Check that the slow_path (and stub->slow_path_continuation()) label is reached with ZF not set.
 781   jcc(Assembler::notZero, zf_correct);
 782   stop("Fast Unlock ZF != 0");
 783   bind(zf_bad_zero);
 784   stop("Fast Unlock ZF != 1");
 785   bind(zf_correct);
 786 #endif
 787   // C2 uses the value of ZF to determine the continuation.
 788 }
 789 
 790 //-------------------------------------------------------------------------------------------
 791 // Generic instructions support for use in .ad files C2 code generation
 792 
 793 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 794   if (dst != src) {
 795     movdqu(dst, src);
 796   }
 797   if (opcode == Op_AbsVD) {
 798     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 799   } else {
 800     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 801     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 802   }
 803 }
 804 
 805 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 806   if (opcode == Op_AbsVD) {
 807     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 808   } else {
 809     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 810     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 811   }
 812 }
 813 
 814 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 815   if (dst != src) {
 816     movdqu(dst, src);
 817   }
 818   if (opcode == Op_AbsVF) {
 819     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 820   } else {
 821     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 822     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 823   }
 824 }
 825 
 826 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 827   if (opcode == Op_AbsVF) {
 828     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 829   } else {
 830     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 831     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 832   }
 833 }
 834 
 835 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 836   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 837   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 838 
 839   if (opcode == Op_MinV) {
 840     if (elem_bt == T_BYTE) {
 841       pminsb(dst, src);
 842     } else if (elem_bt == T_SHORT) {
 843       pminsw(dst, src);
 844     } else if (elem_bt == T_INT) {
 845       pminsd(dst, src);
 846     } else {
 847       assert(elem_bt == T_LONG, "required");
 848       assert(tmp == xmm0, "required");
 849       assert_different_registers(dst, src, tmp);
 850       movdqu(xmm0, dst);
 851       pcmpgtq(xmm0, src);
 852       blendvpd(dst, src);  // xmm0 as mask
 853     }
 854   } else { // opcode == Op_MaxV
 855     if (elem_bt == T_BYTE) {
 856       pmaxsb(dst, src);
 857     } else if (elem_bt == T_SHORT) {
 858       pmaxsw(dst, src);
 859     } else if (elem_bt == T_INT) {
 860       pmaxsd(dst, src);
 861     } else {
 862       assert(elem_bt == T_LONG, "required");
 863       assert(tmp == xmm0, "required");
 864       assert_different_registers(dst, src, tmp);
 865       movdqu(xmm0, src);
 866       pcmpgtq(xmm0, dst);
 867       blendvpd(dst, src);  // xmm0 as mask
 868     }
 869   }
 870 }
 871 
 872 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 873                                   XMMRegister src1, Address src2, int vlen_enc) {
 874   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 875   if (opcode == Op_UMinV) {
 876     switch(elem_bt) {
 877       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 878       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 879       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 880       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 881       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 882     }
 883   } else {
 884     assert(opcode == Op_UMaxV, "required");
 885     switch(elem_bt) {
 886       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 887       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 888       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 889       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 890       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 891     }
 892   }
 893 }
 894 
 895 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
 896   // For optimality, leverage a full vector width of 512 bits
 897   // for operations over smaller vector sizes on AVX512 targets.
 898   if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 899     if (opcode == Op_UMaxV) {
 900       evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 901     } else {
 902       assert(opcode == Op_UMinV, "required");
 903       evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
 904     }
 905   } else {
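         // No unsigned 64-bit vector min/max is available on this path, so map
         // unsigned order onto signed order with the usual bias trick: adding
         // 2^63 (built below as -1 << 63) to both operands preserves their unsigned
         // ordering under signed comparison, letting vpcmpgtq build the selection mask.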
 906     // T1 = -1
 907     vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
 908     // T1 = -1 << 63
 909     vpsllq(xtmp1, xtmp1, 63, vlen_enc);
 910     // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
 911     vpaddq(xtmp2, xtmp1, src2, vlen_enc);
 912     // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
 913     vpaddq(xtmp1, xtmp1, src1, vlen_enc);
 914     // Mask = T2 > T1
 915     vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
 916     if (opcode == Op_UMaxV) {
 917       // Res = Mask ? Src2 : Src1
 918       vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
 919     } else {
 920       // Res = Mask ? Src1 : Src2
 921       vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
 922     }
 923   }
 924 }
 925 
 926 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 927                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 928   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 929   if (opcode == Op_UMinV) {
 930     switch(elem_bt) {
 931       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 932       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 933       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 934       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 935       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 936     }
 937   } else {
 938     assert(opcode == Op_UMaxV, "required");
 939     switch(elem_bt) {
 940       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 941       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 942       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 943       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 944       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 945     }
 946   }
 947 }
 948 
 949 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 950                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 951                                  int vlen_enc) {
 952   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 953 
 954   if (opcode == Op_MinV) {
 955     if (elem_bt == T_BYTE) {
 956       vpminsb(dst, src1, src2, vlen_enc);
 957     } else if (elem_bt == T_SHORT) {
 958       vpminsw(dst, src1, src2, vlen_enc);
 959     } else if (elem_bt == T_INT) {
 960       vpminsd(dst, src1, src2, vlen_enc);
 961     } else {
 962       assert(elem_bt == T_LONG, "required");
 963       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 964         vpminsq(dst, src1, src2, vlen_enc);
 965       } else {
 966         assert_different_registers(dst, src1, src2);
 967         vpcmpgtq(dst, src1, src2, vlen_enc);
 968         vblendvpd(dst, src1, src2, dst, vlen_enc);
 969       }
 970     }
 971   } else { // opcode == Op_MaxV
 972     if (elem_bt == T_BYTE) {
 973       vpmaxsb(dst, src1, src2, vlen_enc);
 974     } else if (elem_bt == T_SHORT) {
 975       vpmaxsw(dst, src1, src2, vlen_enc);
 976     } else if (elem_bt == T_INT) {
 977       vpmaxsd(dst, src1, src2, vlen_enc);
 978     } else {
 979       assert(elem_bt == T_LONG, "required");
 980       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 981         vpmaxsq(dst, src1, src2, vlen_enc);
 982       } else {
 983         assert_different_registers(dst, src1, src2);
 984         vpcmpgtq(dst, src1, src2, vlen_enc);
 985         vblendvpd(dst, src2, src1, dst, vlen_enc);
 986       }
 987     }
 988   }
 989 }
 990 
 991 // Float/Double min max
 992 
 993 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 994                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 995                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 996                                    int vlen_enc) {
 997   assert(UseAVX > 0, "required");
 998   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 999          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1000   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1001   assert_different_registers(a, tmp, atmp, btmp);
1002   assert_different_registers(b, tmp, atmp, btmp);
1003 
1004   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1005   bool is_double_word = is_double_word_type(elem_bt);
1006 
1007   /* Note on 'non-obvious' assembly sequence:
1008    *
1009    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1010    * and Java on how they handle floats:
1011    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1012    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1013    *
1014    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1015    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1016    *                (only useful when signs differ, noop otherwise)
1017    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1018    *
1019    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1020    *   btmp = (b < +0.0) ? a : b
1021    *   atmp = (b < +0.0) ? b : a
1022    *   Tmp  = Max_Float(atmp , btmp)
1023    *   Res  = (atmp == NaN) ? atmp : Tmp
1024    */
1025 
1026   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1027   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1028   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1029   XMMRegister mask;
1030 
1031   if (!is_double_word && is_min) {
1032     mask = a;
1033     vblend = &MacroAssembler::vblendvps;
1034     vmaxmin = &MacroAssembler::vminps;
1035     vcmp = &MacroAssembler::vcmpps;
1036   } else if (!is_double_word && !is_min) {
1037     mask = b;
1038     vblend = &MacroAssembler::vblendvps;
1039     vmaxmin = &MacroAssembler::vmaxps;
1040     vcmp = &MacroAssembler::vcmpps;
1041   } else if (is_double_word && is_min) {
1042     mask = a;
1043     vblend = &MacroAssembler::vblendvpd;
1044     vmaxmin = &MacroAssembler::vminpd;
1045     vcmp = &MacroAssembler::vcmppd;
1046   } else {
1047     assert(is_double_word && !is_min, "sanity");
1048     mask = b;
1049     vblend = &MacroAssembler::vblendvpd;
1050     vmaxmin = &MacroAssembler::vmaxpd;
1051     vcmp = &MacroAssembler::vcmppd;
1052   }
1053 
1054   // Make sure the EnableX86ECoreOpts path stays correct when registers overlap (e.g. dst == btmp)
1055   XMMRegister maxmin, scratch;
1056   if (dst == btmp) {
1057     maxmin = btmp;
1058     scratch = tmp;
1059   } else {
1060     maxmin = tmp;
1061     scratch = btmp;
1062   }
1063 
1064   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1065   if (precompute_mask && !is_double_word) {
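         // A shift count of 32 saturates vpsrad, so each 32-bit lane becomes a copy of
         // its sign bit: all-ones for negative floats, all-zeros otherwise.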
1066     vpsrad(tmp, mask, 32, vlen_enc);
1067     mask = tmp;
1068   } else if (precompute_mask && is_double_word) {
1069     vpxor(tmp, tmp, tmp, vlen_enc);
1070     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1071     mask = tmp;
1072   }
1073 
1074   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1075   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1076   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1077   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1078   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1079 }
1080 
1081 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1082                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1083                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1084                                     int vlen_enc) {
1085   assert(UseAVX > 2, "required");
1086   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1087          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1088   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1089   assert_different_registers(dst, a, atmp, btmp);
1090   assert_different_registers(dst, b, atmp, btmp);
1091 
1092   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1093   bool is_double_word = is_double_word_type(elem_bt);
1094   bool merge = true;
1095 
1096   if (!is_double_word && is_min) {
1097     evpmovd2m(ktmp, a, vlen_enc);
1098     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1099     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1100     vminps(dst, atmp, btmp, vlen_enc);
1101     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1102     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1103   } else if (!is_double_word && !is_min) {
1104     evpmovd2m(ktmp, b, vlen_enc);
1105     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1106     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1107     vmaxps(dst, atmp, btmp, vlen_enc);
1108     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1109     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1110   } else if (is_double_word && is_min) {
1111     evpmovq2m(ktmp, a, vlen_enc);
1112     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1113     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1114     vminpd(dst, atmp, btmp, vlen_enc);
1115     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1116     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1117   } else {
1118     assert(is_double_word && !is_min, "sanity");
1119     evpmovq2m(ktmp, b, vlen_enc);
1120     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1121     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1122     vmaxpd(dst, atmp, btmp, vlen_enc);
1123     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1124     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1125   }
1126 }
1127 
1128 // Float/Double signum
1129 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1130   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1131 
1132   Label DONE_LABEL;
1133 
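       // Summary: signum returns the argument unchanged for +/-0.0 and NaN; otherwise
       // 'one' (+1.0) is copied into dst and, for negative inputs, its sign bit is
       // flipped to produce -1.0.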
1134   if (opcode == Op_SignumF) {
1135     ucomiss(dst, zero);
1136     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1137     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1138     movflt(dst, one);
1139     jcc(Assembler::above, DONE_LABEL);
1140     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1141   } else if (opcode == Op_SignumD) {
1142     ucomisd(dst, zero);
1143     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1144     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1145     movdbl(dst, one);
1146     jcc(Assembler::above, DONE_LABEL);
1147     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1148   }
1149 
1150   bind(DONE_LABEL);
1151 }
1152 
1153 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1154   if (sign) {
1155     pmovsxbw(dst, src);
1156   } else {
1157     pmovzxbw(dst, src);
1158   }
1159 }
1160 
1161 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1162   if (sign) {
1163     vpmovsxbw(dst, src, vector_len);
1164   } else {
1165     vpmovzxbw(dst, src, vector_len);
1166   }
1167 }
1168 
1169 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1170   if (sign) {
1171     vpmovsxbd(dst, src, vector_len);
1172   } else {
1173     vpmovzxbd(dst, src, vector_len);
1174   }
1175 }
1176 
1177 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1178   if (sign) {
1179     vpmovsxwd(dst, src, vector_len);
1180   } else {
1181     vpmovzxwd(dst, src, vector_len);
1182   }
1183 }
1184 
1185 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1186                                      int shift, int vector_len) {
1187   if (opcode == Op_RotateLeftV) {
1188     if (etype == T_INT) {
1189       evprold(dst, src, shift, vector_len);
1190     } else {
1191       assert(etype == T_LONG, "expected type T_LONG");
1192       evprolq(dst, src, shift, vector_len);
1193     }
1194   } else {
1195     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1196     if (etype == T_INT) {
1197       evprord(dst, src, shift, vector_len);
1198     } else {
1199       assert(etype == T_LONG, "expected type T_LONG");
1200       evprorq(dst, src, shift, vector_len);
1201     }
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1206                                      XMMRegister shift, int vector_len) {
1207   if (opcode == Op_RotateLeftV) {
1208     if (etype == T_INT) {
1209       evprolvd(dst, src, shift, vector_len);
1210     } else {
1211       assert(etype == T_LONG, "expected type T_LONG");
1212       evprolvq(dst, src, shift, vector_len);
1213     }
1214   } else {
1215     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1216     if (etype == T_INT) {
1217       evprorvd(dst, src, shift, vector_len);
1218     } else {
1219       assert(etype == T_LONG, "expected type T_LONG");
1220       evprorvq(dst, src, shift, vector_len);
1221     }
1222   }
1223 }
1224 
1225 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1226   if (opcode == Op_RShiftVI) {
1227     psrad(dst, shift);
1228   } else if (opcode == Op_LShiftVI) {
1229     pslld(dst, shift);
1230   } else {
1231     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1232     psrld(dst, shift);
1233   }
1234 }
1235 
1236 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1237   switch (opcode) {
1238     case Op_RShiftVI:  psrad(dst, shift); break;
1239     case Op_LShiftVI:  pslld(dst, shift); break;
1240     case Op_URShiftVI: psrld(dst, shift); break;
1241 
1242     default: assert(false, "%s", NodeClassNames[opcode]);
1243   }
1244 }
1245 
1246 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1247   if (opcode == Op_RShiftVI) {
1248     vpsrad(dst, nds, shift, vector_len);
1249   } else if (opcode == Op_LShiftVI) {
1250     vpslld(dst, nds, shift, vector_len);
1251   } else {
1252     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1253     vpsrld(dst, nds, shift, vector_len);
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1258   switch (opcode) {
1259     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1260     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1261     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1262 
1263     default: assert(false, "%s", NodeClassNames[opcode]);
1264   }
1265 }
1266 
1267 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1268   switch (opcode) {
1269     case Op_RShiftVB:  // fall-through
1270     case Op_RShiftVS:  psraw(dst, shift); break;
1271 
1272     case Op_LShiftVB:  // fall-through
1273     case Op_LShiftVS:  psllw(dst, shift);   break;
1274 
1275     case Op_URShiftVS: // fall-through
1276     case Op_URShiftVB: psrlw(dst, shift);  break;
1277 
1278     default: assert(false, "%s", NodeClassNames[opcode]);
1279   }
1280 }
1281 
1282 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1283   switch (opcode) {
1284     case Op_RShiftVB:  // fall-through
1285     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1286 
1287     case Op_LShiftVB:  // fall-through
1288     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1289 
1290     case Op_URShiftVS: // fall-through
1291     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1292 
1293     default: assert(false, "%s", NodeClassNames[opcode]);
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1298   switch (opcode) {
1299     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1300     case Op_LShiftVL:  psllq(dst, shift); break;
1301     case Op_URShiftVL: psrlq(dst, shift); break;
1302 
1303     default: assert(false, "%s", NodeClassNames[opcode]);
1304   }
1305 }
1306 
1307 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1308   if (opcode == Op_RShiftVL) {
1309     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1310   } else if (opcode == Op_LShiftVL) {
1311     psllq(dst, shift);
1312   } else {
1313     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1314     psrlq(dst, shift);
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1319   switch (opcode) {
1320     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1321     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1322     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1323 
1324     default: assert(false, "%s", NodeClassNames[opcode]);
1325   }
1326 }
1327 
1328 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1329   if (opcode == Op_RShiftVL) {
1330     evpsraq(dst, nds, shift, vector_len);
1331   } else if (opcode == Op_LShiftVL) {
1332     vpsllq(dst, nds, shift, vector_len);
1333   } else {
1334     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1335     vpsrlq(dst, nds, shift, vector_len);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1340   switch (opcode) {
1341     case Op_RShiftVB:  // fall-through
1342     case Op_RShiftVS:  // fall-through
1343     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1344 
1345     case Op_LShiftVB:  // fall-through
1346     case Op_LShiftVS:  // fall-through
1347     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1348 
1349     case Op_URShiftVB: // fall-through
1350     case Op_URShiftVS: // fall-through
1351     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1352 
1353     default: assert(false, "%s", NodeClassNames[opcode]);
1354   }
1355 }
1356 
1357 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1358   switch (opcode) {
1359     case Op_RShiftVB:  // fall-through
1360     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1361 
1362     case Op_LShiftVB:  // fall-through
1363     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1364 
1365     case Op_URShiftVB: // fall-through
1366     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1367 
1368     default: assert(false, "%s", NodeClassNames[opcode]);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1373   assert(UseAVX >= 2, "required");
1374   switch (opcode) {
1375     case Op_RShiftVL: {
1376       if (UseAVX > 2) {
1377         assert(tmp == xnoreg, "not used");
1378         if (!VM_Version::supports_avx512vl()) {
1379           vlen_enc = Assembler::AVX_512bit;
1380         }
1381         evpsravq(dst, src, shift, vlen_enc);
1382       } else {
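             // AVX2 has no variable arithmetic right shift for 64-bit lanes, so
             // emulate it: with t = (0x8000000000000000 >>> s) per element,
             // sra(x, s) == ((x >>> s) ^ t) - t; the xor/sub pair re-propagates
             // the shifted-out sign bit into the vacated high bits.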
1383         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1384         vpsrlvq(dst, src, shift, vlen_enc);
1385         vpsrlvq(tmp, tmp, shift, vlen_enc);
1386         vpxor(dst, dst, tmp, vlen_enc);
1387         vpsubq(dst, dst, tmp, vlen_enc);
1388       }
1389       break;
1390     }
1391     case Op_LShiftVL: {
1392       assert(tmp == xnoreg, "not used");
1393       vpsllvq(dst, src, shift, vlen_enc);
1394       break;
1395     }
1396     case Op_URShiftVL: {
1397       assert(tmp == xnoreg, "not used");
1398       vpsrlvq(dst, src, shift, vlen_enc);
1399       break;
1400     }
1401     default: assert(false, "%s", NodeClassNames[opcode]);
1402   }
1403 }
1404 
1405 // Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1406 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1407   assert(opcode == Op_LShiftVB ||
1408          opcode == Op_RShiftVB ||
1409          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1410   bool sign = (opcode != Op_URShiftVB);
1411   assert(vector_len == 0, "required");
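       // Widen the 8 bytes to dwords (sign- or zero-extended depending on the
       // opcode), zero-extend the shift counts, shift as dwords, mask each dword
       // back to byte range, then pack the 256-bit dword result down to 8 words.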
1412   vextendbd(sign, dst, src, 1);
1413   vpmovzxbd(vtmp, shift, 1);
1414   varshiftd(opcode, dst, dst, vtmp, 1);
1415   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1416   vextracti128_high(vtmp, dst);
1417   vpackusdw(dst, dst, vtmp, 0);
1418 }
1419 
1420 // Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1421 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1422   assert(opcode == Op_LShiftVB ||
1423          opcode == Op_RShiftVB ||
1424          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1425   bool sign = (opcode != Op_URShiftVB);
1426   int ext_vector_len = vector_len + 1;
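       // Widen bytes to words one vector size up, apply the variable word shift,
       // mask each word back to byte range, then pack back to bytes; for the
       // 256-bit result the vpermq below repairs the lane interleave left by vpackuswb.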
1427   vextendbw(sign, dst, src, ext_vector_len);
1428   vpmovzxbw(vtmp, shift, ext_vector_len);
1429   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1430   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1431   if (vector_len == 0) {
1432     vextracti128_high(vtmp, dst);
1433     vpackuswb(dst, dst, vtmp, vector_len);
1434   } else {
1435     vextracti64x4_high(vtmp, dst);
1436     vpackuswb(dst, dst, vtmp, vector_len);
1437     vpermq(dst, dst, 0xD8, vector_len);
1438   }
1439 }
1440 
1441 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1442   switch(typ) {
1443     case T_BYTE:
1444       pinsrb(dst, val, idx);
1445       break;
1446     case T_SHORT:
1447       pinsrw(dst, val, idx);
1448       break;
1449     case T_INT:
1450       pinsrd(dst, val, idx);
1451       break;
1452     case T_LONG:
1453       pinsrq(dst, val, idx);
1454       break;
1455     default:
1456       assert(false,"Should not reach here.");
1457       break;
1458   }
1459 }
1460 
1461 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1462   switch(typ) {
1463     case T_BYTE:
1464       vpinsrb(dst, src, val, idx);
1465       break;
1466     case T_SHORT:
1467       vpinsrw(dst, src, val, idx);
1468       break;
1469     case T_INT:
1470       vpinsrd(dst, src, val, idx);
1471       break;
1472     case T_LONG:
1473       vpinsrq(dst, src, val, idx);
1474       break;
1475     default:
1476       assert(false,"Should not reach here.");
1477       break;
1478   }
1479 }
1480 
1481 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1482                                                 XMMRegister dst, Register base,
1483                                                 Register idx_base,
1484                                                 Register offset, Register mask,
1485                                                 Register mask_idx, Register rtmp,
1486                                                 int vlen_enc) {
1487   vpxor(dst, dst, dst, vlen_enc);
1488   if (elem_bt == T_SHORT) {
1489     for (int i = 0; i < 4; i++) {
1490       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1491       Label skip_load;
1492       btq(mask, mask_idx);
1493       jccb(Assembler::carryClear, skip_load);
1494       movl(rtmp, Address(idx_base, i * 4));
1495       if (offset != noreg) {
1496         addl(rtmp, offset);
1497       }
1498       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1499       bind(skip_load);
1500       incq(mask_idx);
1501     }
1502   } else {
1503     assert(elem_bt == T_BYTE, "");
1504     for (int i = 0; i < 8; i++) {
1505       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1506       Label skip_load;
1507       btq(mask, mask_idx);
1508       jccb(Assembler::carryClear, skip_load);
1509       movl(rtmp, Address(idx_base, i * 4));
1510       if (offset != noreg) {
1511         addl(rtmp, offset);
1512       }
1513       pinsrb(dst, Address(base, rtmp), i);
1514       bind(skip_load);
1515       incq(mask_idx);
1516     }
1517   }
1518 }
1519 
1520 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1521                                          Register base, Register idx_base,
1522                                          Register offset, Register rtmp,
1523                                          int vlen_enc) {
1524   vpxor(dst, dst, dst, vlen_enc);
1525   if (elem_bt == T_SHORT) {
1526     for (int i = 0; i < 4; i++) {
1527       // dst[i] = src[offset + idx_base[i]]
1528       movl(rtmp, Address(idx_base, i * 4));
1529       if (offset != noreg) {
1530         addl(rtmp, offset);
1531       }
1532       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1533     }
1534   } else {
1535     assert(elem_bt == T_BYTE, "");
1536     for (int i = 0; i < 8; i++) {
1537       // dst[i] = src[offset + idx_base[i]]
1538       movl(rtmp, Address(idx_base, i * 4));
1539       if (offset != noreg) {
1540         addl(rtmp, offset);
1541       }
1542       pinsrb(dst, Address(base, rtmp), i);
1543     }
1544   }
1545 }
1546 
1547 /*
1548  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1549  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1550  * A slice may hold 8 byte or 4 short values. This is followed by a vector
1551  * permutation to place the slice into the appropriate vector lanes of the
1552  * destination vector. The following pseudo code describes the
1553  * algorithm in detail:
1554  *
1555  * DST_VEC = ZERO_VEC
1556  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1557  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1558  * FOREACH_ITER:
1559  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1560  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1561  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1562  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1563  *
1564  * With each iteration, the doubleword permute indices (0, 1) corresponding
1565  * to the gathered quadword move two lane positions to the right.
1566  *
1567  */
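     // Illustrative walk-through (256-bit T_SHORT gather, 16 elements, 4 per
     // iteration): on iteration k the permute index vector holds 0 and 1 exactly
     // at doubleword lanes 2k and 2k+1, so vpermd places the freshly gathered
     // 64-bit slice into those lanes (the remaining lanes read zeros) and the OR
     // accumulates lanes (0,1), (2,3), (4,5), (6,7) over four iterations.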
1568 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1569                                         Register base, Register idx_base,
1570                                         Register offset, Register mask,
1571                                         XMMRegister xtmp1, XMMRegister xtmp2,
1572                                         XMMRegister temp_dst, Register rtmp,
1573                                         Register mask_idx, Register length,
1574                                         int vector_len, int vlen_enc) {
1575   Label GATHER8_LOOP;
1576   assert(is_subword_type(elem_ty), "");
1577   movl(length, vector_len);
1578   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1579   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1580   vallones(xtmp2, vlen_enc);
1581   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1582   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1583   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1584 
1585   bind(GATHER8_LOOP);
1586     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1587     if (mask == noreg) {
1588       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1589     } else {
1590       vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc);
1591     }
1592     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1593     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1594     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1595     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1596     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1597     vpor(dst, dst, temp_dst, vlen_enc);
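         // Advance to the next group of gather indices (8 ints for byte gathers,
         // 4 ints for short gathers) and decrement the remaining element count.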
1598     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1599     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1600     jcc(Assembler::notEqual, GATHER8_LOOP);
1601 }
1602 
1603 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1604   switch(typ) {
1605     case T_INT:
1606       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1607       break;
1608     case T_FLOAT:
1609       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1610       break;
1611     case T_LONG:
1612       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1613       break;
1614     case T_DOUBLE:
1615       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1616       break;
1617     default:
1618       assert(false,"Should not reach here.");
1619       break;
1620   }
1621 }
1622 
1623 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1624   switch(typ) {
1625     case T_INT:
1626       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1627       break;
1628     case T_FLOAT:
1629       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1630       break;
1631     case T_LONG:
1632       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1633       break;
1634     case T_DOUBLE:
1635       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1636       break;
1637     default:
1638       assert(false,"Should not reach here.");
1639       break;
1640   }
1641 }
1642 
1643 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1644   switch(typ) {
1645     case T_INT:
1646       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1647       break;
1648     case T_FLOAT:
1649       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1650       break;
1651     case T_LONG:
1652       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1653       break;
1654     case T_DOUBLE:
1655       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1656       break;
1657     default:
1658       assert(false,"Should not reach here.");
1659       break;
1660   }
1661 }
1662 
1663 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1664   if (vlen_in_bytes <= 16) {
1665     pxor (dst, dst);
1666     psubb(dst, src);
1667     switch (elem_bt) {
1668       case T_BYTE:   /* nothing to do */ break;
1669       case T_SHORT:  pmovsxbw(dst, dst); break;
1670       case T_INT:    pmovsxbd(dst, dst); break;
1671       case T_FLOAT:  pmovsxbd(dst, dst); break;
1672       case T_LONG:   pmovsxbq(dst, dst); break;
1673       case T_DOUBLE: pmovsxbq(dst, dst); break;
1674 
1675       default: assert(false, "%s", type2name(elem_bt));
1676     }
1677   } else {
1678     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1679     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1680 
1681     vpxor (dst, dst, dst, vlen_enc);
1682     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1683 
1684     switch (elem_bt) {
1685       case T_BYTE:   /* nothing to do */            break;
1686       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1687       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1688       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1689       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1690       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1691 
1692       default: assert(false, "%s", type2name(elem_bt));
1693     }
1694   }
1695 }
1696 
1697 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1698   if (novlbwdq) {
1699     vpmovsxbd(xtmp, src, vlen_enc);
1700     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1701             Assembler::eq, true, vlen_enc, noreg);
1702   } else {
1703     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1704     vpsubb(xtmp, xtmp, src, vlen_enc);
1705     evpmovb2m(dst, xtmp, vlen_enc);
1706   }
1707 }
1708 
1709 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1710   if (is_integral_type(bt)) {
1711     switch (vlen_in_bytes) {
1712       case 4:  movdl(dst, src);   break;
1713       case 8:  movq(dst, src);    break;
1714       case 16: movdqu(dst, src);  break;
1715       case 32: vmovdqu(dst, src); break;
1716       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1717       default: ShouldNotReachHere();
1718     }
1719   } else {
1720     switch (vlen_in_bytes) {
1721       case 4:  movflt(dst, src); break;
1722       case 8:  movdbl(dst, src); break;
1723       case 16: movups(dst, src); break;
1724       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1725       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1726       default: ShouldNotReachHere();
1727     }
1728   }
1729 }
1730 
1731 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1732   assert(rscratch != noreg || always_reachable(src), "missing");
1733 
1734   if (reachable(src)) {
1735     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1736   } else {
1737     lea(rscratch, src);
1738     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1739   }
1740 }
1741 
1742 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1743   int vlen_enc = vector_length_encoding(vlen);
1744   if (VM_Version::supports_avx()) {
1745     if (bt == T_LONG) {
1746       if (VM_Version::supports_avx2()) {
1747         vpbroadcastq(dst, src, vlen_enc);
1748       } else {
1749         vmovddup(dst, src, vlen_enc);
1750       }
1751     } else if (bt == T_DOUBLE) {
1752       if (vlen_enc != Assembler::AVX_128bit) {
1753         vbroadcastsd(dst, src, vlen_enc, noreg);
1754       } else {
1755         vmovddup(dst, src, vlen_enc);
1756       }
1757     } else {
1758       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1759         vpbroadcastd(dst, src, vlen_enc);
1760       } else {
1761         vbroadcastss(dst, src, vlen_enc);
1762       }
1763     }
1764   } else if (VM_Version::supports_sse3()) {
1765     movddup(dst, src);
1766   } else {
1767     load_vector(bt, dst, src, vlen);
1768   }
1769 }
1770 
1771 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1772   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1773   int offset = exact_log2(type2aelembytes(bt)) << 6;
1774   if (is_floating_point_type(bt)) {
1775     offset += 128;
1776   }
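       // e.g. T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
       //      T_FLOAT -> 256, T_DOUBLE -> 320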
1777   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1778   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1779 }
1780 
1781 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1782 
1783 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1784   int vector_len = Assembler::AVX_128bit;
1785 
1786   switch (opcode) {
1787     case Op_AndReductionV:  pand(dst, src); break;
1788     case Op_OrReductionV:   por (dst, src); break;
1789     case Op_XorReductionV:  pxor(dst, src); break;
1790     case Op_MinReductionV:
1791       switch (typ) {
1792         case T_BYTE:        pminsb(dst, src); break;
1793         case T_SHORT:       pminsw(dst, src); break;
1794         case T_INT:         pminsd(dst, src); break;
1795         case T_LONG:        assert(UseAVX > 2, "required");
1796                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1797         default:            assert(false, "wrong type");
1798       }
1799       break;
1800     case Op_MaxReductionV:
1801       switch (typ) {
1802         case T_BYTE:        pmaxsb(dst, src); break;
1803         case T_SHORT:       pmaxsw(dst, src); break;
1804         case T_INT:         pmaxsd(dst, src); break;
1805         case T_LONG:        assert(UseAVX > 2, "required");
1806                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1807         default:            assert(false, "wrong type");
1808       }
1809       break;
1810     case Op_AddReductionVF: addss(dst, src); break;
1811     case Op_AddReductionVD: addsd(dst, src); break;
1812     case Op_AddReductionVI:
1813       switch (typ) {
1814         case T_BYTE:        paddb(dst, src); break;
1815         case T_SHORT:       paddw(dst, src); break;
1816         case T_INT:         paddd(dst, src); break;
1817         default:            assert(false, "wrong type");
1818       }
1819       break;
1820     case Op_AddReductionVL: paddq(dst, src); break;
1821     case Op_MulReductionVF: mulss(dst, src); break;
1822     case Op_MulReductionVD: mulsd(dst, src); break;
1823     case Op_MulReductionVI:
1824       switch (typ) {
1825         case T_SHORT:       pmullw(dst, src); break;
1826         case T_INT:         pmulld(dst, src); break;
1827         default:            assert(false, "wrong type");
1828       }
1829       break;
1830     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1831                             evpmullq(dst, dst, src, vector_len); break;
1832     default:                assert(false, "wrong opcode");
1833   }
1834 }
1835 
1836 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1837   switch (opcode) {
1838     case Op_AddReductionVF: addps(dst, src); break;
1839     case Op_AddReductionVD: addpd(dst, src); break;
1840     case Op_MulReductionVF: mulps(dst, src); break;
1841     case Op_MulReductionVD: mulpd(dst, src); break;
1842     default:                assert(false, "%s", NodeClassNames[opcode]);
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1847   int vector_len = Assembler::AVX_256bit;
1848 
1849   switch (opcode) {
1850     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1851     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1852     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1853     case Op_MinReductionV:
1854       switch (typ) {
1855         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1856         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1857         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1858         case T_LONG:        assert(UseAVX > 2, "required");
1859                             vpminsq(dst, src1, src2, vector_len); break;
1860         default:            assert(false, "wrong type");
1861       }
1862       break;
1863     case Op_MaxReductionV:
1864       switch (typ) {
1865         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1866         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1867         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1868         case T_LONG:        assert(UseAVX > 2, "required");
1869                             vpmaxsq(dst, src1, src2, vector_len); break;
1870         default:            assert(false, "wrong type");
1871       }
1872       break;
1873     case Op_AddReductionVI:
1874       switch (typ) {
1875         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1876         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1877         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1878         default:            assert(false, "wrong type");
1879       }
1880       break;
1881     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1882     case Op_MulReductionVI:
1883       switch (typ) {
1884         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1885         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1886         default:            assert(false, "wrong type");
1887       }
1888       break;
1889     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1890     default:                assert(false, "wrong opcode");
1891   }
1892 }
1893 
1894 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1895   int vector_len = Assembler::AVX_256bit;
1896 
1897   switch (opcode) {
1898     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1899     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1900     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1901     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1902     default:                assert(false, "%s", NodeClassNames[opcode]);
1903   }
1904 }
1905 
1906 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1907                                   XMMRegister dst, XMMRegister src,
1908                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (opcode) {
1910     case Op_AddReductionVF:
1911     case Op_MulReductionVF:
1912       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1913       break;
1914 
1915     case Op_AddReductionVD:
1916     case Op_MulReductionVD:
1917       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1918       break;
1919 
1920     default: assert(false, "wrong opcode");
1921   }
1922 }
1923 
1924 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1925                                             XMMRegister dst, XMMRegister src,
1926                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1927   switch (opcode) {
1928     case Op_AddReductionVF:
1929     case Op_MulReductionVF:
1930       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1931       break;
1932 
1933     case Op_AddReductionVD:
1934     case Op_MulReductionVD:
1935       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1936       break;
1937 
1938     default: assert(false, "%s", NodeClassNames[opcode]);
1939   }
1940 }
1941 
1942 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1943                              Register dst, Register src1, XMMRegister src2,
1944                              XMMRegister vtmp1, XMMRegister vtmp2) {
1945   switch (vlen) {
1946     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1950 
1951     default: assert(false, "wrong vector length");
1952   }
1953 }
1954 
1955 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1956                              Register dst, Register src1, XMMRegister src2,
1957                              XMMRegister vtmp1, XMMRegister vtmp2) {
1958   switch (vlen) {
1959     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963 
1964     default: assert(false, "wrong vector length");
1965   }
1966 }
1967 
1968 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1969                              Register dst, Register src1, XMMRegister src2,
1970                              XMMRegister vtmp1, XMMRegister vtmp2) {
1971   switch (vlen) {
1972     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976 
1977     default: assert(false, "wrong vector length");
1978   }
1979 }
1980 
1981 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1982                              Register dst, Register src1, XMMRegister src2,
1983                              XMMRegister vtmp1, XMMRegister vtmp2) {
1984   switch (vlen) {
1985     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989 
1990     default: assert(false, "wrong vector length");
1991   }
1992 }
1993 
1994 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1995                              Register dst, Register src1, XMMRegister src2,
1996                              XMMRegister vtmp1, XMMRegister vtmp2) {
1997   switch (vlen) {
1998     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1999     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2000     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001 
2002     default: assert(false, "wrong vector length");
2003   }
2004 }
2005 
2006 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2007   switch (vlen) {
2008     case 2:
2009       assert(vtmp2 == xnoreg, "");
2010       reduce2F(opcode, dst, src, vtmp1);
2011       break;
2012     case 4:
2013       assert(vtmp2 == xnoreg, "");
2014       reduce4F(opcode, dst, src, vtmp1);
2015       break;
2016     case 8:
2017       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2018       break;
2019     case 16:
2020       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2021       break;
2022     default: assert(false, "wrong vector length");
2023   }
2024 }
2025 
2026 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2027   switch (vlen) {
2028     case 2:
2029       assert(vtmp2 == xnoreg, "");
2030       reduce2D(opcode, dst, src, vtmp1);
2031       break;
2032     case 4:
2033       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2034       break;
2035     case 8:
2036       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2037       break;
2038     default: assert(false, "wrong vector length");
2039   }
2040 }
2041 
2042 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2043   switch (vlen) {
2044     case 2:
2045       assert(vtmp1 == xnoreg, "");
2046       assert(vtmp2 == xnoreg, "");
2047       unorderedReduce2F(opcode, dst, src);
2048       break;
2049     case 4:
2050       assert(vtmp2 == xnoreg, "");
2051       unorderedReduce4F(opcode, dst, src, vtmp1);
2052       break;
2053     case 8:
2054       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2055       break;
2056     case 16:
2057       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2058       break;
2059     default: assert(false, "wrong vector length");
2060   }
2061 }
2062 
2063 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2064   switch (vlen) {
2065     case 2:
2066       assert(vtmp1 == xnoreg, "");
2067       assert(vtmp2 == xnoreg, "");
2068       unorderedReduce2D(opcode, dst, src);
2069       break;
2070     case 4:
2071       assert(vtmp2 == xnoreg, "");
2072       unorderedReduce4D(opcode, dst, src, vtmp1);
2073       break;
2074     case 8:
2075       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2076       break;
2077     default: assert(false, "wrong vector length");
2078   }
2079 }
2080 
2081 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2082   if (opcode == Op_AddReductionVI) {
2083     if (vtmp1 != src2) {
2084       movdqu(vtmp1, src2);
2085     }
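         // phaddd adds adjacent dword pairs, so lane 0 receives src2[0] + src2[1].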
2086     phaddd(vtmp1, vtmp1);
2087   } else {
2088     pshufd(vtmp1, src2, 0x1);
2089     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2090   }
2091   movdl(vtmp2, src1);
2092   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2093   movdl(dst, vtmp1);
2094 }
2095 
2096 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097   if (opcode == Op_AddReductionVI) {
2098     if (vtmp1 != src2) {
2099       movdqu(vtmp1, src2);
2100     }
2101     phaddd(vtmp1, src2);
2102     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2103   } else {
2104     pshufd(vtmp2, src2, 0xE);
2105     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2106     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2107   }
2108 }
2109 
2110 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   if (opcode == Op_AddReductionVI) {
2112     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2113     vextracti128_high(vtmp2, vtmp1);
2114     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2115     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2116   } else {
2117     vextracti128_high(vtmp1, src2);
2118     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2119     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2120   }
2121 }
2122 
2123 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2124   vextracti64x4_high(vtmp2, src2);
2125   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2126   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2127 }
2128 
2129 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130   pshufd(vtmp2, src2, 0x1);
2131   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2132   movdqu(vtmp1, vtmp2);
2133   psrldq(vtmp1, 2);
2134   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2135   movdqu(vtmp2, vtmp1);
2136   psrldq(vtmp2, 1);
2137   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2138   movdl(vtmp2, src1);
2139   pmovsxbd(vtmp1, vtmp1);
2140   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2141   pextrb(dst, vtmp1, 0x0);
2142   movsbl(dst, dst);
2143 }
2144 
2145 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2146   pshufd(vtmp1, src2, 0xE);
2147   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2148   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2149 }
2150 
2151 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2152   vextracti128_high(vtmp2, src2);
2153   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2154   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2155 }
2156 
2157 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2158   vextracti64x4_high(vtmp1, src2);
2159   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2160   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2161 }
2162 
2163 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   pmovsxbw(vtmp2, src2);
2165   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2166 }
2167 
2168 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2169   if (UseAVX > 1) {
2170     int vector_len = Assembler::AVX_256bit;
2171     vpmovsxbw(vtmp1, src2, vector_len);
2172     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2173   } else {
2174     pmovsxbw(vtmp2, src2);
2175     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2176     pshufd(vtmp2, src2, 0xE);  // bring the high 8 bytes down
2177     pmovsxbw(vtmp2, vtmp2);    // and sign-extend them to words
2178     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2179   }
2180 }
2181 
2182 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2184     int vector_len = Assembler::AVX_512bit;
2185     vpmovsxbw(vtmp1, src2, vector_len);
2186     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2187   } else {
2188     assert(UseAVX >= 2,"Should not reach here.");
2189     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2190     vextracti128_high(vtmp2, src2);
2191     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2192   }
2193 }
2194 
2195 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2196   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2197   vextracti64x4_high(vtmp2, src2);
2198   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2199 }
2200 
2201 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2202   if (opcode == Op_AddReductionVI) {
2203     if (vtmp1 != src2) {
2204       movdqu(vtmp1, src2);
2205     }
2206     phaddw(vtmp1, vtmp1);
2207     phaddw(vtmp1, vtmp1);
2208   } else {
2209     pshufd(vtmp2, src2, 0x1);
2210     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2211     movdqu(vtmp1, vtmp2);
2212     psrldq(vtmp1, 2);
2213     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2214   }
2215   movdl(vtmp2, src1);
2216   pmovsxwd(vtmp1, vtmp1);
2217   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2218   pextrw(dst, vtmp1, 0x0);
2219   movswl(dst, dst);
2220 }
2221 
2222 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2223   if (opcode == Op_AddReductionVI) {
2224     if (vtmp1 != src2) {
2225       movdqu(vtmp1, src2);
2226     }
2227     phaddw(vtmp1, src2);
2228   } else {
2229     pshufd(vtmp1, src2, 0xE);
2230     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2231   }
2232   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2233 }
2234 
2235 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2236   if (opcode == Op_AddReductionVI) {
2237     int vector_len = Assembler::AVX_256bit;
2238     vphaddw(vtmp2, src2, src2, vector_len);
2239     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2240   } else {
2241     vextracti128_high(vtmp2, src2);
2242     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2243   }
2244   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2245 }
2246 
2247 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2248   int vector_len = Assembler::AVX_256bit;
2249   vextracti64x4_high(vtmp1, src2);
2250   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2251   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2252 }
2253 
2254 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255   pshufd(vtmp2, src2, 0xE);
2256   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2257   movdq(vtmp1, src1);
2258   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2259   movdq(dst, vtmp1);
2260 }
2261 
2262 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2263   vextracti128_high(vtmp1, src2);
2264   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2265   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2266 }
2267 
2268 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2269   vextracti64x4_high(vtmp2, src2);
2270   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2271   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2272 }
2273 
2274 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
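       // Build a k-register mask with the low 'len' bits set:
       // bzhi zeroes all bits of -1 at positions >= len.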
2275   mov64(temp, -1L);
2276   bzhiq(temp, temp, len);
2277   kmovql(dst, temp);
2278 }
2279 
2280 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2281   reduce_operation_128(T_FLOAT, opcode, dst, src);
2282   pshufd(vtmp, src, 0x1);
2283   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2284 }
2285 
2286 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2287   reduce2F(opcode, dst, src, vtmp);
2288   pshufd(vtmp, src, 0x2);
2289   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2290   pshufd(vtmp, src, 0x3);
2291   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2292 }
2293 
2294 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2295   reduce4F(opcode, dst, src, vtmp2);
2296   vextractf128_high(vtmp2, src);
2297   reduce4F(opcode, dst, vtmp2, vtmp1);
2298 }
2299 
2300 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2301   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2302   vextracti64x4_high(vtmp1, src);
2303   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2304 }
2305 
2306 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2307   pshufd(dst, src, 0x1);
2308   reduce_operation_128(T_FLOAT, opcode, dst, src);
2309 }
2310 
2311 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2312   pshufd(vtmp, src, 0xE);
2313   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2314   unorderedReduce2F(opcode, dst, vtmp);
2315 }
2316 
2317 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2318   vextractf128_high(vtmp1, src);
2319   unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2320   unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2321 }
2322 
2323 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2324   vextractf64x4_high(vtmp2, src);
2325   unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2326   unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2327 }
2328 
2329 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2330   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2331   pshufd(vtmp, src, 0xE);
2332   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2333 }
2334 
2335 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2336   reduce2D(opcode, dst, src, vtmp2);
2337   vextractf128_high(vtmp2, src);
2338   reduce2D(opcode, dst, vtmp2, vtmp1);
2339 }
2340 
2341 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2342   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2343   vextracti64x4_high(vtmp1, src);
2344   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2345 }
2346 
2347 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2348   pshufd(dst, src, 0xE);
2349   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2350 }
2351 
2352 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2353   vextractf128_high(vtmp, src);
2354   unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2355   unorderedReduce2D(opcode, dst, vtmp);
2356 }
2357 
2358 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2359   vextractf64x4_high(vtmp2, src);
2360   unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2361   unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2362 }
2363 
2364 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2365   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2366 }
2367 
2368 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2369   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2370 }
2371 
2372 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2373   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2374 }
2375 
2376 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2377                                  int vec_enc) {
2378   switch(elem_bt) {
2379     case T_INT:
2380     case T_FLOAT:
2381       vmaskmovps(dst, src, mask, vec_enc);
2382       break;
2383     case T_LONG:
2384     case T_DOUBLE:
2385       vmaskmovpd(dst, src, mask, vec_enc);
2386       break;
2387     default:
2388       fatal("Unsupported type %s", type2name(elem_bt));
2389       break;
2390   }
2391 }
2392 
2393 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2394                                  int vec_enc) {
2395   switch(elem_bt) {
2396     case T_INT:
2397     case T_FLOAT:
2398       vmaskmovps(dst, src, mask, vec_enc);
2399       break;
2400     case T_LONG:
2401     case T_DOUBLE:
2402       vmaskmovpd(dst, src, mask, vec_enc);
2403       break;
2404     default:
2405       fatal("Unsupported type %s", type2name(elem_bt));
2406       break;
2407   }
2408 }
2409 
2410 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2411                                           XMMRegister dst, XMMRegister src,
2412                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2413                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2414   const int permconst[] = {1, 14};
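       // vpermilps immediates for the in-lane folds: 1 (0b01) brings float 1 down
       // to lane 0; 14 (0b1110) brings floats 2 and 3 down to lanes 0 and 1.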
2415   XMMRegister wsrc = src;
2416   XMMRegister wdst = xmm_0;
2417   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2418 
2419   int vlen_enc = Assembler::AVX_128bit;
2420   if (vlen == 16) {
2421     vlen_enc = Assembler::AVX_256bit;
2422   }
2423 
2424   for (int i = log2(vlen) - 1; i >=0; i--) {
2425     if (i == 0 && !is_dst_valid) {
2426       wdst = dst;
2427     }
2428     if (i == 3) {
2429       vextracti64x4_high(wtmp, wsrc);
2430     } else if (i == 2) {
2431       vextracti128_high(wtmp, wsrc);
2432     } else { // i = [0,1]
2433       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2434     }
2435     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2436     wsrc = wdst;
2437     vlen_enc = Assembler::AVX_128bit;
2438   }
2439   if (is_dst_valid) {
2440     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2441   }
2442 }
2443 
2444 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2445                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2446                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2447   XMMRegister wsrc = src;
2448   XMMRegister wdst = xmm_0;
2449   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2450   int vlen_enc = Assembler::AVX_128bit;
2451   if (vlen == 8) {
2452     vlen_enc = Assembler::AVX_256bit;
2453   }
2454   for (int i = log2(vlen) - 1; i >=0; i--) {
2455     if (i == 0 && !is_dst_valid) {
2456       wdst = dst;
2457     }
2458     if (i == 1) {
2459       vextracti128_high(wtmp, wsrc);
2460     } else if (i == 2) {
2461       vextracti64x4_high(wtmp, wsrc);
2462     } else {
2463       assert(i == 0, "%d", i);
2464       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2465     }
2466     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2467     wsrc = wdst;
2468     vlen_enc = Assembler::AVX_128bit;
2469   }
2470   if (is_dst_valid) {
2471     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2472   }
2473 }
2474 
2475 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2476   switch (bt) {
2477     case T_BYTE:  pextrb(dst, src, idx); break;
2478     case T_SHORT: pextrw(dst, src, idx); break;
2479     case T_INT:   pextrd(dst, src, idx); break;
2480     case T_LONG:  pextrq(dst, src, idx); break;
2481 
2482     default:
2483       assert(false,"Should not reach here.");
2484       break;
2485   }
2486 }
2487 
2488 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2489   int esize =  type2aelembytes(typ);
2490   int elem_per_lane = 16/esize;
2491   int lane = elemindex / elem_per_lane;
2492   int eindex = elemindex % elem_per_lane;
2493 
2494   if (lane >= 2) {
2495     assert(UseAVX > 2, "required");
2496     vextractf32x4(dst, src, lane & 3);
2497     return dst;
2498   } else if (lane > 0) {
2499     assert(UseAVX > 0, "required");
2500     vextractf128(dst, src, lane);
2501     return dst;
2502   } else {
2503     return src;
2504   }
2505 }
2506 
2507 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2508   if (typ == T_BYTE) {
2509     movsbl(dst, dst);
2510   } else if (typ == T_SHORT) {
2511     movswl(dst, dst);
2512   }
2513 }
2514 
2515 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2516   int esize =  type2aelembytes(typ);
2517   int elem_per_lane = 16/esize;
2518   int eindex = elemindex % elem_per_lane;
2519   assert(is_integral_type(typ),"required");
2520 
2521   if (eindex == 0) {
2522     if (typ == T_LONG) {
2523       movq(dst, src);
2524     } else {
2525       movdl(dst, src);
2526       movsxl(typ, dst);
2527     }
2528   } else {
2529     extract(typ, dst, src, eindex);
2530     movsxl(typ, dst);
2531   }
2532 }
2533 
2534 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2535   int esize =  type2aelembytes(typ);
2536   int elem_per_lane = 16/esize;
2537   int eindex = elemindex % elem_per_lane;
2538   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2539 
2540   if (eindex == 0) {
2541     movq(dst, src);
2542   } else {
2543     if (typ == T_FLOAT) {
2544       if (UseAVX == 0) {
2545         movdqu(dst, src);
2546         shufps(dst, dst, eindex);
2547       } else {
2548         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2549       }
2550     } else {
2551       if (UseAVX == 0) {
2552         movdqu(dst, src);
2553         psrldq(dst, eindex*esize);
2554       } else {
2555         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2556       }
2557       movq(dst, dst);
2558     }
2559   }
2560   // Zero upper bits
2561   if (typ == T_FLOAT) {
2562     if (UseAVX == 0) {
2563       assert(vtmp != xnoreg, "required.");
2564       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2565       pand(dst, vtmp);
2566     } else {
2567       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2568     }
2569   }
2570 }
2571 
2572 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2573   switch(typ) {
2574     case T_BYTE:
2575     case T_BOOLEAN:
2576       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2577       break;
2578     case T_SHORT:
2579     case T_CHAR:
2580       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2581       break;
2582     case T_INT:
2583     case T_FLOAT:
2584       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2585       break;
2586     case T_LONG:
2587     case T_DOUBLE:
2588       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2589       break;
2590     default:
2591       assert(false,"Should not reach here.");
2592       break;
2593   }
2594 }
2595 
2596 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2597   assert(rscratch != noreg || always_reachable(src2), "missing");
2598 
2599   switch(typ) {
2600     case T_BOOLEAN:
2601     case T_BYTE:
2602       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2603       break;
2604     case T_CHAR:
2605     case T_SHORT:
2606       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2607       break;
2608     case T_INT:
2609     case T_FLOAT:
2610       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2611       break;
2612     case T_LONG:
2613     case T_DOUBLE:
2614       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2615       break;
2616     default:
2617       assert(false,"Should not reach here.");
2618       break;
2619   }
2620 }
2621 
2622 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2623   switch(typ) {
2624     case T_BYTE:
2625       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2626       break;
2627     case T_SHORT:
2628       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2629       break;
2630     case T_INT:
2631     case T_FLOAT:
2632       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2633       break;
2634     case T_LONG:
2635     case T_DOUBLE:
2636       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2637       break;
2638     default:
2639       assert(false,"Should not reach here.");
2640       break;
2641   }
2642 }
2643 
2644 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2645   assert(vlen_in_bytes <= 32, "");
2646   int esize = type2aelembytes(bt);
2647   if (vlen_in_bytes == 32) {
2648     assert(vtmp == xnoreg, "required.");
2649     if (esize >= 4) {
2650       vtestps(src1, src2, AVX_256bit);
2651     } else {
2652       vptest(src1, src2, AVX_256bit);
2653     }
2654     return;
2655   }
2656   if (vlen_in_bytes < 16) {
2657     // Duplicate the lower part to fill the whole register;
2658     // there is no need to do so for src2.
2659     assert(vtmp != xnoreg, "required");
2660     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2661     pshufd(vtmp, src1, shuffle_imm);
2662   } else {
2663     assert(vtmp == xnoreg, "required");
2664     vtmp = src1;
2665   }
2666   if (esize >= 4 && VM_Version::supports_avx()) {
2667     vtestps(vtmp, src2, AVX_128bit);
2668   } else {
2669     ptest(vtmp, src2);
2670   }
2671 }
2672 
2673 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2674 #ifdef ASSERT
2675   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2676   bool is_bw_supported = VM_Version::supports_avx512bw();
2677   if (is_bw && !is_bw_supported) {
2678     assert(vlen_enc != Assembler::AVX_512bit, "required");
2679     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2680            "XMM register should be 0-15");
2681   }
2682 #endif // ASSERT
2683   switch (elem_bt) {
2684     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2685     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2686     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2687     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2688     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2689     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2690     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2691   }
2692 }
2693 
2694 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2695   assert(UseAVX >= 2, "required");
2696   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2697   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2698   if ((UseAVX > 2) &&
2699       (!is_bw || VM_Version::supports_avx512bw()) &&
2700       (!is_vl || VM_Version::supports_avx512vl())) {
2701     switch (elem_bt) {
2702       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2703       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2704       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2705       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2706       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2707     }
2708   } else {
2709     assert(vlen_enc != Assembler::AVX_512bit, "required");
2710     assert((dst->encoding() < 16),"XMM register should be 0-15");
2711     switch (elem_bt) {
2712       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2713       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2714       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2715       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2716       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2717       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2718       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2719     }
2720   }
2721 }
2722 
2723 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
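       // Per-lane semantics (illustrative): dst[i] = convert<to_elem_bt>((int8_t) src[i]),
       // i.e. sign-extend bytes to short/int/long, or sign-extend to int and then
       // convert to float/double for the floating-point cases.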
2724   switch (to_elem_bt) {
2725     case T_SHORT:
2726       vpmovsxbw(dst, src, vlen_enc);
2727       break;
2728     case T_INT:
2729       vpmovsxbd(dst, src, vlen_enc);
2730       break;
2731     case T_FLOAT:
2732       vpmovsxbd(dst, src, vlen_enc);
2733       vcvtdq2ps(dst, dst, vlen_enc);
2734       break;
2735     case T_LONG:
2736       vpmovsxbq(dst, src, vlen_enc);
2737       break;
2738     case T_DOUBLE: {
2739       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2740       vpmovsxbd(dst, src, mid_vlen_enc);
2741       vcvtdq2pd(dst, dst, vlen_enc);
2742       break;
2743     }
2744     default:
2745       fatal("Unsupported type %s", type2name(to_elem_bt));
2746       break;
2747   }
2748 }
2749 
2750 //-------------------------------------------------------------------------------------------
2751 
2752 // IndexOf for constant substrings with size >= 8 chars
2753 // which don't need to be loaded through the stack.
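     // Equivalent scalar semantics (an illustrative sketch only; the generated code
     // below scans with pcmpestri and handles the LL/UU/UL encodings):
     //   for (int i = 0; i + cnt2 <= cnt1; i++) {
     //     int j = 0;
     //     while (j < cnt2 && str1[i + j] == str2[j]) j++;
     //     if (j == cnt2) return i;  // index of the first match
     //   }
     //   return -1;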
2754 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2755                                          Register cnt1, Register cnt2,
2756                                          int int_cnt2,  Register result,
2757                                          XMMRegister vec, Register tmp,
2758                                          int ae) {
2759   ShortBranchVerifier sbv(this);
2760   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2761   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2762 
2763   // This method uses the pcmpestri instruction with bound registers
2764   //   inputs:
2765   //     xmm - substring
2766   //     rax - substring length (elements count)
2767   //     mem - scanned string
2768   //     rdx - string length (elements count)
2769   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2770   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2771   //   outputs:
2772   //     rcx - matched index in string
2773   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2774   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2775   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2776   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2777   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2778 
2779   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2780         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2781         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2782 
2783   // Note, inline_string_indexOf() generates checks:
2784   // if (substr.count > string.count) return -1;
2785   // if (substr.count == 0) return 0;
2786   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2787 
2788   // Load substring.
2789   if (ae == StrIntrinsicNode::UL) {
2790     pmovzxbw(vec, Address(str2, 0));
2791   } else {
2792     movdqu(vec, Address(str2, 0));
2793   }
2794   movl(cnt2, int_cnt2);
2795   movptr(result, str1); // string addr
2796 
2797   if (int_cnt2 > stride) {
2798     jmpb(SCAN_TO_SUBSTR);
2799 
2800     // Reload substr for rescan; this code
2801     // is executed only for large substrings (> 8 chars).
2802     bind(RELOAD_SUBSTR);
2803     if (ae == StrIntrinsicNode::UL) {
2804       pmovzxbw(vec, Address(str2, 0));
2805     } else {
2806       movdqu(vec, Address(str2, 0));
2807     }
2808     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2809 
2810     bind(RELOAD_STR);
2811     // We came here after the beginning of the substring was
2812     // matched but the rest of it was not so we need to search
2813     // again. Start from the next element after the previous match.
2814 
2815     // cnt2 is the number of remaining substring elements and
2816     // cnt1 is the number of remaining string elements when the compare failed.
2817     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2818     subl(cnt1, cnt2);
2819     addl(cnt1, int_cnt2);
2820     movl(cnt2, int_cnt2); // Now restore cnt2
2821 
2822     decrementl(cnt1);     // Shift to next element
2823     cmpl(cnt1, cnt2);
2824     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2825 
2826     addptr(result, (1<<scale1));
2827 
2828   } // (int_cnt2 > 8)
2829 
2830   // Scan string for start of substr in 16-byte vectors
2831   bind(SCAN_TO_SUBSTR);
2832   pcmpestri(vec, Address(result, 0), mode);
2833   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2834   subl(cnt1, stride);
2835   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2836   cmpl(cnt1, cnt2);
2837   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2838   addptr(result, 16);
2839   jmpb(SCAN_TO_SUBSTR);
2840 
2841   // Found a potential substr
2842   bind(FOUND_CANDIDATE);
2843   // Matched whole vector if first element matched (tmp(rcx) == 0).
2844   if (int_cnt2 == stride) {
2845     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2846   } else { // int_cnt2 > 8
2847     jccb(Assembler::overflow, FOUND_SUBSTR);
2848   }
2849   // After pcmpestri tmp(rcx) contains matched element index
2850   // Compute start addr of substr
2851   lea(result, Address(result, tmp, scale1));
2852 
2853   // Make sure string is still long enough
2854   subl(cnt1, tmp);
2855   cmpl(cnt1, cnt2);
2856   if (int_cnt2 == stride) {
2857     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2858   } else { // int_cnt2 > 8
2859     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2860   }
2861   // Fewer elements left than the substring.
2862 
2863   bind(RET_NOT_FOUND);
2864   movl(result, -1);
2865   jmp(EXIT);
2866 
2867   if (int_cnt2 > stride) {
2868     // This code is optimized for the case when whole substring
2869     // is matched if its head is matched.
2870     bind(MATCH_SUBSTR_HEAD);
2871     pcmpestri(vec, Address(result, 0), mode);
2872     // Reload only the string if it does not match
2873     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2874 
2875     Label CONT_SCAN_SUBSTR;
2876     // Compare the rest of substring (> 8 chars).
2877     bind(FOUND_SUBSTR);
2878     // First 8 chars are already matched.
2879     negptr(cnt2);
2880     addptr(cnt2, stride);
2881 
2882     bind(SCAN_SUBSTR);
2883     subl(cnt1, stride);
2884     cmpl(cnt2, -stride); // Do not read beyond substring
2885     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2886     // Back up the strings to avoid reading beyond the substring:
2887     // cnt1 = cnt1 - cnt2 + 8
2888     addl(cnt1, cnt2); // cnt2 is negative
2889     addl(cnt1, stride);
2890     movl(cnt2, stride); negptr(cnt2);
2891     bind(CONT_SCAN_SUBSTR);
2892     if (int_cnt2 < (int)G) {
2893       int tail_off1 = int_cnt2<<scale1;
2894       int tail_off2 = int_cnt2<<scale2;
2895       if (ae == StrIntrinsicNode::UL) {
2896         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2897       } else {
2898         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2899       }
2900       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2901     } else {
2902       // calculate index in register to avoid integer overflow (int_cnt2*2)
2903       movl(tmp, int_cnt2);
2904       addptr(tmp, cnt2);
2905       if (ae == StrIntrinsicNode::UL) {
2906         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2907       } else {
2908         movdqu(vec, Address(str2, tmp, scale2, 0));
2909       }
2910       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2911     }
2912     // Need to reload the string pointers if the whole vector did not match
2913     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2914     addptr(cnt2, stride);
2915     jcc(Assembler::negative, SCAN_SUBSTR);
2916     // Fall through if found full substring
2917 
2918   } // (int_cnt2 > 8)
2919 
2920   bind(RET_FOUND);
2921   // Found result if we matched full small substring.
2922   // Compute substr offset
2923   subptr(result, str1);
2924   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2925     shrl(result, 1); // index
2926   }
2927   bind(EXIT);
2928 
2929 } // string_indexofC8
2930 
2931 // Small strings are loaded through the stack if they cross a page boundary.
2932 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2933                                        Register cnt1, Register cnt2,
2934                                        int int_cnt2,  Register result,
2935                                        XMMRegister vec, Register tmp,
2936                                        int ae) {
2937   ShortBranchVerifier sbv(this);
2938   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2939   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2940 
2941   //
2942   // int_cnt2 is the length of a small (< 8 chars) constant substring
2943   // or (-1) for a non-constant substring, in which case its length
2944   // is in the cnt2 register.
2945   //
2946   // Note, inline_string_indexOf() generates checks:
2947   // if (substr.count > string.count) return -1;
2948   // if (substr.count == 0) return 0;
2949   //
2950   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2951   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2952   // This method uses the pcmpestri instruction with bound registers
2953   //   inputs:
2954   //     xmm - substring
2955   //     rax - substring length (elements count)
2956   //     mem - scanned string
2957   //     rdx - string length (elements count)
2958   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2959   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2960   //   outputs:
2961   //     rcx - matched index in string
2962   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2963   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2964   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2965   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2966 
2967   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2968         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2969         FOUND_CANDIDATE;
2970 
2971   { //========================================================
2972     // We don't know where these strings are located
2973     // and we can't read beyond them. Load them through the stack.
2974     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2975 
2976     movptr(tmp, rsp); // save old SP
2977 
2978     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2979       if (int_cnt2 == (1>>scale2)) { // One byte
2980         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2981         load_unsigned_byte(result, Address(str2, 0));
2982         movdl(vec, result); // move 32 bits
2983       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2984         // Not enough header space in 32-bit VM: 12+3 = 15.
2985         movl(result, Address(str2, -1));
2986         shrl(result, 8);
2987         movdl(vec, result); // move 32 bits
2988       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2989         load_unsigned_short(result, Address(str2, 0));
2990         movdl(vec, result); // move 32 bits
2991       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2992         movdl(vec, Address(str2, 0)); // move 32 bits
2993       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2994         movq(vec, Address(str2, 0));  // move 64 bits
2995       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2996         // Array header size is 12 bytes in 32-bit VM
2997         // + 6 bytes for 3 chars == 18 bytes,
2998         // enough space to load vec and shift.
2999         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3000         if (ae == StrIntrinsicNode::UL) {
3001           int tail_off = int_cnt2-8;
3002           pmovzxbw(vec, Address(str2, tail_off));
3003           psrldq(vec, -2*tail_off);
3004         }
3005         else {
3006           int tail_off = int_cnt2*(1<<scale2);
3007           movdqu(vec, Address(str2, tail_off-16));
3008           psrldq(vec, 16-tail_off);
3009         }
3010       }
3011     } else { // not constant substring
3012       cmpl(cnt2, stride);
3013       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3014 
3015       // We can read beyond the string if str2+16 does not cross a page boundary
3016       // since heaps are aligned and mapped by pages.
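           // Illustrative form of the check below (C-like, with page_size = os::vm_page_size()):
           //   safe_to_load_16 = ((str2 & (page_size - 1)) <= page_size - 16);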
3017       assert(os::vm_page_size() < (int)G, "default page should be small");
3018       movl(result, str2); // We need only low 32 bits
3019       andl(result, ((int)os::vm_page_size()-1));
3020       cmpl(result, ((int)os::vm_page_size()-16));
3021       jccb(Assembler::belowEqual, CHECK_STR);
3022 
3023       // Move small strings to the stack to allow loading 16 bytes into vec.
3024       subptr(rsp, 16);
3025       int stk_offset = wordSize-(1<<scale2);
3026       push(cnt2);
3027 
3028       bind(COPY_SUBSTR);
3029       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3030         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3031         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3032       } else if (ae == StrIntrinsicNode::UU) {
3033         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3034         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3035       }
3036       decrement(cnt2);
3037       jccb(Assembler::notZero, COPY_SUBSTR);
3038 
3039       pop(cnt2);
3040       movptr(str2, rsp);  // New substring address
3041     } // non constant
3042 
3043     bind(CHECK_STR);
3044     cmpl(cnt1, stride);
3045     jccb(Assembler::aboveEqual, BIG_STRINGS);
3046 
3047     // Check cross page boundary.
3048     movl(result, str1); // We need only low 32 bits
3049     andl(result, ((int)os::vm_page_size()-1));
3050     cmpl(result, ((int)os::vm_page_size()-16));
3051     jccb(Assembler::belowEqual, BIG_STRINGS);
3052 
3053     subptr(rsp, 16);
3054     int stk_offset = -(1<<scale1);
3055     if (int_cnt2 < 0) { // not constant
3056       push(cnt2);
3057       stk_offset += wordSize;
3058     }
3059     movl(cnt2, cnt1);
3060 
3061     bind(COPY_STR);
3062     if (ae == StrIntrinsicNode::LL) {
3063       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3064       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3065     } else {
3066       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3067       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3068     }
3069     decrement(cnt2);
3070     jccb(Assembler::notZero, COPY_STR);
3071 
3072     if (int_cnt2 < 0) { // not constant
3073       pop(cnt2);
3074     }
3075     movptr(str1, rsp);  // New string address
3076 
3077     bind(BIG_STRINGS);
3078     // Load substring.
3079     if (int_cnt2 < 0) { // -1
3080       if (ae == StrIntrinsicNode::UL) {
3081         pmovzxbw(vec, Address(str2, 0));
3082       } else {
3083         movdqu(vec, Address(str2, 0));
3084       }
3085       push(cnt2);       // substr count
3086       push(str2);       // substr addr
3087       push(str1);       // string addr
3088     } else {
3089       // Small (< 8 chars) constant substrings are loaded already.
3090       movl(cnt2, int_cnt2);
3091     }
3092     push(tmp);  // original SP
3093 
3094   } // Finished loading
3095 
3096   //========================================================
3097   // Start search
3098   //
3099 
3100   movptr(result, str1); // string addr
3101 
3102   if (int_cnt2 < 0) {  // Only for a non-constant substring
3103     jmpb(SCAN_TO_SUBSTR);
3104 
3105     // SP saved at sp+0
3106     // String saved at sp+1*wordSize
3107     // Substr saved at sp+2*wordSize
3108     // Substr count saved at sp+3*wordSize
3109 
3110     // Reload substr for rescan; this code
3111     // is executed only for large substrings (> 8 chars).
3112     bind(RELOAD_SUBSTR);
3113     movptr(str2, Address(rsp, 2*wordSize));
3114     movl(cnt2, Address(rsp, 3*wordSize));
3115     if (ae == StrIntrinsicNode::UL) {
3116       pmovzxbw(vec, Address(str2, 0));
3117     } else {
3118       movdqu(vec, Address(str2, 0));
3119     }
3120     // We came here after the beginning of the substring was
3121     // matched but the rest of it was not so we need to search
3122     // again. Start from the next element after the previous match.
3123     subptr(str1, result); // Restore counter
3124     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3125       shrl(str1, 1);
3126     }
3127     addl(cnt1, str1);
3128     decrementl(cnt1);   // Shift to next element
3129     cmpl(cnt1, cnt2);
3130     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3131 
3132     addptr(result, (1<<scale1));
3133   } // non constant
3134 
3135   // Scan string for start of substr in 16-byte vectors
3136   bind(SCAN_TO_SUBSTR);
3137   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3138   pcmpestri(vec, Address(result, 0), mode);
3139   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3140   subl(cnt1, stride);
3141   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3142   cmpl(cnt1, cnt2);
3143   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3144   addptr(result, 16);
3145 
3146   bind(ADJUST_STR);
3147   cmpl(cnt1, stride); // Do not read beyond string
3148   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3149   // Back up the string to avoid reading beyond it.
3150   lea(result, Address(result, cnt1, scale1, -16));
3151   movl(cnt1, stride);
3152   jmpb(SCAN_TO_SUBSTR);
3153 
3154   // Found a potential substr
3155   bind(FOUND_CANDIDATE);
3156   // After pcmpestri tmp(rcx) contains matched element index
3157 
3158   // Make sure string is still long enough
3159   subl(cnt1, tmp);
3160   cmpl(cnt1, cnt2);
3161   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3162   // Fewer elements left than the substring.
3163 
3164   bind(RET_NOT_FOUND);
3165   movl(result, -1);
3166   jmp(CLEANUP);
3167 
3168   bind(FOUND_SUBSTR);
3169   // Compute start addr of substr
3170   lea(result, Address(result, tmp, scale1));
3171   if (int_cnt2 > 0) { // Constant substring
3172     // Repeat search for small substring (< 8 chars)
3173     // from new point without reloading substring.
3174     // Have to check that we don't read beyond string.
3175     cmpl(tmp, stride-int_cnt2);
3176     jccb(Assembler::greater, ADJUST_STR);
3177     // Fall through if matched whole substring.
3178   } else { // non constant
3179     assert(int_cnt2 == -1, "should be != 0");
3180 
3181     addl(tmp, cnt2);
3182     // Found result if we matched whole substring.
3183     cmpl(tmp, stride);
3184     jcc(Assembler::lessEqual, RET_FOUND);
3185 
3186     // Repeat search for small substring (<= 8 chars)
3187     // from new point 'str1' without reloading substring.
3188     cmpl(cnt2, stride);
3189     // Have to check that we don't read beyond string.
3190     jccb(Assembler::lessEqual, ADJUST_STR);
3191 
3192     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3193     // Compare the rest of substring (> 8 chars).
3194     movptr(str1, result);
3195 
3196     cmpl(tmp, cnt2);
3197     // First 8 chars are already matched.
3198     jccb(Assembler::equal, CHECK_NEXT);
3199 
3200     bind(SCAN_SUBSTR);
3201     pcmpestri(vec, Address(str1, 0), mode);
3202     // Need to reload the string pointers if the whole vector did not match
3203     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3204 
3205     bind(CHECK_NEXT);
3206     subl(cnt2, stride);
3207     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3208     addptr(str1, 16);
3209     if (ae == StrIntrinsicNode::UL) {
3210       addptr(str2, 8);
3211     } else {
3212       addptr(str2, 16);
3213     }
3214     subl(cnt1, stride);
3215     cmpl(cnt2, stride); // Do not read beyond substring
3216     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3217     // Back up the strings to avoid reading beyond the substring.
3218 
3219     if (ae == StrIntrinsicNode::UL) {
3220       lea(str2, Address(str2, cnt2, scale2, -8));
3221       lea(str1, Address(str1, cnt2, scale1, -16));
3222     } else {
3223       lea(str2, Address(str2, cnt2, scale2, -16));
3224       lea(str1, Address(str1, cnt2, scale1, -16));
3225     }
3226     subl(cnt1, cnt2);
3227     movl(cnt2, stride);
3228     addl(cnt1, stride);
3229     bind(CONT_SCAN_SUBSTR);
3230     if (ae == StrIntrinsicNode::UL) {
3231       pmovzxbw(vec, Address(str2, 0));
3232     } else {
3233       movdqu(vec, Address(str2, 0));
3234     }
3235     jmp(SCAN_SUBSTR);
3236 
3237     bind(RET_FOUND_LONG);
3238     movptr(str1, Address(rsp, wordSize));
3239   } // non constant
3240 
3241   bind(RET_FOUND);
3242   // Compute substr offset
3243   subptr(result, str1);
3244   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3245     shrl(result, 1); // index
3246   }
3247   bind(CLEANUP);
3248   pop(rsp); // restore SP
3249 
3250 } // string_indexof
3251 
3252 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3253                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
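       // Scalar semantics (illustrative): return the index of the first UTF-16 char
       // in str1[0..cnt1) that equals 'ch', or -1 if there is none:
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;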
3254   ShortBranchVerifier sbv(this);
3255   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3256 
3257   int stride = 8;
3258 
3259   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3260         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3261         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3262         FOUND_SEQ_CHAR, DONE_LABEL;
3263 
3264   movptr(result, str1);
3265   if (UseAVX >= 2) {
3266     cmpl(cnt1, stride);
3267     jcc(Assembler::less, SCAN_TO_CHAR);
3268     cmpl(cnt1, 2*stride);
3269     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3270     movdl(vec1, ch);
3271     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3272     vpxor(vec2, vec2);
3273     movl(tmp, cnt1);
3274     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3275     andl(cnt1,0x0000000F);  //tail count (in chars)
3276 
3277     bind(SCAN_TO_16_CHAR_LOOP);
3278     vmovdqu(vec3, Address(result, 0));
3279     vpcmpeqw(vec3, vec3, vec1, 1);
3280     vptest(vec2, vec3);
3281     jcc(Assembler::carryClear, FOUND_CHAR);
3282     addptr(result, 32);
3283     subl(tmp, 2*stride);
3284     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3285     jmp(SCAN_TO_8_CHAR);
3286     bind(SCAN_TO_8_CHAR_INIT);
3287     movdl(vec1, ch);
3288     pshuflw(vec1, vec1, 0x00);
3289     pshufd(vec1, vec1, 0);
3290     pxor(vec2, vec2);
3291   }
3292   bind(SCAN_TO_8_CHAR);
3293   cmpl(cnt1, stride);
3294   jcc(Assembler::less, SCAN_TO_CHAR);
3295   if (UseAVX < 2) {
3296     movdl(vec1, ch);
3297     pshuflw(vec1, vec1, 0x00);
3298     pshufd(vec1, vec1, 0);
3299     pxor(vec2, vec2);
3300   }
3301   movl(tmp, cnt1);
3302   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3303   andl(cnt1,0x00000007);  //tail count (in chars)
3304 
3305   bind(SCAN_TO_8_CHAR_LOOP);
3306   movdqu(vec3, Address(result, 0));
3307   pcmpeqw(vec3, vec1);
3308   ptest(vec2, vec3);
3309   jcc(Assembler::carryClear, FOUND_CHAR);
3310   addptr(result, 16);
3311   subl(tmp, stride);
3312   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3313   bind(SCAN_TO_CHAR);
3314   testl(cnt1, cnt1);
3315   jcc(Assembler::zero, RET_NOT_FOUND);
3316   bind(SCAN_TO_CHAR_LOOP);
3317   load_unsigned_short(tmp, Address(result, 0));
3318   cmpl(ch, tmp);
3319   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3320   addptr(result, 2);
3321   subl(cnt1, 1);
3322   jccb(Assembler::zero, RET_NOT_FOUND);
3323   jmp(SCAN_TO_CHAR_LOOP);
3324 
3325   bind(RET_NOT_FOUND);
3326   movl(result, -1);
3327   jmpb(DONE_LABEL);
3328 
3329   bind(FOUND_CHAR);
3330   if (UseAVX >= 2) {
3331     vpmovmskb(tmp, vec3);
3332   } else {
3333     pmovmskb(tmp, vec3);
3334   }
3335   bsfl(ch, tmp);
3336   addptr(result, ch);
3337 
3338   bind(FOUND_SEQ_CHAR);
3339   subptr(result, str1);
3340   shrl(result, 1);
3341 
3342   bind(DONE_LABEL);
3343 } // string_indexof_char
3344 
3345 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3346                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
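       // Scalar semantics (illustrative): return the index of the first byte in the
       // Latin-1 sequence str1[0..cnt1) whose unsigned value equals 'ch', or -1:
       //   for (int i = 0; i < cnt1; i++) {
       //     if ((str1[i] & 0xff) == ch) return i;
       //   }
       //   return -1;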
3347   ShortBranchVerifier sbv(this);
3348   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3349 
3350   int stride = 16;
3351 
3352   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3353         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3354         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3355         FOUND_SEQ_CHAR, DONE_LABEL;
3356 
3357   movptr(result, str1);
3358   if (UseAVX >= 2) {
3359     cmpl(cnt1, stride);
3360     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3361     cmpl(cnt1, stride*2);
3362     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3363     movdl(vec1, ch);
3364     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3365     vpxor(vec2, vec2);
3366     movl(tmp, cnt1);
3367     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3368     andl(cnt1,0x0000001F);  //tail count (in chars)
3369 
3370     bind(SCAN_TO_32_CHAR_LOOP);
3371     vmovdqu(vec3, Address(result, 0));
3372     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3373     vptest(vec2, vec3);
3374     jcc(Assembler::carryClear, FOUND_CHAR);
3375     addptr(result, 32);
3376     subl(tmp, stride*2);
3377     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3378     jmp(SCAN_TO_16_CHAR);
3379 
3380     bind(SCAN_TO_16_CHAR_INIT);
3381     movdl(vec1, ch);
3382     pxor(vec2, vec2);
3383     pshufb(vec1, vec2);
3384   }
3385 
3386   bind(SCAN_TO_16_CHAR);
3387   cmpl(cnt1, stride);
3388   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3389   if (UseAVX < 2) {
3390     movdl(vec1, ch);
3391     pxor(vec2, vec2);
3392     pshufb(vec1, vec2);
3393   }
3394   movl(tmp, cnt1);
3395   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3396   andl(cnt1,0x0000000F);  //tail count (in bytes)
3397 
3398   bind(SCAN_TO_16_CHAR_LOOP);
3399   movdqu(vec3, Address(result, 0));
3400   pcmpeqb(vec3, vec1);
3401   ptest(vec2, vec3);
3402   jcc(Assembler::carryClear, FOUND_CHAR);
3403   addptr(result, 16);
3404   subl(tmp, stride);
3405   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3406 
3407   bind(SCAN_TO_CHAR_INIT);
3408   testl(cnt1, cnt1);
3409   jcc(Assembler::zero, RET_NOT_FOUND);
3410   bind(SCAN_TO_CHAR_LOOP);
3411   load_unsigned_byte(tmp, Address(result, 0));
3412   cmpl(ch, tmp);
3413   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3414   addptr(result, 1);
3415   subl(cnt1, 1);
3416   jccb(Assembler::zero, RET_NOT_FOUND);
3417   jmp(SCAN_TO_CHAR_LOOP);
3418 
3419   bind(RET_NOT_FOUND);
3420   movl(result, -1);
3421   jmpb(DONE_LABEL);
3422 
3423   bind(FOUND_CHAR);
3424   if (UseAVX >= 2) {
3425     vpmovmskb(tmp, vec3);
3426   } else {
3427     pmovmskb(tmp, vec3);
3428   }
3429   bsfl(ch, tmp);
3430   addptr(result, ch);
3431 
3432   bind(FOUND_SEQ_CHAR);
3433   subptr(result, str1);
3434 
3435   bind(DONE_LABEL);
3436 } // stringL_indexof_char
3437 
3438 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3439   switch (eltype) {
3440   case T_BOOLEAN: return sizeof(jboolean);
3441   case T_BYTE:  return sizeof(jbyte);
3442   case T_SHORT: return sizeof(jshort);
3443   case T_CHAR:  return sizeof(jchar);
3444   case T_INT:   return sizeof(jint);
3445   default:
3446     ShouldNotReachHere();
3447     return -1;
3448   }
3449 }
3450 
3451 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3452   switch (eltype) {
3453   // T_BOOLEAN used as surrogate for unsigned byte
3454   case T_BOOLEAN: movzbl(dst, src);   break;
3455   case T_BYTE:    movsbl(dst, src);   break;
3456   case T_SHORT:   movswl(dst, src);   break;
3457   case T_CHAR:    movzwl(dst, src);   break;
3458   case T_INT:     movl(dst, src);     break;
3459   default:
3460     ShouldNotReachHere();
3461   }
3462 }
3463 
3464 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3465   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3466 }
3467 
3468 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3469   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3470 }
3471 
3472 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3473   const int vlen = Assembler::AVX_256bit;
3474   switch (eltype) {
3475   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3476   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3477   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3478   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3479   case T_INT:
3480     // do nothing
3481     break;
3482   default:
3483     ShouldNotReachHere();
3484   }
3485 }
3486 
3487 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3488                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3489                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3490                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3491                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3492                                         BasicType eltype) {
3493   ShortBranchVerifier sbv(this);
3494   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3495   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3496   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3497 
3498   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3499         SHORT_UNROLLED_LOOP_EXIT,
3500         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3501         UNROLLED_VECTOR_LOOP_BEGIN,
3502         END;
3503   switch (eltype) {
3504   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3505   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3506   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3507   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3508   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3509   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3510   }
3511 
3512   // Register aliases ("renaming") for readability of the code below
3513   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3514                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3515                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3516 
3517   const int elsize = arrays_hashcode_elsize(eltype);
3518 
3519   /*
3520     if (cnt1 >= 2) {
3521       if (cnt1 >= 32) {
3522         UNROLLED VECTOR LOOP
3523       }
3524       UNROLLED SCALAR LOOP
3525     }
3526     SINGLE SCALAR
3527    */
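       // Scalar recurrence being vectorized (illustrative; elements are widened to
       // int according to 'eltype', and 'result' carries the incoming partial hash):
       //   for (int i = 0; i < cnt1; i++) {
       //     result = 31 * result + ary1[i];
       //   }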
3528 
3529   cmpl(cnt1, 32);
3530   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3531 
3532   // cnt1 >= 32 && generate_vectorized_loop
3533   xorl(index, index);
3534 
3535   // vresult = IntVector.zero(I256);
3536   for (int idx = 0; idx < 4; idx++) {
3537     vpxor(vresult[idx], vresult[idx]);
3538   }
3539   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3540   Register bound = tmp2;
3541   Register next = tmp3;
3542   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3543   movl(next, Address(tmp2, 0));
3544   movdl(vnext, next);
3545   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3546 
3547   // index = 0;
3548   // bound = cnt1 & ~(32 - 1);
3549   movl(bound, cnt1);
3550   andl(bound, ~(32 - 1));
3551   // for (; index < bound; index += 32) {
3552   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3553   // result *= next;
3554   imull(result, next);
3555   // Loop fission to front-load the cost of fetching from memory; OoO execution
3556   // can then hopefully do a better job of prefetching.
3557   for (int idx = 0; idx < 4; idx++) {
3558     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3559   }
3560   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3561   for (int idx = 0; idx < 4; idx++) {
3562     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3563     arrays_hashcode_elvcast(vtmp[idx], eltype);
3564     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3565   }
3566   // index += 32;
3567   addl(index, 32);
3568   // index < bound;
3569   cmpl(index, bound);
3570   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3571   // }
3572 
3573   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3574   subl(cnt1, bound);
3575   // release bound
3576 
3577   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3578   for (int idx = 0; idx < 4; idx++) {
3579     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3580     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3581     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3582   }
3583   // result += vresult.reduceLanes(ADD);
3584   for (int idx = 0; idx < 4; idx++) {
3585     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3586   }
3587 
3588   // } else if (cnt1 < 32) {
3589 
3590   bind(SHORT_UNROLLED_BEGIN);
3591   // int i = 1;
3592   movl(index, 1);
3593   cmpl(index, cnt1);
3594   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3595 
3596   // for (; i < cnt1 ; i += 2) {
3597   bind(SHORT_UNROLLED_LOOP_BEGIN);
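       // Two elements per iteration:
       //   result = 31*31*result + 31*ary1[i-1] + ary1[i]
       // where 31*31 == 961 and 31*x is computed as (x << 5) - x.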
3598   movl(tmp3, 961);
3599   imull(result, tmp3);
3600   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3601   movl(tmp3, tmp2);
3602   shll(tmp3, 5);
3603   subl(tmp3, tmp2);
3604   addl(result, tmp3);
3605   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3606   addl(result, tmp3);
3607   addl(index, 2);
3608   cmpl(index, cnt1);
3609   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3610 
3611   // }
3612   // if (i >= cnt1) {
3613   bind(SHORT_UNROLLED_LOOP_EXIT);
3614   jccb(Assembler::greater, END);
3615   movl(tmp2, result);
3616   shll(result, 5);
3617   subl(result, tmp2);
3618   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3619   addl(result, tmp3);
3620   // }
3621   bind(END);
3622 
3623   BLOCK_COMMENT("} // arrays_hashcode");
3624 
3625 } // arrays_hashcode
3626 
3627 // helper function for string_compare
3628 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3629                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3630                                            Address::ScaleFactor scale2, Register index, int ae) {
3631   if (ae == StrIntrinsicNode::LL) {
3632     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3633     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3634   } else if (ae == StrIntrinsicNode::UU) {
3635     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3636     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3637   } else {
3638     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3639     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3640   }
3641 }
3642 
3643 // Compare strings, used for char[] and byte[].
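     // Scalar semantics (an illustrative sketch; the per-encoding element loads and
     // the LU/UL length and sign adjustments are handled in the code below):
     //   int lim = min(cnt1, cnt2);
     //   for (int k = 0; k < lim; k++) {
     //     if (str1[k] != str2[k]) return str1[k] - str2[k];
     //   }
     //   return cnt1 - cnt2;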
3644 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3645                                        Register cnt1, Register cnt2, Register result,
3646                                        XMMRegister vec1, int ae, KRegister mask) {
3647   ShortBranchVerifier sbv(this);
3648   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3649   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3650   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3651   int stride2x2 = 0x40;
3652   Address::ScaleFactor scale = Address::no_scale;
3653   Address::ScaleFactor scale1 = Address::no_scale;
3654   Address::ScaleFactor scale2 = Address::no_scale;
3655 
3656   if (ae != StrIntrinsicNode::LL) {
3657     stride2x2 = 0x20;
3658   }
3659 
3660   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3661     shrl(cnt2, 1);
3662   }
3663   // Compute the minimum of the string lengths and the
3664   // difference of the string lengths (pushed on the stack).
3665   // Use a conditional move to compute the minimum.
3666   movl(result, cnt1);
3667   subl(cnt1, cnt2);
3668   push(cnt1);
3669   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3670 
3671   // Is the minimum length zero?
3672   testl(cnt2, cnt2);
3673   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3674   if (ae == StrIntrinsicNode::LL) {
3675     // Load first bytes
3676     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3677     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3678   } else if (ae == StrIntrinsicNode::UU) {
3679     // Load first characters
3680     load_unsigned_short(result, Address(str1, 0));
3681     load_unsigned_short(cnt1, Address(str2, 0));
3682   } else {
3683     load_unsigned_byte(result, Address(str1, 0));
3684     load_unsigned_short(cnt1, Address(str2, 0));
3685   }
3686   subl(result, cnt1);
3687   jcc(Assembler::notZero,  POP_LABEL);
3688 
3689   if (ae == StrIntrinsicNode::UU) {
3690     // Divide length by 2 to get number of chars
3691     shrl(cnt2, 1);
3692   }
3693   cmpl(cnt2, 1);
3694   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3695 
3696   // Check if the strings start at the same location and setup scale and stride
3697   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3698     cmpptr(str1, str2);
3699     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3700     if (ae == StrIntrinsicNode::LL) {
3701       scale = Address::times_1;
3702       stride = 16;
3703     } else {
3704       scale = Address::times_2;
3705       stride = 8;
3706     }
3707   } else {
3708     scale1 = Address::times_1;
3709     scale2 = Address::times_2;
3710     // scale not used
3711     stride = 8;
3712   }
3713 
3714   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3715     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3716     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3717     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3718     Label COMPARE_TAIL_LONG;
3719     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3720 
3721     int pcmpmask = 0x19;
3722     if (ae == StrIntrinsicNode::LL) {
3723       pcmpmask &= ~0x01;
3724     }
3725 
3726     // Set up to compare 16-char (32-byte) vectors,
3727     // starting from the first character again because it has an aligned address.
3728     if (ae == StrIntrinsicNode::LL) {
3729       stride2 = 32;
3730     } else {
3731       stride2 = 16;
3732     }
3733     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734       adr_stride = stride << scale;
3735     } else {
3736       adr_stride1 = 8;  //stride << scale1;
3737       adr_stride2 = 16; //stride << scale2;
3738     }
3739 
3740     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3741     // rax and rdx are used by pcmpestri as elements counters
3742     movl(result, cnt2);
3743     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3744     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3745 
3746     // Fast path: compare the first two 8-char vectors.
3747     bind(COMPARE_16_CHARS);
3748     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3749       movdqu(vec1, Address(str1, 0));
3750     } else {
3751       pmovzxbw(vec1, Address(str1, 0));
3752     }
3753     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3754     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3755 
3756     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3757       movdqu(vec1, Address(str1, adr_stride));
3758       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3759     } else {
3760       pmovzxbw(vec1, Address(str1, adr_stride1));
3761       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3762     }
3763     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3764     addl(cnt1, stride);
3765 
3766     // Compare the characters at index in cnt1
3767     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3768     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3769     subl(result, cnt2);
3770     jmp(POP_LABEL);
3771 
3772     // Setup the registers to start vector comparison loop
3773     bind(COMPARE_WIDE_VECTORS);
3774     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3775       lea(str1, Address(str1, result, scale));
3776       lea(str2, Address(str2, result, scale));
3777     } else {
3778       lea(str1, Address(str1, result, scale1));
3779       lea(str2, Address(str2, result, scale2));
3780     }
3781     subl(result, stride2);
3782     subl(cnt2, stride2);
3783     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3784     negptr(result);
3785 
3786     // In a loop, compare 16 chars (32 bytes) at a time using (vpxor+vptest).
3787     bind(COMPARE_WIDE_VECTORS_LOOP);
3788 
3789     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3790       cmpl(cnt2, stride2x2);
3791       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3792       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3793       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3794 
3795       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3796       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3797         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3798         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise it has some 0s
3799       } else {
3800         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3801         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise it has some 0s
3802       }
3803       kortestql(mask, mask);
3804       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3805       addptr(result, stride2x2);  // update since we already compared at this addr
3806       subl(cnt2, stride2x2);      // and sub the size too
3807       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3808 
3809       vpxor(vec1, vec1);
3810       jmpb(COMPARE_WIDE_TAIL);
3811     }//if (VM_Version::supports_avx512vlbw())
3812 
3813     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3814     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3815       vmovdqu(vec1, Address(str1, result, scale));
3816       vpxor(vec1, Address(str2, result, scale));
3817     } else {
3818       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3819       vpxor(vec1, Address(str2, result, scale2));
3820     }
3821     vptest(vec1, vec1);
3822     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3823     addptr(result, stride2);
3824     subl(cnt2, stride2);
3825     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3826     // clean upper bits of YMM registers
3827     vpxor(vec1, vec1);
3828 
3829     // compare wide vectors tail
3830     bind(COMPARE_WIDE_TAIL);
3831     testptr(result, result);
3832     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3833 
3834     movl(result, stride2);
3835     movl(cnt2, result);
3836     negptr(result);
3837     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3838 
3839     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3840     bind(VECTOR_NOT_EQUAL);
3841     // clean upper bits of YMM registers
3842     vpxor(vec1, vec1);
3843     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3844       lea(str1, Address(str1, result, scale));
3845       lea(str2, Address(str2, result, scale));
3846     } else {
3847       lea(str1, Address(str1, result, scale1));
3848       lea(str2, Address(str2, result, scale2));
3849     }
3850     jmp(COMPARE_16_CHARS);
3851 
3852     // Compare tail chars, length between 1 and 15 chars
3853     bind(COMPARE_TAIL_LONG);
3854     movl(cnt2, result);
3855     cmpl(cnt2, stride);
3856     jcc(Assembler::less, COMPARE_SMALL_STR);
3857 
3858     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3859       movdqu(vec1, Address(str1, 0));
3860     } else {
3861       pmovzxbw(vec1, Address(str1, 0));
3862     }
3863     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3864     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3865     subptr(cnt2, stride);
3866     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3867     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3868       lea(str1, Address(str1, result, scale));
3869       lea(str2, Address(str2, result, scale));
3870     } else {
3871       lea(str1, Address(str1, result, scale1));
3872       lea(str2, Address(str2, result, scale2));
3873     }
3874     negptr(cnt2);
3875     jmpb(WHILE_HEAD_LABEL);
3876 
3877     bind(COMPARE_SMALL_STR);
3878   } else if (UseSSE42Intrinsics) {
3879     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3880     int pcmpmask = 0x19;
3881     // Set up to compare 8-char (16-byte) vectors,
3882     // starting from the first character again because it has an aligned address.
3883     movl(result, cnt2);
3884     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3885     if (ae == StrIntrinsicNode::LL) {
3886       pcmpmask &= ~0x01;
3887     }
3888     jcc(Assembler::zero, COMPARE_TAIL);
3889     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3890       lea(str1, Address(str1, result, scale));
3891       lea(str2, Address(str2, result, scale));
3892     } else {
3893       lea(str1, Address(str1, result, scale1));
3894       lea(str2, Address(str2, result, scale2));
3895     }
3896     negptr(result);
3897 
3898     // pcmpestri
3899     //   inputs:
3900     //     vec1- substring
3901     //     rax - negative string length (elements count)
3902     //     mem - scanned string
3903     //     rdx - string length (elements count)
3904     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3905     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3906     //   outputs:
3907     //     rcx - first mismatched element index
3908     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3909 
3910     bind(COMPARE_WIDE_VECTORS);
3911     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3912       movdqu(vec1, Address(str1, result, scale));
3913       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3914     } else {
3915       pmovzxbw(vec1, Address(str1, result, scale1));
3916       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3917     }
3918     // After pcmpestri cnt1(rcx) contains mismatched element index
3919 
3920     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3921     addptr(result, stride);
3922     subptr(cnt2, stride);
3923     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3924 
3925     // compare wide vectors tail
3926     testptr(result, result);
3927     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3928 
3929     movl(cnt2, stride);
3930     movl(result, stride);
3931     negptr(result);
3932     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3933       movdqu(vec1, Address(str1, result, scale));
3934       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3935     } else {
3936       pmovzxbw(vec1, Address(str1, result, scale1));
3937       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3938     }
3939     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3940 
3941     // Mismatched characters in the vectors
3942     bind(VECTOR_NOT_EQUAL);
3943     addptr(cnt1, result);
3944     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3945     subl(result, cnt2);
3946     jmpb(POP_LABEL);
3947 
3948     bind(COMPARE_TAIL); // limit is zero
3949     movl(cnt2, result);
3950     // Fallthru to tail compare
3951   }
3952   // Shift str2 and str1 to the end of the arrays, negate min
3953   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3954     lea(str1, Address(str1, cnt2, scale));
3955     lea(str2, Address(str2, cnt2, scale));
3956   } else {
3957     lea(str1, Address(str1, cnt2, scale1));
3958     lea(str2, Address(str2, cnt2, scale2));
3959   }
3960   decrementl(cnt2);  // first character was compared already
3961   negptr(cnt2);
3962 
3963   // Compare the rest of the elements
3964   bind(WHILE_HEAD_LABEL);
3965   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3966   subl(result, cnt1);
3967   jccb(Assembler::notZero, POP_LABEL);
3968   increment(cnt2);
3969   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3970 
3971   // Strings are equal up to min length.  Return the length difference.
3972   bind(LENGTH_DIFF_LABEL);
3973   pop(result);
3974   if (ae == StrIntrinsicNode::UU) {
3975     // Divide diff by 2 to get number of chars
3976     sarl(result, 1);
3977   }
3978   jmpb(DONE_LABEL);
3979 
3980   if (VM_Version::supports_avx512vlbw()) {
3981 
3982     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3983 
3984     kmovql(cnt1, mask);
3985     notq(cnt1);
3986     bsfq(cnt2, cnt1);
3987     if (ae != StrIntrinsicNode::LL) {
3988       // Divide diff by 2 to get number of chars
3989       sarl(cnt2, 1);
3990     }
3991     addq(result, cnt2);
3992     if (ae == StrIntrinsicNode::LL) {
3993       load_unsigned_byte(cnt1, Address(str2, result));
3994       load_unsigned_byte(result, Address(str1, result));
3995     } else if (ae == StrIntrinsicNode::UU) {
3996       load_unsigned_short(cnt1, Address(str2, result, scale));
3997       load_unsigned_short(result, Address(str1, result, scale));
3998     } else {
3999       load_unsigned_short(cnt1, Address(str2, result, scale2));
4000       load_unsigned_byte(result, Address(str1, result, scale1));
4001     }
4002     subl(result, cnt1);
4003     jmpb(POP_LABEL);
4004   }//if (VM_Version::supports_avx512vlbw())
4005 
4006   // Discard the stored length difference
4007   bind(POP_LABEL);
4008   pop(cnt1);
4009 
4010   // That's it
4011   bind(DONE_LABEL);
4012   if(ae == StrIntrinsicNode::UL) {
4013     negl(result);
4014   }
4015 
4016 }
4017 
4018 // Search for a non-ASCII character (negative byte value) in a byte array,
4019 // returning the index of the first such character, otherwise the length
4020 // of the array segment searched.
4021 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4022 //   @IntrinsicCandidate
4023 //   public static int countPositives(byte[] ba, int off, int len) {
4024 //     for (int i = off; i < off + len; i++) {
4025 //       if (ba[i] < 0) {
4026 //         return i - off;
4027 //       }
4028 //     }
4029 //     return len;
4030 //   }
4031 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4032   Register result, Register tmp1,
4033   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4034   // rsi: byte array
4035   // rcx: len
4036   // rax: result
4037   ShortBranchVerifier sbv(this);
4038   assert_different_registers(ary1, len, result, tmp1);
4039   assert_different_registers(vec1, vec2);
4040   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4041 
4042   movl(result, len); // copy
4043   // len == 0
4044   testl(len, len);
4045   jcc(Assembler::zero, DONE);
4046 
4047   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4048     VM_Version::supports_avx512vlbw() &&
4049     VM_Version::supports_bmi2()) {
4050 
4051     Label test_64_loop, test_tail, BREAK_LOOP;
4052     movl(tmp1, len);
4053     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4054 
4055     andl(tmp1, 0x0000003f); // tail count (in bytes) 0x3F
4056     andl(len,  0xffffffc0); // vector count (in bytes)
4057     jccb(Assembler::zero, test_tail);
4058 
4059     lea(ary1, Address(ary1, len, Address::times_1));
4060     negptr(len);
4061 
4062     bind(test_64_loop);
4063     // Check whether any of these 64 bytes is negative
4064     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4065     kortestql(mask1, mask1);
4066     jcc(Assembler::notZero, BREAK_LOOP);
4067 
4068     addptr(len, 64);
4069     jccb(Assembler::notZero, test_64_loop);
4070 
4071     bind(test_tail);
4072     // bail out when there is nothing to be done
4073     testl(tmp1, -1);
4074     jcc(Assembler::zero, DONE);
4075 
4076 
4077     // Check the tail for absence of negatives.
4078     // ~(~0 << tmp1) builds a mask with only the low tmp1 (tail-count) bits set.
4079     {
4080       Register tmp3_aliased = len;
4081       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4082       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4083       notq(tmp3_aliased);
4084       kmovql(mask2, tmp3_aliased);
4085     }
4086 
4087     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4088     ktestq(mask1, mask2);
4089     jcc(Assembler::zero, DONE);
4090 
4091     // do a full check for negative bytes in the tail
4092     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4093                      // ary1 already pointing to the right place
4094     jmpb(TAIL_START);
4095 
4096     bind(BREAK_LOOP);
4097     // At least one byte in the last 64 byte block was negative.
4098     // Set up to look at the last 64 bytes as if they were a tail
4099     lea(ary1, Address(ary1, len, Address::times_1));
4100     addptr(result, len);
4101     // Ignore the very last byte: if all others are positive,
4102     // it must be negative, so we can skip right to the 2+1 byte
4103     // end comparison at this point
4104     orl(result, 63);
4105     movl(len, 63);
4106     // Fallthru to tail compare
4107   } else {
4108 
4109     if (UseAVX >= 2) {
4110       // With AVX2, use 32-byte vector compare
4111       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4112 
4113       // Compare 32-byte vectors
4114       testl(len, 0xffffffe0);   // vector count (in bytes)
4115       jccb(Assembler::zero, TAIL_START);
4116 
4117       andl(len, 0xffffffe0);
4118       lea(ary1, Address(ary1, len, Address::times_1));
4119       negptr(len);
4120 
4121       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4122       movdl(vec2, tmp1);
4123       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4124 
4125       bind(COMPARE_WIDE_VECTORS);
4126       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4127       vptest(vec1, vec2);
4128       jccb(Assembler::notZero, BREAK_LOOP);
4129       addptr(len, 32);
4130       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4131 
4132       testl(result, 0x0000001f);   // any bytes remaining?
4133       jcc(Assembler::zero, DONE);
4134 
4135       // Quick test using the already prepared vector mask
4136       movl(len, result);
4137       andl(len, 0x0000001f);
4138       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4139       vptest(vec1, vec2);
4140       jcc(Assembler::zero, DONE);
4141       // There are zeros, jump to the tail to determine exactly where
4142       jmpb(TAIL_START);
4143 
4144       bind(BREAK_LOOP);
4145       // At least one byte in the last 32-byte vector is negative.
4146       // Set up to look at the last 32 bytes as if they were a tail
4147       lea(ary1, Address(ary1, len, Address::times_1));
4148       addptr(result, len);
4149       // Ignore the very last byte: if all others are positive,
4150       // it must be negative, so we can skip right to the 2+1 byte
4151       // end comparison at this point
4152       orl(result, 31);
4153       movl(len, 31);
4154       // Fallthru to tail compare
4155     } else if (UseSSE42Intrinsics) {
4156       // With SSE4.2, use double quad vector compare
4157       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4158 
4159       // Compare 16-byte vectors
4160       testl(len, 0xfffffff0);   // vector count (in bytes)
4161       jcc(Assembler::zero, TAIL_START);
4162 
4163       andl(len, 0xfffffff0);
4164       lea(ary1, Address(ary1, len, Address::times_1));
4165       negptr(len);
4166 
4167       movl(tmp1, 0x80808080);
4168       movdl(vec2, tmp1);
4169       pshufd(vec2, vec2, 0);
4170 
4171       bind(COMPARE_WIDE_VECTORS);
4172       movdqu(vec1, Address(ary1, len, Address::times_1));
4173       ptest(vec1, vec2);
4174       jccb(Assembler::notZero, BREAK_LOOP);
4175       addptr(len, 16);
4176       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4177 
4178       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4179       jcc(Assembler::zero, DONE);
4180 
4181       // Quick test using the already prepared vector mask
4182       movl(len, result);
4183       andl(len, 0x0000000f);   // tail count (in bytes)
4184       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4185       ptest(vec1, vec2);
4186       jcc(Assembler::zero, DONE);
4187       jmpb(TAIL_START);
4188 
4189       bind(BREAK_LOOP);
4190       // At least one byte in the last 16-byte vector is negative.
4191       // Set up and look at the last 16 bytes as if they were a tail
4192       lea(ary1, Address(ary1, len, Address::times_1));
4193       addptr(result, len);
4194       // Ignore the very last byte: if all others are positive,
4195       // it must be negative, so we can skip right to the 2+1 byte
4196       // end comparison at this point
4197       orl(result, 15);
4198       movl(len, 15);
4199       // Fallthru to tail compare
4200     }
4201   }
4202 
4203   bind(TAIL_START);
4204   // Compare 4-byte vectors
4205   andl(len, 0xfffffffc); // vector count (in bytes)
4206   jccb(Assembler::zero, COMPARE_CHAR);
4207 
4208   lea(ary1, Address(ary1, len, Address::times_1));
4209   negptr(len);
4210 
4211   bind(COMPARE_VECTORS);
4212   movl(tmp1, Address(ary1, len, Address::times_1));
4213   andl(tmp1, 0x80808080);
4214   jccb(Assembler::notZero, TAIL_ADJUST);
4215   addptr(len, 4);
4216   jccb(Assembler::notZero, COMPARE_VECTORS);
4217 
4218   // Compare trailing char (final 2-3 bytes), if any
4219   bind(COMPARE_CHAR);
4220 
4221   testl(result, 0x2);   // tail  char
4222   jccb(Assembler::zero, COMPARE_BYTE);
4223   load_unsigned_short(tmp1, Address(ary1, 0));
4224   andl(tmp1, 0x00008080);
4225   jccb(Assembler::notZero, CHAR_ADJUST);
4226   lea(ary1, Address(ary1, 2));
4227 
4228   bind(COMPARE_BYTE);
4229   testl(result, 0x1);   // tail  byte
4230   jccb(Assembler::zero, DONE);
4231   load_unsigned_byte(tmp1, Address(ary1, 0));
4232   testl(tmp1, 0x00000080);
4233   jccb(Assembler::zero, DONE);
4234   subptr(result, 1);
4235   jmpb(DONE);
4236 
4237   bind(TAIL_ADJUST);
4238   // There are negative bytes in the last 4-byte block.
4239   // Adjust result and check the next three bytes
4240   addptr(result, len);
4241   orl(result, 3);
4242   lea(ary1, Address(ary1, len, Address::times_1));
4243   jmpb(COMPARE_CHAR);
4244 
4245   bind(CHAR_ADJUST);
4246   // We are looking at a char + optional byte tail, and found that one
4247   // of the bytes in the char is negative. Adjust the result, check the
4248   // first byte and readjust if needed.
4249   andl(result, 0xfffffffc);
4250   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4251   jccb(Assembler::notZero, DONE);
4252   addptr(result, 1);
4253 
4254   // That's it
4255   bind(DONE);
4256   if (UseAVX >= 2) {
4257     // clean upper bits of YMM registers
4258     vpxor(vec1, vec1);
4259     vpxor(vec2, vec2);
4260   }
4261 }
4262 
4263 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4264 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4265                                       Register limit, Register result, Register chr,
4266                                       XMMRegister vec1, XMMRegister vec2, bool is_char,
4267                                       KRegister mask, bool expand_ary2) {
4268   // for expand_ary2, limit is the (smaller) size of the second array.
4269   ShortBranchVerifier sbv(this);
4270   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4271 
4272   assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4273          "Expansion only implemented for AVX2");
4274 
4275   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4276   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4277 
4278   Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4279   int scaleIncr = expand_ary2 ? 8 : 16;
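       // With expand_ary2 the bytes of ary2 are zero-extended to 16-bit chars on the
       // fly (vpmovzxbw), so each iteration consumes twice as many bytes from ary1 as
       // from ary2; hence the times_2 scale on ary1 and the halved limit increment.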
4280 
4281   if (is_array_equ) {
4282     // Check the input args
4283     cmpoop(ary1, ary2);
4284     jcc(Assembler::equal, TRUE_LABEL);
4285 
4286     // Need additional checks for arrays_equals.
4287     testptr(ary1, ary1);
4288     jcc(Assembler::zero, FALSE_LABEL);
4289     testptr(ary2, ary2);
4290     jcc(Assembler::zero, FALSE_LABEL);
4291 
4292     // Check the lengths
4293     movl(limit, Address(ary1, length_offset));
4294     cmpl(limit, Address(ary2, length_offset));
4295     jcc(Assembler::notEqual, FALSE_LABEL);
4296   }
4297 
4298   // count == 0
4299   testl(limit, limit);
4300   jcc(Assembler::zero, TRUE_LABEL);
4301 
4302   if (is_array_equ) {
4303     // Load array address
4304     lea(ary1, Address(ary1, base_offset));
4305     lea(ary2, Address(ary2, base_offset));
4306   }
4307 
4308   if (is_array_equ && is_char) {
4309     // arrays_equals when used for char[].
4310     shll(limit, 1);      // byte count != 0
4311   }
4312   movl(result, limit); // copy
4313 
4314   if (UseAVX >= 2) {
4315     // With AVX2, use 32-byte vector compare
4316     Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4317 
4318     // Compare 32-byte vectors
4319     if (expand_ary2) {
4320       andl(result, 0x0000000f);  //   tail count (in bytes)
4321       andl(limit, 0xfffffff0);   // vector count (in bytes)
4322       jcc(Assembler::zero, COMPARE_TAIL);
4323     } else {
4324       andl(result, 0x0000001f);  //   tail count (in bytes)
4325       andl(limit, 0xffffffe0);   // vector count (in bytes)
4326       jcc(Assembler::zero, COMPARE_TAIL_16);
4327     }
4328 
4329     lea(ary1, Address(ary1, limit, scaleFactor));
4330     lea(ary2, Address(ary2, limit, Address::times_1));
4331     negptr(limit);
4332 
4333     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4334       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4335 
4336       cmpl(limit, -64);
4337       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4338 
4339       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4340 
4341       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4342       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4343       kortestql(mask, mask);
4344       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4345       addptr(limit, 64);  // update since we already compared at this addr
4346       cmpl(limit, -64);
4347       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4348 
4349       // At this point we may still need to compare -limit+result bytes.
4350       // We could execute the next two instructions and just continue via the non-wide path:
4351       //  cmpl(limit, 0);
4352       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4353       // But since we stopped at the points ary{1,2}+limit, which are
4354       // no farther than 64 bytes from the ends of the arrays ary{1,2}+result
4355       // (|limit| <= 32 and result < 32),
4356       // we may just compare the last 64 bytes.
4357       //
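           // (Re-reading a 64-byte suffix that overlaps bytes already verified merely
           // re-tests known-equal bytes, which is harmless for an equality check.)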
4358       addptr(result, -64);   // it is safe because we just came from this area
4359       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4360       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4361       kortestql(mask, mask);
4362       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4363 
4364       jmp(TRUE_LABEL);
4365 
4366       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4367 
4368     }//if (VM_Version::supports_avx512vlbw())
4369 
4370     bind(COMPARE_WIDE_VECTORS);
4371     vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4372     if (expand_ary2) {
4373       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4374     } else {
4375       vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4376     }
4377     vpxor(vec1, vec2);
4378 
4379     vptest(vec1, vec1);
4380     jcc(Assembler::notZero, FALSE_LABEL);
4381     addptr(limit, scaleIncr * 2);
4382     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4383 
4384     testl(result, result);
4385     jcc(Assembler::zero, TRUE_LABEL);
4386 
4387     vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4388     if (expand_ary2) {
4389       vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4390     } else {
4391       vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4392     }
4393     vpxor(vec1, vec2);
4394 
4395     vptest(vec1, vec1);
4396     jcc(Assembler::notZero, FALSE_LABEL);
4397     jmp(TRUE_LABEL);
4398 
4399     bind(COMPARE_TAIL_16); // limit is zero
4400     movl(limit, result);
4401 
4402     // Compare 16-byte chunks
4403     andl(result, 0x0000000f);  //   tail count (in bytes)
4404     andl(limit, 0xfffffff0);   // vector count (in bytes)
4405     jcc(Assembler::zero, COMPARE_TAIL);
4406 
4407     lea(ary1, Address(ary1, limit, scaleFactor));
4408     lea(ary2, Address(ary2, limit, Address::times_1));
4409     negptr(limit);
4410 
4411     bind(COMPARE_WIDE_VECTORS_16);
4412     movdqu(vec1, Address(ary1, limit, scaleFactor));
4413     if (expand_ary2) {
4414       vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4415     } else {
4416       movdqu(vec2, Address(ary2, limit, Address::times_1));
4417     }
4418     pxor(vec1, vec2);
4419 
4420     ptest(vec1, vec1);
4421     jcc(Assembler::notZero, FALSE_LABEL);
4422     addptr(limit, scaleIncr);
4423     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4424 
4425     bind(COMPARE_TAIL); // limit is zero
4426     movl(limit, result);
4427     // Fallthru to tail compare
4428   } else if (UseSSE42Intrinsics) {
4429     // With SSE4.2, use double quad vector compare
4430     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4431 
4432     // Compare 16-byte vectors
4433     andl(result, 0x0000000f);  //   tail count (in bytes)
4434     andl(limit, 0xfffffff0);   // vector count (in bytes)
4435     jcc(Assembler::zero, COMPARE_TAIL);
4436 
4437     lea(ary1, Address(ary1, limit, Address::times_1));
4438     lea(ary2, Address(ary2, limit, Address::times_1));
4439     negptr(limit);
4440 
4441     bind(COMPARE_WIDE_VECTORS);
4442     movdqu(vec1, Address(ary1, limit, Address::times_1));
4443     movdqu(vec2, Address(ary2, limit, Address::times_1));
4444     pxor(vec1, vec2);
4445 
4446     ptest(vec1, vec1);
4447     jcc(Assembler::notZero, FALSE_LABEL);
4448     addptr(limit, 16);
4449     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4450 
4451     testl(result, result);
4452     jcc(Assembler::zero, TRUE_LABEL);
4453 
4454     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4455     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4456     pxor(vec1, vec2);
4457 
4458     ptest(vec1, vec1);
4459     jccb(Assembler::notZero, FALSE_LABEL);
4460     jmpb(TRUE_LABEL);
4461 
4462     bind(COMPARE_TAIL); // limit is zero
4463     movl(limit, result);
4464     // Fallthru to tail compare
4465   }
4466 
4467   // Compare 4-byte vectors
4468   if (expand_ary2) {
4469     testl(result, result);
4470     jccb(Assembler::zero, TRUE_LABEL);
4471   } else {
4472     andl(limit, 0xfffffffc); // vector count (in bytes)
4473     jccb(Assembler::zero, COMPARE_CHAR);
4474   }
4475 
4476   lea(ary1, Address(ary1, limit, scaleFactor));
4477   lea(ary2, Address(ary2, limit, Address::times_1));
4478   negptr(limit);
4479 
4480   bind(COMPARE_VECTORS);
4481   if (expand_ary2) {
4482     // There are no "vector" operations for bytes to shorts
4483     movzbl(chr, Address(ary2, limit, Address::times_1));
4484     cmpw(Address(ary1, limit, Address::times_2), chr);
4485     jccb(Assembler::notEqual, FALSE_LABEL);
4486     addptr(limit, 1);
4487     jcc(Assembler::notZero, COMPARE_VECTORS);
4488     jmp(TRUE_LABEL);
4489   } else {
4490     movl(chr, Address(ary1, limit, Address::times_1));
4491     cmpl(chr, Address(ary2, limit, Address::times_1));
4492     jccb(Assembler::notEqual, FALSE_LABEL);
4493     addptr(limit, 4);
4494     jcc(Assembler::notZero, COMPARE_VECTORS);
4495   }
4496 
4497   // Compare trailing char (final 2 bytes), if any
4498   bind(COMPARE_CHAR);
4499   testl(result, 0x2);   // tail  char
4500   jccb(Assembler::zero, COMPARE_BYTE);
4501   load_unsigned_short(chr, Address(ary1, 0));
4502   load_unsigned_short(limit, Address(ary2, 0));
4503   cmpl(chr, limit);
4504   jccb(Assembler::notEqual, FALSE_LABEL);
4505 
4506   if (is_array_equ && is_char) {
4507     bind(COMPARE_BYTE);
4508   } else {
4509     lea(ary1, Address(ary1, 2));
4510     lea(ary2, Address(ary2, 2));
4511 
4512     bind(COMPARE_BYTE);
4513     testl(result, 0x1);   // tail  byte
4514     jccb(Assembler::zero, TRUE_LABEL);
4515     load_unsigned_byte(chr, Address(ary1, 0));
4516     load_unsigned_byte(limit, Address(ary2, 0));
4517     cmpl(chr, limit);
4518     jccb(Assembler::notEqual, FALSE_LABEL);
4519   }
4520   bind(TRUE_LABEL);
4521   movl(result, 1);   // return true
4522   jmpb(DONE);
4523 
4524   bind(FALSE_LABEL);
4525   xorl(result, result); // return false
4526 
4527   // That's it
4528   bind(DONE);
4529   if (UseAVX >= 2) {
4530     // clean upper bits of YMM registers
4531     vpxor(vec1, vec1);
4532     vpxor(vec2, vec2);
4533   }
4534 }
4535 
4536 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4537 #define __ masm.
4538   Register dst = stub.data<0>();
4539   XMMRegister src = stub.data<1>();
4540   address target = stub.data<2>();
4541   __ bind(stub.entry());
4542   __ subptr(rsp, 8);
4543   __ movdbl(Address(rsp), src);
4544   __ call(RuntimeAddress(target));
4545   __ pop(dst);
4546   __ jmp(stub.continuation());
4547 #undef __
4548 }
4549 
4550 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4551   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4552   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4553 
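       // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 for
       // 32-bit results, 0x8000000000000000 for 64-bit results) when the source is
       // NaN or out of range, so a result equal to that value takes the fix-up stub.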
4554   address slowpath_target;
4555   if (dst_bt == T_INT) {
4556     if (src_bt == T_FLOAT) {
4557       cvttss2sil(dst, src);
4558       cmpl(dst, 0x80000000);
4559       slowpath_target = StubRoutines::x86::f2i_fixup();
4560     } else {
4561       cvttsd2sil(dst, src);
4562       cmpl(dst, 0x80000000);
4563       slowpath_target = StubRoutines::x86::d2i_fixup();
4564     }
4565   } else {
4566     if (src_bt == T_FLOAT) {
4567       cvttss2siq(dst, src);
4568       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4569       slowpath_target = StubRoutines::x86::f2l_fixup();
4570     } else {
4571       cvttsd2siq(dst, src);
4572       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4573       slowpath_target = StubRoutines::x86::d2l_fixup();
4574     }
4575   }
4576 
4577   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4578   jcc(Assembler::equal, stub->entry());
4579   bind(stub->continuation());
4580 }
4581 
4582 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4583                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4584   switch(ideal_opc) {
4585     case Op_LShiftVS:
4586       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4587     case Op_LShiftVI:
4588       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4589     case Op_LShiftVL:
4590       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4591     case Op_RShiftVS:
4592       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4593     case Op_RShiftVI:
4594       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4595     case Op_RShiftVL:
4596       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4597     case Op_URShiftVS:
4598       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4599     case Op_URShiftVI:
4600       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4601     case Op_URShiftVL:
4602       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4603     case Op_RotateRightV:
4604       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4605     case Op_RotateLeftV:
4606       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4607     default:
4608       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4609       break;
4610   }
4611 }
4612 
4613 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4614                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4615   if (is_unsigned) {
4616     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4617   } else {
4618     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4619   }
4620 }
4621 
4622 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4623                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4624   switch (elem_bt) {
4625     case T_BYTE:
4626       if (ideal_opc == Op_SaturatingAddV) {
4627         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4628       } else {
4629         assert(ideal_opc == Op_SaturatingSubV, "");
4630         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4631       }
4632       break;
4633     case T_SHORT:
4634       if (ideal_opc == Op_SaturatingAddV) {
4635         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4636       } else {
4637         assert(ideal_opc == Op_SaturatingSubV, "");
4638         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4639       }
4640       break;
4641     default:
4642       fatal("Unsupported type %s", type2name(elem_bt));
4643       break;
4644   }
4645 }
4646 
4647 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4648                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4649   switch (elem_bt) {
4650     case T_BYTE:
4651       if (ideal_opc == Op_SaturatingAddV) {
4652         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4653       } else {
4654         assert(ideal_opc == Op_SaturatingSubV, "");
4655         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4656       }
4657       break;
4658     case T_SHORT:
4659       if (ideal_opc == Op_SaturatingAddV) {
4660         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4661       } else {
4662         assert(ideal_opc == Op_SaturatingSubV, "");
4663         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4664       }
4665       break;
4666     default:
4667       fatal("Unsupported type %s", type2name(elem_bt));
4668       break;
4669   }
4670 }
4671 
4672 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4673                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4674   if (is_unsigned) {
4675     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4676   } else {
4677     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4678   }
4679 }
4680 
4681 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4682                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4683   switch (elem_bt) {
4684     case T_BYTE:
4685       if (ideal_opc == Op_SaturatingAddV) {
4686         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4687       } else {
4688         assert(ideal_opc == Op_SaturatingSubV, "");
4689         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4690       }
4691       break;
4692     case T_SHORT:
4693       if (ideal_opc == Op_SaturatingAddV) {
4694         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4695       } else {
4696         assert(ideal_opc == Op_SaturatingSubV, "");
4697         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4698       }
4699       break;
4700     default:
4701       fatal("Unsupported type %s", type2name(elem_bt));
4702       break;
4703   }
4704 }
4705 
4706 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4707                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4708   switch (elem_bt) {
4709     case T_BYTE:
4710       if (ideal_opc == Op_SaturatingAddV) {
4711         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4712       } else {
4713         assert(ideal_opc == Op_SaturatingSubV, "");
4714         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4715       }
4716       break;
4717     case T_SHORT:
4718       if (ideal_opc == Op_SaturatingAddV) {
4719         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4720       } else {
4721         assert(ideal_opc == Op_SaturatingSubV, "");
4722         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4723       }
4724       break;
4725     default:
4726       fatal("Unsupported type %s", type2name(elem_bt));
4727       break;
4728   }
4729 }
4730 
4731 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4732                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4733                                     bool is_varshift) {
4734   switch (ideal_opc) {
4735     case Op_AddVB:
4736       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4737     case Op_AddVS:
4738       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4739     case Op_AddVI:
4740       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4741     case Op_AddVL:
4742       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4743     case Op_AddVF:
4744       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4745     case Op_AddVD:
4746       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_SubVB:
4748       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_SubVS:
4750       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_SubVI:
4752       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_SubVL:
4754       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_SubVF:
4756       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_SubVD:
4758       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_MulVS:
4760       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_MulVI:
4762       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_MulVL:
4764       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_MulVF:
4766       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_MulVD:
4768       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_DivVF:
4770       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_DivVD:
4772       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_SqrtVF:
4774       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_SqrtVD:
4776       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_AbsVB:
4778       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4779     case Op_AbsVS:
4780       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4781     case Op_AbsVI:
4782       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4783     case Op_AbsVL:
4784       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4785     case Op_FmaVF:
4786       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_FmaVD:
4788       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_VectorRearrange:
4790       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4791     case Op_LShiftVS:
4792       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4793     case Op_LShiftVI:
4794       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4795     case Op_LShiftVL:
4796       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4797     case Op_RShiftVS:
4798       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4799     case Op_RShiftVI:
4800       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4801     case Op_RShiftVL:
4802       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4803     case Op_URShiftVS:
4804       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4805     case Op_URShiftVI:
4806       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4807     case Op_URShiftVL:
4808       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4809     case Op_RotateLeftV:
4810       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_RotateRightV:
4812       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_MaxV:
4814       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_MinV:
4816       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_UMinV:
4818       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_UMaxV:
4820       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_XorV:
4822       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_OrV:
4824       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_AndV:
4826       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4827     default:
4828       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4829       break;
4830   }
4831 }
4832 
4833 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4834                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4835   switch (ideal_opc) {
4836     case Op_AddVB:
4837       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4838     case Op_AddVS:
4839       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_AddVI:
4841       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4842     case Op_AddVL:
4843       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4844     case Op_AddVF:
4845       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4846     case Op_AddVD:
4847       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4848     case Op_SubVB:
4849       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4850     case Op_SubVS:
4851       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4852     case Op_SubVI:
4853       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4854     case Op_SubVL:
4855       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4856     case Op_SubVF:
4857       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4858     case Op_SubVD:
4859       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4860     case Op_MulVS:
4861       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_MulVI:
4863       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_MulVL:
4865       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_MulVF:
4867       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_MulVD:
4869       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_DivVF:
4871       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_DivVD:
4873       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4874     case Op_FmaVF:
4875       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4876     case Op_FmaVD:
4877       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4878     case Op_MaxV:
4879       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4880     case Op_MinV:
4881       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4882     case Op_UMaxV:
4883       evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_UMinV:
4885       evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_XorV:
4887       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_OrV:
4889       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_AndV:
4891       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4892     default:
4893       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4894       break;
4895   }
4896 }
4897 
4898 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4899                                   KRegister src1, KRegister src2) {
4900   BasicType etype = T_ILLEGAL;
4901   switch(mask_len) {
4902     case 2:
4903     case 4:
4904     case 8:  etype = T_BYTE; break;
4905     case 16: etype = T_SHORT; break;
4906     case 32: etype = T_INT; break;
4907     case 64: etype = T_LONG; break;
4908     default: fatal("Unsupported type"); break;
4909   }
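       // etype only selects the k-register operand width here; e.g. a 16-lane mask
       // maps to T_SHORT and is expected to use the word-sized forms (kandw/korw/kxorw)
       // of the kand/kor/kxor helpers below.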
4910   assert(etype != T_ILLEGAL, "");
4911   switch(ideal_opc) {
4912     case Op_AndVMask:
4913       kand(etype, dst, src1, src2); break;
4914     case Op_OrVMask:
4915       kor(etype, dst, src1, src2); break;
4916     case Op_XorVMask:
4917       kxor(etype, dst, src1, src2); break;
4918     default:
4919       fatal("Unsupported masked operation"); break;
4920   }
4921 }
4922 
4923 /*
4924  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4925  * If src is NaN, the result is 0.
4926  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4927  * the result is equal to the value of Integer.MIN_VALUE.
4928  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4929  * the result is equal to the value of Integer.MAX_VALUE.
4930  */
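     // For example, under Java (int) cast semantics: (int)Float.NaN == 0,
     // (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE (0x80000000), and
     // (int)Float.POSITIVE_INFINITY == Integer.MAX_VALUE (0x7FFFFFFF).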
4931 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4932                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4933                                                                    Register rscratch, AddressLiteral float_sign_flip,
4934                                                                    int vec_enc) {
4935   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4936   Label done;
4937   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4938   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4939   vptest(xtmp2, xtmp2, vec_enc);
4940   jccb(Assembler::equal, done);
4941 
4942   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4943   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4944 
4945   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4946   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4947   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4948 
4949   // Recompute the mask for remaining special value.
4950   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4951   // Extract SRC values corresponding to TRUE mask lanes.
4952   vpand(xtmp4, xtmp2, src, vec_enc);
4953   // Flip mask bits so that the MSB of the MASK lanes corresponding to +ve special
4954   // values is set.
4955   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4956 
4957   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4958   bind(done);
4959 }
4960 
4961 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4962                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4963                                                                     Register rscratch, AddressLiteral float_sign_flip,
4964                                                                     int vec_enc) {
4965   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4966   Label done;
4967   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4968   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4969   kortestwl(ktmp1, ktmp1);
4970   jccb(Assembler::equal, done);
4971 
4972   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4973   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4974   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4975 
4976   kxorwl(ktmp1, ktmp1, ktmp2);
4977   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4978   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4979   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4980   bind(done);
4981 }
4982 
4983 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4984                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4985                                                                      Register rscratch, AddressLiteral double_sign_flip,
4986                                                                      int vec_enc) {
4987   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4988 
4989   Label done;
4990   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4991   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4992   kortestwl(ktmp1, ktmp1);
4993   jccb(Assembler::equal, done);
4994 
4995   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4996   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4997   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4998 
4999   kxorwl(ktmp1, ktmp1, ktmp2);
5000   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5001   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5002   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5003   bind(done);
5004 }
5005 
5006 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5007                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5008                                                                      Register rscratch, AddressLiteral float_sign_flip,
5009                                                                      int vec_enc) {
5010   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5011   Label done;
5012   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5013   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5014   kortestwl(ktmp1, ktmp1);
5015   jccb(Assembler::equal, done);
5016 
5017   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5018   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5019   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5020 
5021   kxorwl(ktmp1, ktmp1, ktmp2);
5022   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5023   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5024   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5025   bind(done);
5026 }
5027 
5028 /*
5029  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
5030  * If src is NaN, the result is 0.
5031  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5032  * the result is equal to the value of Long.MIN_VALUE.
5033  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5034  * the result is equal to the value of Long.MAX_VALUE.
5035  */
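     // For example, under Java (long) cast semantics: (long)Double.NaN == 0L,
     // (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE, and
     // (long)Double.POSITIVE_INFINITY == Long.MAX_VALUE.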
5036 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5037                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5038                                                                       Register rscratch, AddressLiteral double_sign_flip,
5039                                                                       int vec_enc) {
5040   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5041 
5042   Label done;
5043   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5044   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5045   kortestwl(ktmp1, ktmp1);
5046   jccb(Assembler::equal, done);
5047 
5048   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5049   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5050   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5051 
5052   kxorwl(ktmp1, ktmp1, ktmp2);
5053   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5054   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5055   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5056   bind(done);
5057 }
5058 
5059 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5060                                                              XMMRegister xtmp, int index, int vec_enc) {
5061   assert(vec_enc < Assembler::AVX_512bit, "");
5062   if (vec_enc == Assembler::AVX_256bit) {
5063     vextractf128_high(xtmp, src);
5064     vshufps(dst, src, xtmp, index, vec_enc);
5065   } else {
5066     vshufps(dst, src, zero, index, vec_enc);
5067   }
5068 }
5069 
5070 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5071                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5072                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5073   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5074 
5075   Label done;
5076   // Compare the destination lanes with float_sign_flip
5077   // value to get mask for all special values.
5078   movdqu(xtmp1, float_sign_flip, rscratch);
5079   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5080   ptest(xtmp2, xtmp2);
5081   jccb(Assembler::equal, done);
5082 
5083   // Flip float_sign_flip to get max integer value.
5084   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5085   pxor(xtmp1, xtmp4);
5086 
5087   // Set destination lanes corresponding to unordered source lanes to zero.
5088   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5089   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5090 
5091   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5092   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5093   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5094 
5095   // Recompute the mask for remaining special value.
5096   pxor(xtmp2, xtmp3);
5097   // Extract mask corresponding to non-negative source lanes.
5098   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5099 
5100   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5101   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5102   pand(xtmp3, xtmp2);
5103 
5104   // Replace destination lanes holding the special value (0x80000000) with max int
5105   // if the corresponding source lane holds a +ve value.
5106   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5107   bind(done);
5108 }
5109 
5111 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5112                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5113   switch(to_elem_bt) {
5114     case T_SHORT:
5115       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5116       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5117       vpackusdw(dst, dst, zero, vec_enc);
5118       if (vec_enc == Assembler::AVX_256bit) {
5119         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5120       }
5121       break;
5122     case  T_BYTE:
5123       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5124       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5125       vpackusdw(dst, dst, zero, vec_enc);
5126       if (vec_enc == Assembler::AVX_256bit) {
5127         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5128       }
5129       vpackuswb(dst, dst, zero, vec_enc);
5130       break;
5131     default: assert(false, "%s", type2name(to_elem_bt));
5132   }
5133 }
5134 
5135 /*
5136  * Algorithm for vector D2L and F2I conversions:
5137  * a) Perform vector D2L/F2I cast.
5138  * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
5139  *    A lane holding that value signifies that the source could be any of the special
5140  *    floating point values (NaN, -Inf, Inf, Max, -Min).
5141  * c) Set the destination to zero if the source is a NaN value.
5142  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5143  */
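     // Step (b) works because cvttps2dq/cvttpd2qq write the "integer indefinite"
     // value (0x80000000 / 0x8000000000000000) for NaN and out-of-range inputs, so
     // only lanes equal to that value can require the special-case fix-up.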
5144 
5145 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5146                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5147                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5148   int to_elem_sz = type2aelembytes(to_elem_bt);
5149   assert(to_elem_sz <= 4, "");
5150   vcvttps2dq(dst, src, vec_enc);
5151   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5152   if (to_elem_sz < 4) {
5153     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5154     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5155   }
5156 }
5157 
5158 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5159                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5160                                             Register rscratch, int vec_enc) {
5161   int to_elem_sz = type2aelembytes(to_elem_bt);
5162   assert(to_elem_sz <= 4, "");
5163   vcvttps2dq(dst, src, vec_enc);
5164   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5165   switch(to_elem_bt) {
5166     case T_INT:
5167       break;
5168     case T_SHORT:
5169       evpmovdw(dst, dst, vec_enc);
5170       break;
5171     case T_BYTE:
5172       evpmovdb(dst, dst, vec_enc);
5173       break;
5174     default: assert(false, "%s", type2name(to_elem_bt));
5175   }
5176 }
5177 
5178 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5179                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5180                                             Register rscratch, int vec_enc) {
5181   evcvttps2qq(dst, src, vec_enc);
5182   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5183 }
5184 
5185 // Handling for downcasting from double to integer or sub-word types on AVX2.
5186 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5187                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5188                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5189   int to_elem_sz = type2aelembytes(to_elem_bt);
5190   assert(to_elem_sz < 8, "");
5191   vcvttpd2dq(dst, src, vec_enc);
5192   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5193                                               float_sign_flip, vec_enc);
5194   if (to_elem_sz < 4) {
5195     // xtmp4 holds all zero lanes.
5196     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5197   }
5198 }
5199 
5200 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5201                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5202                                             KRegister ktmp2, AddressLiteral sign_flip,
5203                                             Register rscratch, int vec_enc) {
5204   if (VM_Version::supports_avx512dq()) {
5205     evcvttpd2qq(dst, src, vec_enc);
5206     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5207     switch(to_elem_bt) {
5208       case T_LONG:
5209         break;
5210       case T_INT:
5211         evpmovsqd(dst, dst, vec_enc);
5212         break;
5213       case T_SHORT:
5214         evpmovsqd(dst, dst, vec_enc);
5215         evpmovdw(dst, dst, vec_enc);
5216         break;
5217       case T_BYTE:
5218         evpmovsqd(dst, dst, vec_enc);
5219         evpmovdb(dst, dst, vec_enc);
5220         break;
5221       default: assert(false, "%s", type2name(to_elem_bt));
5222     }
5223   } else {
5224     assert(type2aelembytes(to_elem_bt) <= 4, "");
5225     vcvttpd2dq(dst, src, vec_enc);
5226     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5227     switch(to_elem_bt) {
5228       case T_INT:
5229         break;
5230       case T_SHORT:
5231         evpmovdw(dst, dst, vec_enc);
5232         break;
5233       case T_BYTE:
5234         evpmovdb(dst, dst, vec_enc);
5235         break;
5236       default: assert(false, "%s", type2name(to_elem_bt));
5237     }
5238   }
5239 }
5240 
5241 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5242                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5243                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5244   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5245   // and restore the original MXCSR.RC mode afterwards.
5246   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5247 
5248   mov64(tmp, julong_cast(0.5L));
5249   evpbroadcastq(xtmp1, tmp, vec_enc);
5250   vaddpd(xtmp1, src , xtmp1, vec_enc);
5251   evcvtpd2qq(dst, xtmp1, vec_enc);
5252   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5253                                                 double_sign_flip, vec_enc);
5254 
5255   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5256 }
5257 
5258 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5259                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5260                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5261   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5262   // and restore the original MXCSR.RC mode afterwards.
5263   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5264 
5265   movl(tmp, jint_cast(0.5));
5266   movq(xtmp1, tmp);
5267   vbroadcastss(xtmp1, xtmp1, vec_enc);
5268   vaddps(xtmp1, src , xtmp1, vec_enc);
5269   vcvtps2dq(dst, xtmp1, vec_enc);
5270   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5271                                               float_sign_flip, vec_enc);
5272 
5273   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5274 }
5275 
5276 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5277                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5278                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5279   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
5280   // and restore the original MXCSR.RC mode afterwards.
5281   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5282 
5283   movl(tmp, jint_cast(0.5));
5284   movq(xtmp1, tmp);
5285   vbroadcastss(xtmp1, xtmp1, vec_enc);
5286   vaddps(xtmp1, src , xtmp1, vec_enc);
5287   vcvtps2dq(dst, xtmp1, vec_enc);
5288   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5289 
5290   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5291 }
5292 
5293 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5294                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5295   switch (from_elem_bt) {
5296     case T_BYTE:
5297       switch (to_elem_bt) {
5298         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5299         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5300         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5301         default: ShouldNotReachHere();
5302       }
5303       break;
5304     case T_SHORT:
5305       switch (to_elem_bt) {
5306         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5307         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5308         default: ShouldNotReachHere();
5309       }
5310       break;
5311     case T_INT:
5312       assert(to_elem_bt == T_LONG, "");
5313       vpmovzxdq(dst, src, vlen_enc);
5314       break;
5315     default:
5316       ShouldNotReachHere();
5317   }
5318 }
5319 
5320 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5321                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5322   switch (from_elem_bt) {
5323     case T_BYTE:
5324       switch (to_elem_bt) {
5325         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5326         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5327         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5328         default: ShouldNotReachHere();
5329       }
5330       break;
5331     case T_SHORT:
5332       switch (to_elem_bt) {
5333         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5334         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5335         default: ShouldNotReachHere();
5336       }
5337       break;
5338     case T_INT:
5339       assert(to_elem_bt == T_LONG, "");
5340       vpmovsxdq(dst, src, vlen_enc);
5341       break;
5342     default:
5343       ShouldNotReachHere();
5344   }
5345 }
5346 
5347 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5348                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5349   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5350   assert(vlen_enc != AVX_512bit, "");
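       // Mask lanes are all-zeros or all-ones, so sign-extending moves (vpmovsx*)
       // widen them and signed saturating packs (vpackssdw/vpacksswb) narrow them
       // without disturbing the all-0/all-1 pattern.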
5351 
5352   int dst_bt_size = type2aelembytes(dst_bt);
5353   int src_bt_size = type2aelembytes(src_bt);
5354   if (dst_bt_size > src_bt_size) {
5355     switch (dst_bt_size / src_bt_size) {
5356       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5357       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5358       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5359       default: ShouldNotReachHere();
5360     }
5361   } else {
5362     assert(dst_bt_size < src_bt_size, "");
5363     switch (src_bt_size / dst_bt_size) {
5364       case 2: {
5365         if (vlen_enc == AVX_128bit) {
5366           vpacksswb(dst, src, src, vlen_enc);
5367         } else {
5368           vpacksswb(dst, src, src, vlen_enc);
5369           vpermq(dst, dst, 0x08, vlen_enc);
5370         }
5371         break;
5372       }
5373       case 4: {
5374         if (vlen_enc == AVX_128bit) {
5375           vpackssdw(dst, src, src, vlen_enc);
5376           vpacksswb(dst, dst, dst, vlen_enc);
5377         } else {
5378           vpackssdw(dst, src, src, vlen_enc);
5379           vpermq(dst, dst, 0x08, vlen_enc);
5380           vpacksswb(dst, dst, dst, AVX_128bit);
5381         }
5382         break;
5383       }
5384       case 8: {
5385         if (vlen_enc == AVX_128bit) {
5386           vpshufd(dst, src, 0x08, vlen_enc);
5387           vpackssdw(dst, dst, dst, vlen_enc);
5388           vpacksswb(dst, dst, dst, vlen_enc);
5389         } else {
5390           vpshufd(dst, src, 0x08, vlen_enc);
5391           vpermq(dst, dst, 0x08, vlen_enc);
5392           vpackssdw(dst, dst, dst, AVX_128bit);
5393           vpacksswb(dst, dst, dst, AVX_128bit);
5394         }
5395         break;
5396       }
5397       default: ShouldNotReachHere();
5398     }
5399   }
5400 }
5401 
5402 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5403                                    bool merge, BasicType bt, int vlen_enc) {
5404   if (bt == T_INT) {
5405     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5406   } else {
5407     assert(bt == T_LONG, "");
5408     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5409   }
5410 }
5411 
5412 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5413                                    bool merge, BasicType bt, int vlen_enc) {
5414   if (bt == T_INT) {
5415     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5416   } else {
5417     assert(bt == T_LONG, "");
5418     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5419   }
5420 }
5421 
5422 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5423                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5424                                                int vec_enc) {
5425   int index = 0;
5426   int vindex = 0;
5427   mov64(rtmp1, 0x0101010101010101L);
5428   pdepq(rtmp1, src, rtmp1);
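       // pdep deposits the low mask bits of src into bit 0 of each byte; e.g. a mask
       // of 0b00000101 becomes the qword 0x0000000000010001 (bytes 01 00 01 00 ...).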
5429   if (mask_len > 8) {
5430     movq(rtmp2, src);
5431     vpxor(xtmp, xtmp, xtmp, vec_enc);
5432     movq(xtmp, rtmp1);
5433   }
5434   movq(dst, rtmp1);
5435 
5436   mask_len -= 8;
5437   while (mask_len > 0) {
5438     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5439     index++;
5440     if ((index % 2) == 0) {
5441       pxor(xtmp, xtmp);
5442     }
5443     mov64(rtmp1, 0x0101010101010101L);
5444     shrq(rtmp2, 8);
5445     pdepq(rtmp1, rtmp2, rtmp1);
5446     pinsrq(xtmp, rtmp1, index % 2);
5447     vindex = index / 2;
5448     if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5451       if (index % 2) {
5452         vinsertf128(dst, dst, xtmp, vindex);
5453       }
5454     } else {
5455       vmovdqu(dst, xtmp);
5456     }
5457     mask_len -= 8;
5458   }
5459 }
5460 
5461 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5462   switch(opc) {
5463     case Op_VectorMaskTrueCount:
5464       popcntq(dst, tmp);
5465       break;
5466     case Op_VectorMaskLastTrue:
5467       if (VM_Version::supports_lzcnt()) {
5468         lzcntq(tmp, tmp);
5469         movl(dst, 63);
5470         subl(dst, tmp);
5471       } else {
5472         movl(dst, -1);
5473         bsrq(tmp, tmp);
5474         cmov32(Assembler::notZero, dst, tmp);
5475       }
5476       break;
5477     case Op_VectorMaskFirstTrue:
5478       if (VM_Version::supports_bmi1()) {
5479         if (masklen < 32) {
5480           orl(tmp, 1 << masklen);
5481           tzcntl(dst, tmp);
5482         } else if (masklen == 32) {
5483           tzcntl(dst, tmp);
5484         } else {
5485           assert(masklen == 64, "");
5486           tzcntq(dst, tmp);
5487         }
5488       } else {
5489         if (masklen < 32) {
5490           orl(tmp, 1 << masklen);
5491           bsfl(dst, tmp);
5492         } else {
5493           assert(masklen == 32 || masklen == 64, "");
5494           movl(dst, masklen);
5495           if (masklen == 32)  {
5496             bsfl(tmp, tmp);
5497           } else {
5498             bsfq(tmp, tmp);
5499           }
5500           cmov32(Assembler::notZero, dst, tmp);
5501         }
5502       }
5503       break;
5504     case Op_VectorMaskToLong:
5505       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5506       break;
5507     default: assert(false, "Unhandled mask operation");
5508   }
5509 }
5510 
5511 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5512                                               int masklen, int masksize, int vec_enc) {
5513   assert(VM_Version::supports_popcnt(), "");
5514 
  if (VM_Version::supports_avx512bw()) {
5516     kmovql(tmp, mask);
5517   } else {
5518     assert(masklen <= 16, "");
5519     kmovwl(tmp, mask);
5520   }
5521 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5524   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5525     andq(tmp, (1 << masklen) - 1);
5526   }
5527 
5528   vector_mask_operation_helper(opc, dst, tmp, masklen);
5529 }
5530 
5531 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5532                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5533   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5534          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5535   assert(VM_Version::supports_popcnt(), "");
5536 
5537   bool need_clip = false;
5538   switch(bt) {
5539     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5541       vpxor(xtmp, xtmp, xtmp, vec_enc);
5542       vpsubb(xtmp, xtmp, mask, vec_enc);
5543       vpmovmskb(tmp, xtmp, vec_enc);
5544       need_clip = masklen < 16;
5545       break;
5546     case T_BYTE:
5547       vpmovmskb(tmp, mask, vec_enc);
5548       need_clip = masklen < 16;
5549       break;
5550     case T_SHORT:
5551       vpacksswb(xtmp, mask, mask, vec_enc);
5552       if (masklen >= 16) {
5553         vpermpd(xtmp, xtmp, 8, vec_enc);
5554       }
5555       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5556       need_clip = masklen < 16;
5557       break;
5558     case T_INT:
5559     case T_FLOAT:
5560       vmovmskps(tmp, mask, vec_enc);
5561       need_clip = masklen < 4;
5562       break;
5563     case T_LONG:
5564     case T_DOUBLE:
5565       vmovmskpd(tmp, mask, vec_enc);
5566       need_clip = masklen < 2;
5567       break;
5568     default: assert(false, "Unhandled type, %s", type2name(bt));
5569   }
5570 
  // Masks generated by partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5573   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5574     // need_clip implies masklen < 32
5575     andq(tmp, (1 << masklen) - 1);
5576   }
5577 
5578   vector_mask_operation_helper(opc, dst, tmp, masklen);
5579 }
5580 
5581 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5582                                              Register rtmp2, int mask_len) {
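  // Sketch of the idea: clip the mask to mask_len bits, then pext an all-ones
  // value through it; the result has its low popcount(mask) bits set, which is
  // the compressed mask. Illustrative example: a clipped mask of 0b1010
  // compresses to 0b0011.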
5583   kmov(rtmp1, src);
5584   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5585   mov64(rtmp2, -1L);
5586   pextq(rtmp2, rtmp2, rtmp1);
5587   kmov(dst, rtmp2);
5588 }
5589 
5590 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5591                                                     XMMRegister mask, Register rtmp, Register rscratch,
5592                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5593                                                     int vec_enc) {
5594   assert(type2aelembytes(bt) >= 4, "");
5595   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5596   address compress_perm_table = nullptr;
5597   address expand_perm_table = nullptr;
5598   if (type2aelembytes(bt) == 8) {
5599     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5600     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5601     vmovmskpd(rtmp, mask, vec_enc);
5602   } else {
5603     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5604     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5605     vmovmskps(rtmp, mask, vec_enc);
5606   }
5607   shlq(rtmp, 5); // for 32 byte permute row.
5608   if (opcode == Op_CompressV) {
5609     lea(rscratch, ExternalAddress(compress_perm_table));
5610   } else {
5611     lea(rscratch, ExternalAddress(expand_perm_table));
5612   }
5613   addptr(rtmp, rscratch);
5614   vmovdqu(permv, Address(rtmp));
5615   vpermps(dst, permv, src, Assembler::AVX_256bit);
5616   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, which can be used as a blending mask after
  // compressing/expanding the source vector lanes.
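  // Illustrative example for Op_CompressV on 8 int lanes with mask 0b00100110:
  // the permute row is expected to be [1, 2, 5, -1, -1, -1, -1, -1], so lanes
  // 1, 2 and 5 are packed into positions 0..2 while the remaining positions,
  // whose permute entries have the sign bit set, are blended with zero.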
5621   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5622 }
5623 
5624 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5625                                                bool merge, BasicType bt, int vec_enc) {
5626   if (opcode == Op_CompressV) {
5627     switch(bt) {
5628     case T_BYTE:
5629       evpcompressb(dst, mask, src, merge, vec_enc);
5630       break;
5631     case T_CHAR:
5632     case T_SHORT:
5633       evpcompressw(dst, mask, src, merge, vec_enc);
5634       break;
5635     case T_INT:
5636       evpcompressd(dst, mask, src, merge, vec_enc);
5637       break;
5638     case T_FLOAT:
5639       evcompressps(dst, mask, src, merge, vec_enc);
5640       break;
5641     case T_LONG:
5642       evpcompressq(dst, mask, src, merge, vec_enc);
5643       break;
5644     case T_DOUBLE:
5645       evcompresspd(dst, mask, src, merge, vec_enc);
5646       break;
5647     default:
5648       fatal("Unsupported type %s", type2name(bt));
5649       break;
5650     }
5651   } else {
5652     assert(opcode == Op_ExpandV, "");
5653     switch(bt) {
5654     case T_BYTE:
5655       evpexpandb(dst, mask, src, merge, vec_enc);
5656       break;
5657     case T_CHAR:
5658     case T_SHORT:
5659       evpexpandw(dst, mask, src, merge, vec_enc);
5660       break;
5661     case T_INT:
5662       evpexpandd(dst, mask, src, merge, vec_enc);
5663       break;
5664     case T_FLOAT:
5665       evexpandps(dst, mask, src, merge, vec_enc);
5666       break;
5667     case T_LONG:
5668       evpexpandq(dst, mask, src, merge, vec_enc);
5669       break;
5670     case T_DOUBLE:
5671       evexpandpd(dst, mask, src, merge, vec_enc);
5672       break;
5673     default:
5674       fatal("Unsupported type %s", type2name(bt));
5675       break;
5676     }
5677   }
5678 }
5679 
5680 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5681                                            KRegister ktmp1, int vec_enc) {
5682   if (opcode == Op_SignumVD) {
5683     vsubpd(dst, zero, one, vec_enc);
5684     // if src < 0 ? -1 : 1
5685     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5686     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5687     // if src == NaN, -0.0 or 0.0 return src.
5688     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5689     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5690   } else {
5691     assert(opcode == Op_SignumVF, "");
5692     vsubps(dst, zero, one, vec_enc);
5693     // if src < 0 ? -1 : 1
5694     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5695     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5696     // if src == NaN, -0.0 or 0.0 return src.
5697     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5698     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5699   }
5700 }
5701 
5702 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5703                                           XMMRegister xtmp1, int vec_enc) {
5704   if (opcode == Op_SignumVD) {
5705     vsubpd(dst, zero, one, vec_enc);
5706     // if src < 0 ? -1 : 1
5707     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5708     // if src == NaN, -0.0 or 0.0 return src.
5709     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5710     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5711   } else {
5712     assert(opcode == Op_SignumVF, "");
5713     vsubps(dst, zero, one, vec_enc);
5714     // if src < 0 ? -1 : 1
5715     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5716     // if src == NaN, -0.0 or 0.0 return src.
5717     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5718     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5719   }
5720 }
5721 
5722 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5723   if (VM_Version::supports_avx512bw()) {
5724     if (mask_len > 32) {
5725       kmovql(dst, src);
5726     } else {
5727       kmovdl(dst, src);
5728       if (mask_len != 32) {
5729         kshiftrdl(dst, dst, 32 - mask_len);
5730       }
5731     }
5732   } else {
5733     assert(mask_len <= 16, "");
5734     kmovwl(dst, src);
5735     if (mask_len != 16) {
5736       kshiftrwl(dst, dst, 16 - mask_len);
5737     }
5738   }
5739 }
5740 
5741 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5742   int lane_size = type2aelembytes(bt);
5743   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5744       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5745     movptr(rtmp, imm32);
5746     switch(lane_size) {
5747       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5748       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5749       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5750       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5753     }
5754   } else {
5755     movptr(rtmp, imm32);
5756     movq(dst, rtmp);
5757     switch(lane_size) {
5758       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5759       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5760       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5761       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5764     }
5765   }
5766 }
5767 
5768 //
// The following is a lookup table based popcount computation algorithm:
5770 //       Index   Bit set count
5771 //     [ 0000 ->   0,
5772 //       0001 ->   1,
5773 //       0010 ->   1,
5774 //       0011 ->   2,
5775 //       0100 ->   1,
5776 //       0101 ->   2,
5777 //       0110 ->   2,
5778 //       0111 ->   3,
5779 //       1000 ->   1,
5780 //       1001 ->   2,
//       1010 ->   2,
5782 //       1011 ->   3,
5783 //       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5790 //     shuffle indices for lookup table access.
5791 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5792 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5793 //     count of all the bytes of a quadword.
5794 //  f. Perform step e. for upper 128bit vector lane.
5795 //  g. Pack the bitset count of quadwords back to double word.
5796 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
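//
// Illustrative example (not emitted code): for the byte 0xB7 the lower nibble
// 0x7 maps to 3 via the table and the upper nibble 0xB maps to 3, so the
// per-byte popcount is 3 + 3 = 6, matching popcount(0b10110111).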
5797 
5798 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5799                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5800   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5801   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5802   vpsrlw(dst, src, 4, vec_enc);
5803   vpand(dst, dst, xtmp1, vec_enc);
5804   vpand(xtmp1, src, xtmp1, vec_enc);
5805   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5806   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5807   vpshufb(dst, xtmp2, dst, vec_enc);
5808   vpaddb(dst, dst, xtmp1, vec_enc);
5809 }
5810 
5811 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5812                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5813   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5814   // Following code is as per steps e,f,g and h of above algorithm.
5815   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5816   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5817   vpsadbw(dst, dst, xtmp2, vec_enc);
5818   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5819   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5820   vpackuswb(dst, xtmp1, dst, vec_enc);
5821 }
5822 
5823 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5824                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5825   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5826   // Add the popcount of upper and lower bytes of word.
5827   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5828   vpsrlw(dst, xtmp1, 8, vec_enc);
5829   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5830   vpaddw(dst, dst, xtmp1, vec_enc);
5831 }
5832 
5833 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5834                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5835   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5836   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5837   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5838 }
5839 
5840 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5841                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5842   switch(bt) {
5843     case T_LONG:
5844       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5845       break;
5846     case T_INT:
5847       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5848       break;
5849     case T_CHAR:
5850     case T_SHORT:
5851       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5852       break;
5853     case T_BYTE:
5854     case T_BOOLEAN:
5855       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5856       break;
5857     default:
5858       fatal("Unsupported type %s", type2name(bt));
5859       break;
5860   }
5861 }
5862 
5863 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5864                                                       KRegister mask, bool merge, int vec_enc) {
5865   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5866   switch(bt) {
5867     case T_LONG:
5868       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5869       evpopcntq(dst, mask, src, merge, vec_enc);
5870       break;
5871     case T_INT:
5872       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5873       evpopcntd(dst, mask, src, merge, vec_enc);
5874       break;
5875     case T_CHAR:
5876     case T_SHORT:
5877       assert(VM_Version::supports_avx512_bitalg(), "");
5878       evpopcntw(dst, mask, src, merge, vec_enc);
5879       break;
5880     case T_BYTE:
5881     case T_BOOLEAN:
5882       assert(VM_Version::supports_avx512_bitalg(), "");
5883       evpopcntb(dst, mask, src, merge, vec_enc);
5884       break;
5885     default:
5886       fatal("Unsupported type %s", type2name(bt));
5887       break;
5888   }
5889 }
5890 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value. Thus the reversed bit sequence of a byte
// is obtained by swapping the reversed bit sequences of its upper and lower
// nibbles.
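// Illustrative example: for the byte 0xB4 (0b10110100) the lower nibble 0x4
// reverses to 0x2 and the upper nibble 0xB reverses to 0xD, so the reversed
// byte is 0x2D (0b00101101).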
5897 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5898                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5899   if (VM_Version::supports_avx512vlbw()) {
5900 
5901     // Get the reverse bit sequence of lower nibble of each byte.
5902     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5903     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5904     evpandq(dst, xtmp2, src, vec_enc);
5905     vpshufb(dst, xtmp1, dst, vec_enc);
5906     vpsllq(dst, dst, 4, vec_enc);
5907 
5908     // Get the reverse bit sequence of upper nibble of each byte.
5909     vpandn(xtmp2, xtmp2, src, vec_enc);
5910     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5911     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5912 
    // OR the left shifted reversed bit sequence of the lower nibble with the
    // right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5915     evporq(xtmp2, dst, xtmp2, vec_enc);
5916     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5917 
  } else if (vec_enc == Assembler::AVX_512bit) {
5919     // Shift based bit reversal.
5920     assert(bt == T_LONG || bt == T_INT, "");
5921 
5922     // Swap lower and upper nibble of each byte.
5923     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5924 
5925     // Swap two least and most significant bits of each nibble.
5926     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5927 
5928     // Swap adjacent pair of bits.
5929     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5930     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5931 
5932     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5933     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5934   } else {
5935     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5936     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5937 
5938     // Get the reverse bit sequence of lower nibble of each byte.
5939     vpand(dst, xtmp2, src, vec_enc);
5940     vpshufb(dst, xtmp1, dst, vec_enc);
5941     vpsllq(dst, dst, 4, vec_enc);
5942 
5943     // Get the reverse bit sequence of upper nibble of each byte.
5944     vpandn(xtmp2, xtmp2, src, vec_enc);
5945     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5946     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5947 
    // OR the left shifted reversed bit sequence of the lower nibble with the
    // right shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5950     vpor(xtmp2, dst, xtmp2, vec_enc);
5951     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5952   }
5953 }
5954 
5955 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5956                                                 XMMRegister xtmp, Register rscratch) {
5957   assert(VM_Version::supports_gfni(), "");
5958   assert(rscratch != noreg || always_reachable(mask), "missing");
5959 
5960   // Galois field instruction based bit reversal based on following algorithm.
5961   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5962   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5963   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5964   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5965 }
5966 
5967 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5968                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5969   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5970   evpandq(dst, xtmp1, src, vec_enc);
5971   vpsllq(dst, dst, nbits, vec_enc);
5972   vpandn(xtmp1, xtmp1, src, vec_enc);
5973   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5974   evporq(dst, dst, xtmp1, vec_enc);
5975 }
5976 
5977 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5978                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5979   // Shift based bit reversal.
5980   assert(VM_Version::supports_evex(), "");
5981   switch(bt) {
5982     case T_LONG:
5983       // Swap upper and lower double word of each quad word.
5984       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5985       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5986       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5987       break;
5988     case T_INT:
5989       // Swap upper and lower word of each double word.
5990       evprord(xtmp1, k0, src, 16, true, vec_enc);
5991       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5992       break;
5993     case T_CHAR:
5994     case T_SHORT:
5995       // Swap upper and lower byte of each word.
5996       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5997       break;
5998     case T_BYTE:
5999       evmovdquq(dst, k0, src, true, vec_enc);
6000       break;
6001     default:
6002       fatal("Unsupported type %s", type2name(bt));
6003       break;
6004   }
6005 }
6006 
6007 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6008   if (bt == T_BYTE) {
6009     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6010       evmovdquq(dst, k0, src, true, vec_enc);
6011     } else {
6012       vmovdqu(dst, src);
6013     }
6014     return;
6015   }
6016   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6017   // pre-computed shuffle indices.
6018   switch(bt) {
6019     case T_LONG:
6020       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6021       break;
6022     case T_INT:
6023       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6024       break;
6025     case T_CHAR:
6026     case T_SHORT:
6027       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6028       break;
6029     default:
6030       fatal("Unsupported type %s", type2name(bt));
6031       break;
6032   }
6033   vpshufb(dst, src, dst, vec_enc);
6034 }
6035 
6036 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6037                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6038                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6039   assert(is_integral_type(bt), "");
6040   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6041   assert(VM_Version::supports_avx512cd(), "");
6042   switch(bt) {
6043     case T_LONG:
6044       evplzcntq(dst, ktmp, src, merge, vec_enc);
6045       break;
6046     case T_INT:
6047       evplzcntd(dst, ktmp, src, merge, vec_enc);
6048       break;
6049     case T_SHORT:
6050       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6051       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6052       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6053       vpunpckhwd(dst, xtmp1, src, vec_enc);
6054       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6055       vpackusdw(dst, xtmp2, dst, vec_enc);
6056       break;
6057     case T_BYTE:
6058       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6059       // accessing the lookup table.
6060       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6061       // accessing the lookup table.
6062       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
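      // Illustrative example: for the byte 0x05 the table yields T2 = 4 for the
      // all-zero upper nibble and T1 = 1 for the lower nibble 0x5, so the lane
      // result is 4 + 1 = 5 leading zeros.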
6063       assert(VM_Version::supports_avx512bw(), "");
6064       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6065       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6066       vpand(xtmp2, dst, src, vec_enc);
6067       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6068       vpsrlw(xtmp3, src, 4, vec_enc);
6069       vpand(xtmp3, dst, xtmp3, vec_enc);
6070       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6071       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6072       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6073       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6074       break;
6075     default:
6076       fatal("Unsupported type %s", type2name(bt));
6077       break;
6078   }
6079 }
6080 
6081 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6082                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6083   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6084   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6085   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6086   // accessing the lookup table.
6087   vpand(dst, xtmp2, src, vec_enc);
6088   vpshufb(dst, xtmp1, dst, vec_enc);
6089   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6090   // accessing the lookup table.
6091   vpsrlw(xtmp3, src, 4, vec_enc);
6092   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6093   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6094   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6095   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6096   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6097   vpaddb(dst, dst, xtmp2, vec_enc);
6098   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6099 }
6100 
6101 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6102                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6103   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6104   // Add zero counts of lower byte and upper byte of a word if
6105   // upper byte holds a zero value.
6106   vpsrlw(xtmp3, src, 8, vec_enc);
6107   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6108   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6109   vpsllw(xtmp2, dst, 8, vec_enc);
6110   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6111   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6112   vpsrlw(dst, dst, 8, vec_enc);
6113 }
6114 
6115 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6116                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized
  // 1.x form, the biased exponent can be used to compute the leading zero count as
  // per the following formula:
  //    LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for zero, max_int and negative source values.
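  //
  // Illustrative example: for src = 0x00001000 the biased exponent after the
  // int-to-float conversion is 139, so exponent_plus_one = 139 - 127 + 1 = 13
  // and LZCNT = 32 - 13 = 19, the number of leading zeros of 0x00001000.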
6122 
6123   // Broadcast 0xFF
6124   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6125   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6126 
6127   // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6128   // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6129   // contributes to the leading number of zeros.
6130   vpsrld(xtmp2, src, 1, vec_enc);
6131   vpandn(xtmp3, xtmp2, src, vec_enc);
6132 
6133   // Extract biased exponent.
6134   vcvtdq2ps(dst, xtmp3, vec_enc);
6135   vpsrld(dst, dst, 23, vec_enc);
6136   vpand(dst, dst, xtmp1, vec_enc);
6137 
6138   // Broadcast 127.
6139   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6140   // Exponent = biased_exp - 127
6141   vpsubd(dst, dst, xtmp1, vec_enc);
6142 
6143   // Exponent_plus_one = Exponent + 1
6144   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6145   vpaddd(dst, dst, xtmp3, vec_enc);
6146 
  // Replace a negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
6149   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6150   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6151 
6152   // Rematerialize broadcast 32.
6153   vpslld(xtmp1, xtmp3, 5, vec_enc);
6154   // Exponent is 32 if corresponding source lane contains max_int value.
6155   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6156   // LZCNT = 32 - exponent_plus_one
6157   vpsubd(dst, xtmp1, dst, vec_enc);
6158 
6159   // Replace LZCNT with a value 1 if corresponding source lane
6160   // contains max_int value.
6161   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6162 
6163   // Replace biased_exp with 0 if source lane value is less than zero.
6164   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6165   vblendvps(dst, dst, xtmp2, src, vec_enc);
6166 }
6167 
6168 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6169                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6170   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6171   // Add zero counts of lower word and upper word of a double word if
6172   // upper word holds a zero value.
6173   vpsrld(xtmp3, src, 16, vec_enc);
6174   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6175   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6176   vpslld(xtmp2, dst, 16, vec_enc);
6177   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6178   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6179   vpsrld(dst, dst, 16, vec_enc);
6180   // Add zero counts of lower doubleword and upper doubleword of a
6181   // quadword if upper doubleword holds a zero value.
6182   vpsrlq(xtmp3, src, 32, vec_enc);
6183   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6184   vpsllq(xtmp2, dst, 32, vec_enc);
6185   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6186   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6187   vpsrlq(dst, dst, 32, vec_enc);
6188 }
6189 
6190 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6191                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6192                                                        Register rtmp, int vec_enc) {
6193   assert(is_integral_type(bt), "unexpected type");
6194   assert(vec_enc < Assembler::AVX_512bit, "");
6195   switch(bt) {
6196     case T_LONG:
6197       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6198       break;
6199     case T_INT:
6200       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6201       break;
6202     case T_SHORT:
6203       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6204       break;
6205     case T_BYTE:
6206       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6207       break;
6208     default:
6209       fatal("Unsupported type %s", type2name(bt));
6210       break;
6211   }
6212 }
6213 
6214 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6215   switch(bt) {
6216     case T_BYTE:
6217       vpsubb(dst, src1, src2, vec_enc);
6218       break;
6219     case T_SHORT:
6220       vpsubw(dst, src1, src2, vec_enc);
6221       break;
6222     case T_INT:
6223       vpsubd(dst, src1, src2, vec_enc);
6224       break;
6225     case T_LONG:
6226       vpsubq(dst, src1, src2, vec_enc);
6227       break;
6228     default:
6229       fatal("Unsupported type %s", type2name(bt));
6230       break;
6231   }
6232 }
6233 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
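//
// Illustrative example for a byte lane: x = 0b01011000 gives
// (x - 1) & ~x = 0b00000111, CLZ = 5 and CTZ = 8 - 5 = 3, matching the three
// trailing zeros of x. For x = 0 the expression is all ones, CLZ = 0 and
// CTZ = 8, the full lane width.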
6238 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6239                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6240                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6241   assert(is_integral_type(bt), "");
6242   // xtmp = -1
6243   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6244   // xtmp = xtmp + src
6245   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6246   // xtmp = xtmp & ~src
6247   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6248   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6249   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6250   vpsub(bt, dst, xtmp4, dst, vec_enc);
6251 }
6252 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
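//
// Illustrative example for a byte lane: x = 0b01011000 gives
// x | -x = 0b11111000, POPC = 5 and CTZ = 8 - 5 = 3.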
6255 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6256                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6257   assert(is_integral_type(bt), "");
6258   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6260   // xtmp = 0 - src
6261   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6262   // xtmp = xtmp | src
6263   vpor(xtmp3, xtmp3, src, vec_enc);
6264   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6265   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6266   vpsub(bt, dst, xtmp1, dst, vec_enc);
6267 }
6268 
6269 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6270   Label done;
6271   Label neg_divisor_fastpath;
6272   cmpl(divisor, 0);
6273   jccb(Assembler::less, neg_divisor_fastpath);
6274   xorl(rdx, rdx);
6275   divl(divisor);
6276   jmpb(done);
6277   bind(neg_divisor_fastpath);
6278   // Fastpath for divisor < 0:
6279   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6280   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
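  // Illustrative example: dividend = 0x80000005, divisor = 0x80000001 gives
  // dividend - divisor = 4 and dividend & ~4 = 0x80000001, whose sign bit
  // yields quotient 1; when dividend <u divisor the sign bit is clear and the
  // quotient is 0.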
6281   movl(rdx, rax);
6282   subl(rdx, divisor);
6283   if (VM_Version::supports_bmi1()) {
6284     andnl(rax, rdx, rax);
6285   } else {
6286     notl(rdx);
6287     andl(rax, rdx);
6288   }
6289   shrl(rax, 31);
6290   bind(done);
6291 }
6292 
6293 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6294   Label done;
6295   Label neg_divisor_fastpath;
6296   cmpl(divisor, 0);
6297   jccb(Assembler::less, neg_divisor_fastpath);
6298   xorl(rdx, rdx);
6299   divl(divisor);
6300   jmpb(done);
6301   bind(neg_divisor_fastpath);
6302   // Fastpath when divisor < 0:
6303   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6304   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
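  // Illustrative example: dividend = 0x80000005, divisor = 0x80000001 gives a
  // quotient bit of 1, so remainder = dividend - divisor = 4.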
6305   movl(rdx, rax);
6306   subl(rax, divisor);
6307   if (VM_Version::supports_bmi1()) {
6308     andnl(rax, rax, rdx);
6309   } else {
6310     notl(rax);
6311     andl(rax, rdx);
6312   }
6313   sarl(rax, 31);
6314   andl(rax, divisor);
6315   subl(rdx, rax);
6316   bind(done);
6317 }
6318 
6319 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6320   Label done;
6321   Label neg_divisor_fastpath;
6322 
6323   cmpl(divisor, 0);
6324   jccb(Assembler::less, neg_divisor_fastpath);
6325   xorl(rdx, rdx);
6326   divl(divisor);
6327   jmpb(done);
6328   bind(neg_divisor_fastpath);
6329   // Fastpath for divisor < 0:
6330   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6331   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6332   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6333   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6334   movl(rdx, rax);
6335   subl(rax, divisor);
6336   if (VM_Version::supports_bmi1()) {
6337     andnl(rax, rax, rdx);
6338   } else {
6339     notl(rax);
6340     andl(rax, rdx);
6341   }
6342   movl(tmp, rax);
6343   shrl(rax, 31); // quotient
6344   sarl(tmp, 31);
6345   andl(tmp, divisor);
6346   subl(rdx, tmp); // remainder
6347   bind(done);
6348 }
6349 
6350 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6351                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6353     // Galois field instruction based bit reversal based on following algorithm.
6354     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6355     mov64(rtmp, 0x8040201008040201L);
6356     movq(xtmp1, src);
6357     movq(xtmp2, rtmp);
6358     gf2p8affineqb(xtmp1, xtmp2, 0);
6359     movq(dst, xtmp1);
6360   } else {
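    // Classic divide-and-conquer bit reversal: successive swaps of 1, 2 and
    // 4 bit groups reverse the bits within each byte, and the final bswapl
    // reverses the byte order. Illustrative example: 0x00000001 -> 0x80000000.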
6361     // Swap even and odd numbered bits.
6362     movl(rtmp, src);
6363     andl(rtmp, 0x55555555);
6364     shll(rtmp, 1);
6365     movl(dst, src);
6366     andl(dst, 0xAAAAAAAA);
6367     shrl(dst, 1);
6368     orl(dst, rtmp);
6369 
6370     // Swap LSB and MSB 2 bits of each nibble.
6371     movl(rtmp, dst);
6372     andl(rtmp, 0x33333333);
6373     shll(rtmp, 2);
6374     andl(dst, 0xCCCCCCCC);
6375     shrl(dst, 2);
6376     orl(dst, rtmp);
6377 
6378     // Swap LSB and MSB 4 bits of each byte.
6379     movl(rtmp, dst);
6380     andl(rtmp, 0x0F0F0F0F);
6381     shll(rtmp, 4);
6382     andl(dst, 0xF0F0F0F0);
6383     shrl(dst, 4);
6384     orl(dst, rtmp);
6385   }
6386   bswapl(dst);
6387 }
6388 
6389 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6390                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6392     // Galois field instruction based bit reversal based on following algorithm.
6393     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6394     mov64(rtmp1, 0x8040201008040201L);
6395     movq(xtmp1, src);
6396     movq(xtmp2, rtmp1);
6397     gf2p8affineqb(xtmp1, xtmp2, 0);
6398     movq(dst, xtmp1);
6399   } else {
6400     // Swap even and odd numbered bits.
6401     movq(rtmp1, src);
6402     mov64(rtmp2, 0x5555555555555555L);
6403     andq(rtmp1, rtmp2);
6404     shlq(rtmp1, 1);
6405     movq(dst, src);
6406     notq(rtmp2);
6407     andq(dst, rtmp2);
6408     shrq(dst, 1);
6409     orq(dst, rtmp1);
6410 
6411     // Swap LSB and MSB 2 bits of each nibble.
6412     movq(rtmp1, dst);
6413     mov64(rtmp2, 0x3333333333333333L);
6414     andq(rtmp1, rtmp2);
6415     shlq(rtmp1, 2);
6416     notq(rtmp2);
6417     andq(dst, rtmp2);
6418     shrq(dst, 2);
6419     orq(dst, rtmp1);
6420 
6421     // Swap LSB and MSB 4 bits of each byte.
6422     movq(rtmp1, dst);
6423     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6424     andq(rtmp1, rtmp2);
6425     shlq(rtmp1, 4);
6426     notq(rtmp2);
6427     andq(dst, rtmp2);
6428     shrq(dst, 4);
6429     orq(dst, rtmp1);
6430   }
6431   bswapq(dst);
6432 }
6433 
6434 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6435   Label done;
6436   Label neg_divisor_fastpath;
6437   cmpq(divisor, 0);
6438   jccb(Assembler::less, neg_divisor_fastpath);
6439   xorl(rdx, rdx);
6440   divq(divisor);
6441   jmpb(done);
6442   bind(neg_divisor_fastpath);
6443   // Fastpath for divisor < 0:
6444   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6445   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6446   movq(rdx, rax);
6447   subq(rdx, divisor);
6448   if (VM_Version::supports_bmi1()) {
6449     andnq(rax, rdx, rax);
6450   } else {
6451     notq(rdx);
6452     andq(rax, rdx);
6453   }
6454   shrq(rax, 63);
6455   bind(done);
6456 }
6457 
6458 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6459   Label done;
6460   Label neg_divisor_fastpath;
6461   cmpq(divisor, 0);
6462   jccb(Assembler::less, neg_divisor_fastpath);
6463   xorq(rdx, rdx);
6464   divq(divisor);
6465   jmp(done);
6466   bind(neg_divisor_fastpath);
6467   // Fastpath when divisor < 0:
6468   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6469   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6470   movq(rdx, rax);
6471   subq(rax, divisor);
6472   if (VM_Version::supports_bmi1()) {
6473     andnq(rax, rax, rdx);
6474   } else {
6475     notq(rax);
6476     andq(rax, rdx);
6477   }
6478   sarq(rax, 63);
6479   andq(rax, divisor);
6480   subq(rdx, rax);
6481   bind(done);
6482 }
6483 
6484 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6485   Label done;
6486   Label neg_divisor_fastpath;
6487   cmpq(divisor, 0);
6488   jccb(Assembler::less, neg_divisor_fastpath);
6489   xorq(rdx, rdx);
6490   divq(divisor);
6491   jmp(done);
6492   bind(neg_divisor_fastpath);
6493   // Fastpath for divisor < 0:
6494   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6495   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6496   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6497   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6498   movq(rdx, rax);
6499   subq(rax, divisor);
6500   if (VM_Version::supports_bmi1()) {
6501     andnq(rax, rax, rdx);
6502   } else {
6503     notq(rax);
6504     andq(rax, rdx);
6505   }
6506   movq(tmp, rax);
6507   shrq(rax, 63); // quotient
6508   sarq(tmp, 63);
6509   andq(tmp, divisor);
6510   subq(rdx, tmp); // remainder
6511   bind(done);
6512 }
6513 
6514 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6515                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6516                                         int vlen_enc) {
6517   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices differing
  // by a multiple of 16 (e.g. 0, 16, 32 and 48) address the same relative
  // position within their respective 128 bit lanes.
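  // Illustrative example: shuffle index 37 normalizes to 37 & 0xF = 5; the
  // third step below (32 <= INDEX < 48) broadcasts the third 128 bit lane and
  // the masked in-lane shuffle routes byte 5 of that lane to the destination.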
6524   movl(rtmp, 16);
6525   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6526 
  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
  // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to the true
  // mask to the destination vector.
6531   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6532   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6533   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6534 
6535   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6536   // and broadcasting second 128 bit lane.
6537   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6538   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6539   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6540   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6541   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6542 
6543   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6544   // and broadcasting third 128 bit lane.
6545   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6546   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6547   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6548   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6549   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6550 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6553   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6554   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6555   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6556   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6557   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6558 }
6559 
6560 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6561                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6562   if (vlen_enc == AVX_128bit) {
6563     vpermilps(dst, src, shuffle, vlen_enc);
6564   } else if (bt == T_INT) {
6565     vpermd(dst, shuffle, src, vlen_enc);
6566   } else {
6567     assert(bt == T_FLOAT, "");
6568     vpermps(dst, shuffle, src, vlen_enc);
6569   }
6570 }
6571 
6572 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6573   switch(opcode) {
6574     case Op_AddHF: vaddsh(dst, src1, src2); break;
6575     case Op_SubHF: vsubsh(dst, src1, src2); break;
6576     case Op_MulHF: vmulsh(dst, src1, src2); break;
6577     case Op_DivHF: vdivsh(dst, src1, src2); break;
6578     default: assert(false, "%s", NodeClassNames[opcode]); break;
6579   }
6580 }
6581 
6582 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6583   switch(elem_bt) {
6584     case T_BYTE:
6585       if (ideal_opc == Op_SaturatingAddV) {
6586         vpaddsb(dst, src1, src2, vlen_enc);
6587       } else {
6588         assert(ideal_opc == Op_SaturatingSubV, "");
6589         vpsubsb(dst, src1, src2, vlen_enc);
6590       }
6591       break;
6592     case T_SHORT:
6593       if (ideal_opc == Op_SaturatingAddV) {
6594         vpaddsw(dst, src1, src2, vlen_enc);
6595       } else {
6596         assert(ideal_opc == Op_SaturatingSubV, "");
6597         vpsubsw(dst, src1, src2, vlen_enc);
6598       }
6599       break;
6600     default:
6601       fatal("Unsupported type %s", type2name(elem_bt));
6602       break;
6603   }
6604 }
6605 
6606 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6607   switch(elem_bt) {
6608     case T_BYTE:
6609       if (ideal_opc == Op_SaturatingAddV) {
6610         vpaddusb(dst, src1, src2, vlen_enc);
6611       } else {
6612         assert(ideal_opc == Op_SaturatingSubV, "");
6613         vpsubusb(dst, src1, src2, vlen_enc);
6614       }
6615       break;
6616     case T_SHORT:
6617       if (ideal_opc == Op_SaturatingAddV) {
6618         vpaddusw(dst, src1, src2, vlen_enc);
6619       } else {
6620         assert(ideal_opc == Op_SaturatingSubV, "");
6621         vpsubusw(dst, src1, src2, vlen_enc);
6622       }
6623       break;
6624     default:
6625       fatal("Unsupported type %s", type2name(elem_bt));
6626       break;
6627   }
6628 }
6629 
6630 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6631                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6633   // overflow_mask = Inp1 <u Inp2
6634   evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
6635   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6636   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6637 }
6638 
6639 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6640                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6641   // Emulate unsigned comparison using signed comparison
6642   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
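  // Illustrative example for T_INT: Inp1 = 1, Inp2 = 0xFFFFFFFF; after adding
  // MIN_VALUE the operands become 0x80000001 and 0x7FFFFFFF, and the signed
  // comparison 0x80000001 <s 0x7FFFFFFF reproduces the unsigned result
  // 1 <u 0xFFFFFFFF.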
6643   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6644   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6645   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6646 
6647   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6648 
6649   // Res = INP1 - INP2 (non-commutative and non-associative)
6650   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6651   // Res = Mask ? Zero : Res
6652   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6653   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6654 }
6655 
6656 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6657                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6659   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6660   // Res = Signed Add INP1, INP2
6661   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6662   // T1 = SRC1 | SRC2
6663   vpor(xtmp1, src1, src2, vlen_enc);
6664   // Max_Unsigned = -1
6665   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6666   // Unsigned compare:  Mask = Res <u T1
6667   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6668   // res  = Mask ? Max_Unsigned : Res
6669   evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
6670 }
6671 
//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//    overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//
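// Illustrative example for T_INT lanes: a = 0xFFFFFFFF, b = 1 gives a + b = 0
// and a | b = 0xFFFFFFFF; since 0 <u 0xFFFFFFFF the overflow mask is set and
// the lane saturates to the unsigned maximum.
//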
6683 
6684 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6685                                                               XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6686   // Res = Signed Add INP1, INP2
6687   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6688   // Compute T1 = INP1 | INP2
6689   vpor(xtmp3, src1, src2, vlen_enc);
  // MIN_VALUE = Minimum signed value.
6691   vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6692   // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6693   vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6694   // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6695   vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6697   if (elem_bt == T_INT) {
6698     vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6699   } else {
6700     assert(elem_bt == T_LONG, "");
6701     vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6702   }
6703   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6704 }
6705 
6706 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6707                                       int vlen_enc, bool xtmp2_hold_M1) {
6708   if (VM_Version::supports_avx512dq()) {
6709     evpmovq2m(ktmp, src, vlen_enc);
6710   } else {
6711     assert(VM_Version::supports_evex(), "");
6712     if (!xtmp2_hold_M1) {
6713       vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6714     }
6715     evpsraq(xtmp1, src, 63, vlen_enc);
6716     evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6717   }
6718 }
6719 
6720 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6721                                       int vlen_enc, bool xtmp2_hold_M1) {
6722   if (VM_Version::supports_avx512dq()) {
6723     evpmovd2m(ktmp, src, vlen_enc);
6724   } else {
6725     assert(VM_Version::supports_evex(), "");
6726     if (!xtmp2_hold_M1) {
6727       vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6728     }
6729     vpsrad(xtmp1, src, 31, vlen_enc);
6730     Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6731   }
6732 }
6733 
6734 
6735 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6736   if (elem_bt == T_LONG) {
6737     if (VM_Version::supports_evex()) {
6738       evpsraq(dst, src, 63, vlen_enc);
6739     } else {
6740       vpsrad(dst, src, 31, vlen_enc);
6741       vpshufd(dst, dst, 0xF5, vlen_enc);
6742     }
6743   } else {
6744     assert(elem_bt == T_INT, "");
6745     vpsrad(dst, src, 31, vlen_enc);
6746   }
6747 }
6748 
6749 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6750   if (compute_allones) {
6751     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6752       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6753     } else {
6754       vpcmpeqq(allones, allones, allones, vlen_enc);
6755     }
6756   }
6757   if (elem_bt == T_LONG) {
6758     vpsrlq(dst, allones, 1, vlen_enc);
6759   } else {
6760     assert(elem_bt == T_INT, "");
6761     vpsrld(dst, allones, 1, vlen_enc);
6762   }
6763 }
6764 
6765 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6766   if (compute_allones) {
6767     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6768       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6769     } else {
6770       vpcmpeqq(allones, allones, allones, vlen_enc);
6771     }
6772   }
6773   if (elem_bt == T_LONG) {
6774     vpsllq(dst, allones, 63, vlen_enc);
6775   } else {
6776     assert(elem_bt == T_INT, "");
6777     vpslld(dst, allones, 31, vlen_enc);
6778   }
6779 }
6780 
6781 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6782                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6783   switch(elem_bt) {
6784     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6785     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6786     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6787     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6788     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6789   }
6790 }
6791 
6792 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6793   switch(elem_bt) {
6794     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6795     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6796     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6797     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6798     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6799   }
6800 }
6801 
6802 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6803                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6804   if (elem_bt == T_LONG) {
6805     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6806   } else {
6807     assert(elem_bt == T_INT, "");
6808     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6809   }
6810 }
6811 
6812 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6813                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6814                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6815   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6816   // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6817   // Overflow detection is based on Hacker's Delight, section 2-13.
6818   if (ideal_opc == Op_SaturatingAddV) {
6819     // res = src1 + src2
6820     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6821     // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
6822     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6823     vpxor(xtmp1, dst, src1, vlen_enc);
6824     vpxor(xtmp2, dst, src2, vlen_enc);
6825     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6826   } else {
6827     assert(ideal_opc == Op_SaturatingSubV, "");
6828     // res = src1 - src2
6829     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6830     // Overflow occurs when the inputs have opposite polarity and the
6831     // result's polarity differs from the first input's polarity.
6832     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6833     vpxor(xtmp1, src1, src2, vlen_enc);
6834     vpxor(xtmp2, dst, src1, vlen_enc);
6835     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6836   }
6837 
6838   // Compute overflow detection mask.
6839   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6840   // Note: xtmp1 holds -1 in all its lanes after the above call.
6841 
6842   // Compute mask based on first input polarity.
6843   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6844 
6845   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6846   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6847 
6848   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6849   // set bits in the first input polarity mask hold the MIN value.
6850   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6851   // Blend destination lanes with saturated values using overflow detection mask.
6852   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6853 }
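
// Scalar reference model (documentation only) of the saturating lowering
// above, following Hacker's Delight 2-13: detect overflow from the sign of
// (res ^ a) & (res ^ b), then saturate by the polarity of the first input
// (MIN_VALUE for a negative first input, MAX_VALUE otherwise). The
// subtraction case differs only in using (a ^ b) & (res ^ a) as the mask.
static inline jint saturating_add_jint_model(jint a, jint b) {
  jint res = (jint)((juint)a + (juint)b);          // wrap-around add
  bool overflow = (((res ^ a) & (res ^ b)) < 0);   // sign bit set => overflow
  if (!overflow) {
    return res;
  }
  return (a < 0) ? min_jint : max_jint;            // saturate by a's polarity
}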
6854 
6855 
6856 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6857                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6858                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6859   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6860   // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6861   // Overflow detection is based on Hacker's Delight, section 2-13.
6862   if (ideal_opc == Op_SaturatingAddV) {
6863     // res = src1 + src2
6864     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6865     // Overflow occurs if both inputs have the same polarity and the result's polarity differs from it.
6866     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6867     vpxor(xtmp1, dst, src1, vlen_enc);
6868     vpxor(xtmp2, dst, src2, vlen_enc);
6869     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6870   } else {
6871     assert(ideal_opc == Op_SaturatingSubV, "");
6872     // res = src1 - src2
6873     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6874     // Overflow occurs when the inputs have opposite polarity and the
6875     // result's polarity differs from the first input's polarity.
6876     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6877     vpxor(xtmp1, src1, src2, vlen_enc);
6878     vpxor(xtmp2, dst, src1, vlen_enc);
6879     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6880   }
6881 
6882   // Sign-extend to compute overflow detection mask.
6883   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6884 
6885   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6886   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6887   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6888 
6889   // Compose saturating min/max vector using first input polarity mask.
6890   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6891   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6892 
6893   // Blend result with saturating vector using overflow detection mask.
6894   vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6895 }
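
// Documentation-only sketch of the blend step used in the AVX path above:
// vpblendvb picks each byte from the third operand when the corresponding
// mask byte has its most significant bit set, so a 0/-1 lane-wide mask
// (produced by vpsign_extend_dq) selects whole 32/64-bit lanes at once.
static inline jlong blendvb_lane_model(jlong if_mask_clear, jlong if_mask_set, jlong mask) {
  return mask < 0 ? if_mask_set : if_mask_clear;   // mask is 0 or -1 per lane
}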
6896 
6897 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6898   switch(elem_bt) {
6899     case T_BYTE:
6900       if (ideal_opc == Op_SaturatingAddV) {
6901         vpaddsb(dst, src1, src2, vlen_enc);
6902       } else {
6903         assert(ideal_opc == Op_SaturatingSubV, "");
6904         vpsubsb(dst, src1, src2, vlen_enc);
6905       }
6906       break;
6907     case T_SHORT:
6908       if (ideal_opc == Op_SaturatingAddV) {
6909         vpaddsw(dst, src1, src2, vlen_enc);
6910       } else {
6911         assert(ideal_opc == Op_SaturatingSubV, "");
6912         vpsubsw(dst, src1, src2, vlen_enc);
6913       }
6914       break;
6915     default:
6916       fatal("Unsupported type %s", type2name(elem_bt));
6917       break;
6918   }
6919 }
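
// Illustrative scalar model (documentation only, not used by the generated
// code) of the signed saturating semantics provided directly by
// vpaddsb/vpaddsw and vpsubsb/vpsubsw: the wide result is clamped to the
// element type's [MIN, MAX] range instead of wrapping around.
static inline jbyte saturating_add_jbyte_model(jbyte a, jbyte b) {
  int wide = (int)a + (int)b;            // cannot overflow in int
  if (wide >  127) return (jbyte) 127;   // clamp to Byte.MAX_VALUE
  if (wide < -128) return (jbyte)-128;   // clamp to Byte.MIN_VALUE
  return (jbyte)wide;
}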
6920 
6921 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6922   switch(elem_bt) {
6923     case T_BYTE:
6924       if (ideal_opc == Op_SaturatingAddV) {
6925         vpaddusb(dst, src1, src2, vlen_enc);
6926       } else {
6927         assert(ideal_opc == Op_SaturatingSubV, "");
6928         vpsubusb(dst, src1, src2, vlen_enc);
6929       }
6930       break;
6931     case T_SHORT:
6932       if (ideal_opc == Op_SaturatingAddV) {
6933         vpaddusw(dst, src1, src2, vlen_enc);
6934       } else {
6935         assert(ideal_opc == Op_SaturatingSubV, "");
6936         vpsubusw(dst, src1, src2, vlen_enc);
6937       }
6938       break;
6939     default:
6940       fatal("Unsupported type %s", type2name(elem_bt));
6941       break;
6942   }
6943 }
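
// Illustrative scalar model (documentation only) of the unsigned saturating
// semantics of vpaddusb/vpaddusw and vpsubusb/vpsubusw: additions clamp at
// the all-ones value and subtractions clamp at zero instead of wrapping.
static inline jubyte saturating_uadd_jubyte_model(jubyte a, jubyte b) {
  int wide = (int)a + (int)b;
  return (jubyte)(wide > 0xFF ? 0xFF : wide);   // clamp at 255 on overflow
}
static inline jubyte saturating_usub_jubyte_model(jubyte a, jubyte b) {
  return (jubyte)(a > b ? a - b : 0);           // clamp at 0 on underflow
}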
6944 
6945 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6946                                                      XMMRegister src2, int vlen_enc) {
6947   switch(elem_bt) {
6948     case T_BYTE:
6949       evpermi2b(dst, src1, src2, vlen_enc);
6950       break;
6951     case T_SHORT:
6952       evpermi2w(dst, src1, src2, vlen_enc);
6953       break;
6954     case T_INT:
6955       evpermi2d(dst, src1, src2, vlen_enc);
6956       break;
6957     case T_LONG:
6958       evpermi2q(dst, src1, src2, vlen_enc);
6959       break;
6960     case T_FLOAT:
6961       evpermi2ps(dst, src1, src2, vlen_enc);
6962       break;
6963     case T_DOUBLE:
6964       evpermi2pd(dst, src1, src2, vlen_enc);
6965       break;
6966     default:
6967       fatal("Unsupported type %s", type2name(elem_bt));
6968       break;
6969   }
6970 }
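
// Illustrative per-lane model (documentation only; 'lanes_per_vector' is a
// hypothetical parameter standing in for the vector length, assumed to be a
// power of two) of the VPERMI2* instructions selected above: 'dst' initially
// holds indices, and each index picks an element from the concatenation of
// src1 (indices 0..N-1) and src2 (indices N..2N-1); only the low log2(2N)
// bits of the index are significant.
static inline jint permi2_lane_model(juint idx, const jint* src1, const jint* src2, juint lanes_per_vector) {
  juint sel = idx & (2 * lanes_per_vector - 1);
  return sel < lanes_per_vector ? src1[sel] : src2[sel - lanes_per_vector];
}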
6971 
6972 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6973   if (is_unsigned) {
6974     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6975   } else {
6976     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6977   }
6978 }
6979 
6980 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6981   if (is_unsigned) {
6982     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6983   } else {
6984     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6985   }
6986 }
6987 
6988 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6989   switch(opcode) {
6990     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6991     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6992     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6993     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6994     default: assert(false, "%s", NodeClassNames[opcode]); break;
6995   }
6996 }
6997 
6998 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6999   switch(opcode) {
7000     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7001     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7002     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7003     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7004     default: assert(false, "%s", NodeClassNames[opcode]); break;
7005   }
7006 }
7007 
7008 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7009                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7010   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7011 }
7012 
7013 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7014                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7015   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7016     // Move sign bits of src2 to mask register.
7017     evpmovw2m(ktmp, src2, vlen_enc);
7018     // xtmp1 = src2 < 0 ? src2 : src1
7019     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7020     // xtmp2 = src2 < 0 ? src1 : src2
7021     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7022     // The idea behind the above swapping is to make the second source operand a non-negative (+ve) value.
7023     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7024     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7025     // the second source operand, either a NaN or a valid floating-point value, is returned.
7026     // dst = max(xtmp1, xtmp2)
7027     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7028     // isNaN = is_unordered_quiet(xtmp1)
7029     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7030     // The final result is the same as the first source if it is a NaN;
7031     // if the second operand holds a NaN value then, as per the above semantics,
7032     // the result is the same as the second operand.
7033     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7034   } else {
7035     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7036     // Move sign bits of src1 to mask register.
7037     evpmovw2m(ktmp, src1, vlen_enc);
7038     // xtmp1 = src1 < 0 ? src2 : src1
7039     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7040     // xtmp2 = src1 < 0 ? src1 : src2
7041     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7042     // The idea behind the above swapping is to make the second source operand a negative (-ve) value.
7043     // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7044     // the second source operand is returned.
7045     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7046     // or a valid floating-point value, is written to the result.
7047     // dst = min(xtmp1, xtmp2)
7048     evminph(dst, xtmp1, xtmp2, vlen_enc);
7049     // isNaN = is_unordered_quiet(xtmp1)
7050     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7051     // The final result is the same as the first source if it is a NaN;
7052     // if the second operand holds a NaN value then, as per the above semantics,
7053     // the result is the same as the second operand.
7054     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7055   }
7056 }
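
// Scalar reference model (documentation only; uses float instead of Float16
// for readability, and jint_cast from globalDefinitions.hpp to read the sign
// bit) of the max sequence above. The swap makes the second argument of the
// hardware max the operand with a clear sign bit, so the instruction's
// "return the second source" rule on (+0.0, -0.0) and on NaN inputs yields
// Java semantics once a NaN first source is blended back in.
static inline float fp16_max_reference_model(float src1, float src2) {
  bool src2_neg = jint_cast(src2) < 0;   // sign bit of src2 (evpmovw2m)
  float a = src2_neg ? src2 : src1;      // xtmp1
  float b = src2_neg ? src1 : src2;      // xtmp2, sign bit clear unless NaN
  float res = (a > b) ? a : b;           // vmaxph: second source on equal/unordered
  if (a != a) {                          // evcmpph UNORD_Q self-compare
    res = a;                             // propagate NaN from the first source
  }
  return res;
}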