1 /*
   2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  49   int framesize = C->output()->frame_size_in_bytes();
  50   int bangsize = C->output()->bang_size_in_bytes();
  51   bool fp_mode_24b = false;
  52   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  56   // code safely. The push to verify stack depth is ok at 5 bytes,
  57   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  58   // stack bang then we must use the 6 byte frame allocation even if
  59   // we have no frame. :-(
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  67   // Calls to C2R adapters often do not accept exceptional returns.
  68   // We require that their callers must bang for them.  But be careful, because
  69   // some VM calls (such as call site linkage) can use several kilobytes of
  70   // stack.  But the stack safety zone should account for that.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
  82     // Remove word for ebp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
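    // A 32-bit immediate keeps this first instruction at least 5 bytes long,
    // which satisfies the patching constraint described in the WARNING above.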
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
 104 
 105   if (C->needs_stack_repair()) {
 106     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 107     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 108     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 109   }
 110 
 111   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 112     framesize -= wordSize;
 113     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 114   }
 115 
 116 #ifndef _LP64
 117   // If method sets FPU control word do it now
 118   if (fp_mode_24b) {
 119     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 120   }
 121   if (UseSSE >= 2 && VerifyFPU) {
 122     verify_FPU(0, "FPU stack must be clean on entry");
 123   }
 124 #endif
 125 
 126 #ifdef ASSERT
 127   if (VerifyStackAtCalls) {
 128     Label L;
 129     push(rax);
 130     mov(rax, rsp);
 131     andptr(rax, StackAlignmentInBytes-1);
 132     cmpptr(rax, StackAlignmentInBytes-wordSize);
 133     pop(rax);
 134     jcc(Assembler::equal, L);
 135     STOP("Stack is not properly aligned!");
 136     bind(L);
 137   }
 138 #endif
 139 }
 140 
 141 void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
 142   bind(stub->slow_path());
 143   call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
 144   jmp(stub->continuation(), false /* maybe_short */);
 145 }
 146 
 147 int C2_MacroAssembler::entry_barrier_stub_size() {
 148   return 10;
 149 }
 150 
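// Note: 4- and 8-byte vectors are encoded with the 128-bit (XMM) vector length
// below, since XMM is the narrowest SIMD register width available.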
 151 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 152   switch (vlen_in_bytes) {
 153     case  4: // fall-through
 154     case  8: // fall-through
 155     case 16: return Assembler::AVX_128bit;
 156     case 32: return Assembler::AVX_256bit;
 157     case 64: return Assembler::AVX_512bit;
 158 
 159     default: {
 160       ShouldNotReachHere();
 161       return Assembler::AVX_NoVec;
 162     }
 163   }
 164 }
 165 
 166 #if INCLUDE_RTM_OPT
 167 
 168 // Update rtm_counters based on abort status
 169 // input: abort_status
 170 //        rtm_counters (RTMLockingCounters*)
 171 // flags are killed
 172 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 173 
 174   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 175   if (PrintPreciseRTMLockingStatistics) {
 176     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 177       Label check_abort;
 178       testl(abort_status, (1<<i));
 179       jccb(Assembler::equal, check_abort);
 180       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 181       bind(check_abort);
 182     }
 183   }
 184 }
 185 
 186 // Branch if (random & (count-1) != 0), count is 2^n
 187 // tmp, scr and flags are killed
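// The low-order bits of the time-stamp counter serve as a cheap pseudo-random
// source here: on average only one in 'count' executions falls through to the
// code following the branch, which is used to sample RTM statistics.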
 188 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 189   assert(tmp == rax, "");
 190   assert(scr == rdx, "");
 191   rdtsc(); // modifies EDX:EAX
 192   andptr(tmp, count-1);
 193   jccb(Assembler::notZero, brLabel);
 194 }
 195 
 196 // Perform abort ratio calculation, set no_rtm bit if high ratio
 197 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 198 // tmpReg, rtm_counters_Reg and flags are killed
 199 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 200                                                     Register rtm_counters_Reg,
 201                                                     RTMLockingCounters* rtm_counters,
 202                                                     Metadata* method_data) {
 203   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 204 
 205   if (RTMLockingCalculationDelay > 0) {
 206     // Delay calculation
 207     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 208     testptr(tmpReg, tmpReg);
 209     jccb(Assembler::equal, L_done);
 210   }
 211   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 212   //   Aborted transactions = abort_count * 100
 213   //   All transactions = total_count *  RTMTotalCountIncrRate
 214   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
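  //   Illustrative example (not the flag defaults): with RTMAbortRatio=50,
  //   RTMTotalCountIncrRate=1, abort_count=1000 and total_count=1500 we compare
  //   1000*100 = 100000 against 1500*1*50 = 75000, so the no_rtm bit is set.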
 215 
 216   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 217   cmpptr(tmpReg, RTMAbortThreshold);
 218   jccb(Assembler::below, L_check_always_rtm2);
 219   imulptr(tmpReg, tmpReg, 100);
 220 
 221   Register scrReg = rtm_counters_Reg;
 222   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 223   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 224   imulptr(scrReg, scrReg, RTMAbortRatio);
 225   cmpptr(tmpReg, scrReg);
 226   jccb(Assembler::below, L_check_always_rtm1);
 227   if (method_data != NULL) {
 228     // set rtm_state to "no rtm" in MDO
 229     mov_metadata(tmpReg, method_data);
 230     lock();
 231     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 232   }
 233   jmpb(L_done);
 234   bind(L_check_always_rtm1);
 235   // Reload RTMLockingCounters* address
 236   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 237   bind(L_check_always_rtm2);
 238   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 239   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 240   jccb(Assembler::below, L_done);
 241   if (method_data != NULL) {
 242     // set rtm_state to "always rtm" in MDO
 243     mov_metadata(tmpReg, method_data);
 244     lock();
 245     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 246   }
 247   bind(L_done);
 248 }
 249 
 250 // Update counters and perform abort ratio calculation
 251 // input:  abort_status_Reg
 252 // rtm_counters_Reg, flags are killed
 253 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 254                                       Register rtm_counters_Reg,
 255                                       RTMLockingCounters* rtm_counters,
 256                                       Metadata* method_data,
 257                                       bool profile_rtm) {
 258 
 259   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 260   // update rtm counters based on rax value at abort
 261   // reads abort_status_Reg, updates flags
 262   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 263   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 264   if (profile_rtm) {
 265     // Save abort status because abort_status_Reg is used by following code.
 266     if (RTMRetryCount > 0) {
 267       push(abort_status_Reg);
 268     }
 269     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 270     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 271     // restore abort status
 272     if (RTMRetryCount > 0) {
 273       pop(abort_status_Reg);
 274     }
 275   }
 276 }
 277 
 278 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 279 // inputs: retry_count_Reg
 280 //       : abort_status_Reg
 281 // output: retry_count_Reg decremented by 1
 282 // flags are killed
 283 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 284   Label doneRetry;
 285   assert(abort_status_Reg == rax, "");
 286   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 287   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 288   // if reason is in 0x6 and retry count != 0 then retry
 289   andptr(abort_status_Reg, 0x6);
 290   jccb(Assembler::zero, doneRetry);
 291   testl(retry_count_Reg, retry_count_Reg);
 292   jccb(Assembler::zero, doneRetry);
 293   pause();
 294   decrementl(retry_count_Reg);
 295   jmp(retryLabel);
 296   bind(doneRetry);
 297 }
 298 
 299 // Spin and retry if lock is busy,
 300 // inputs: box_Reg (monitor address)
 301 //       : retry_count_Reg
 302 // output: retry_count_Reg decremented by 1
 303 //       : clear z flag if retry count exceeded
 304 // tmp_Reg, scr_Reg, flags are killed
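// Spins for at most RTMSpinLoopCount iterations, exiting early once the owner
// field is observed to be zero, and then branches back to retryLabel.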
 305 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 306                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 307   Label SpinLoop, SpinExit, doneRetry;
 308   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 309 
 310   testl(retry_count_Reg, retry_count_Reg);
 311   jccb(Assembler::zero, doneRetry);
 312   decrementl(retry_count_Reg);
 313   movptr(scr_Reg, RTMSpinLoopCount);
 314 
 315   bind(SpinLoop);
 316   pause();
 317   decrementl(scr_Reg);
 318   jccb(Assembler::lessEqual, SpinExit);
 319   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 320   testptr(tmp_Reg, tmp_Reg);
 321   jccb(Assembler::notZero, SpinLoop);
 322 
 323   bind(SpinExit);
 324   jmp(retryLabel);
 325   bind(doneRetry);
 326   incrementl(retry_count_Reg); // clear z flag
 327 }
 328 
 329 // Use RTM for normal stack locks
 330 // Input: objReg (object to lock)
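// On a successful xbegin the lock word is merely read and verified to be
// unlocked: the stack lock is elided rather than acquired, and a conflicting
// write to the object's header will abort the transaction.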
 331 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 332                                          Register retry_on_abort_count_Reg,
 333                                          RTMLockingCounters* stack_rtm_counters,
 334                                          Metadata* method_data, bool profile_rtm,
 335                                          Label& DONE_LABEL, Label& IsInflated) {
 336   assert(UseRTMForStackLocks, "why call this otherwise?");
 337   assert(tmpReg == rax, "");
 338   assert(scrReg == rdx, "");
 339   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 340 
 341   if (RTMRetryCount > 0) {
 342     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 343     bind(L_rtm_retry);
 344   }
 345   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 346   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 347   jcc(Assembler::notZero, IsInflated);
 348 
 349   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 350     Label L_noincrement;
 351     if (RTMTotalCountIncrRate > 1) {
 352       // tmpReg, scrReg and flags are killed
 353       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 354     }
 355     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 356     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 357     bind(L_noincrement);
 358   }
 359   xbegin(L_on_abort);
 360   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 361   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 362   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 363   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 364 
 365   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 366   if (UseRTMXendForLockBusy) {
 367     xend();
 368     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 369     jmp(L_decrement_retry);
 370   }
 371   else {
 372     xabort(0);
 373   }
 374   bind(L_on_abort);
 375   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 376     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 377   }
 378   bind(L_decrement_retry);
 379   if (RTMRetryCount > 0) {
 380     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 381     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 382   }
 383 }
 384 
 385 // Use RTM for inflating locks
 386 // inputs: objReg (object to lock)
 387 //         boxReg (on-stack box address (displaced header location) - KILLED)
 388 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 389 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 390                                             Register scrReg, Register retry_on_busy_count_Reg,
 391                                             Register retry_on_abort_count_Reg,
 392                                             RTMLockingCounters* rtm_counters,
 393                                             Metadata* method_data, bool profile_rtm,
 394                                             Label& DONE_LABEL) {
 395   assert(UseRTMLocking, "why call this otherwise?");
 396   assert(tmpReg == rax, "");
 397   assert(scrReg == rdx, "");
 398   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 399   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 400 
 401   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 402   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 403 
 404   if (RTMRetryCount > 0) {
 405     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 406     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 407     bind(L_rtm_retry);
 408   }
 409   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 410     Label L_noincrement;
 411     if (RTMTotalCountIncrRate > 1) {
 412       // tmpReg, scrReg and flags are killed
 413       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 414     }
 415     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 416     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 417     bind(L_noincrement);
 418   }
 419   xbegin(L_on_abort);
 420   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 421   movptr(tmpReg, Address(tmpReg, owner_offset));
 422   testptr(tmpReg, tmpReg);
 423   jcc(Assembler::zero, DONE_LABEL);
 424   if (UseRTMXendForLockBusy) {
 425     xend();
 426     jmp(L_decrement_retry);
 427   }
 428   else {
 429     xabort(0);
 430   }
 431   bind(L_on_abort);
 432   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 433   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 434     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 435   }
 436   if (RTMRetryCount > 0) {
 437     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 438     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 439   }
 440 
 441   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 442   testptr(tmpReg, tmpReg) ;
 443   jccb(Assembler::notZero, L_decrement_retry) ;
 444 
 445   // Appears unlocked - try to swing _owner from null to non-null.
 446   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 447 #ifdef _LP64
 448   Register threadReg = r15_thread;
 449 #else
 450   get_thread(scrReg);
 451   Register threadReg = scrReg;
 452 #endif
 453   lock();
 454   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 455 
 456   if (RTMRetryCount > 0) {
 457     // success done else retry
 458     jccb(Assembler::equal, DONE_LABEL) ;
 459     bind(L_decrement_retry);
 460     // Spin and retry if lock is busy.
 461     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 462   }
 463   else {
 464     bind(L_decrement_retry);
 465   }
 466 }
 467 
 468 #endif //  INCLUDE_RTM_OPT
 469 
 470 // fast_lock and fast_unlock used by C2
 471 
 472 // Because the transitions from emitted code to the runtime
 473 // monitorenter/exit helper stubs are so slow it's critical that
 474 // we inline both the stack-locking fast path and the inflated fast path.
 475 //
 476 // See also: cmpFastLock and cmpFastUnlock.
 477 //
 478 // What follows is a specialized inline transliteration of the code
 479 // in enter() and exit(). If we're concerned about I$ bloat another
 480 // option would be to emit TrySlowEnter and TrySlowExit methods
 481 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 483 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 484 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 485 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 490 //
 491 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 492 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 493 // to those specialized methods.  That'd give us a mostly platform-independent
 494 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross into native code would be
 496 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 497 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 498 // (b) explicit barriers or fence operations.
 499 //
 500 // TODO:
 501 //
 502 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 503 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 504 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 505 //    the lock operators would typically be faster than reifying Self.
 506 //
 507 // *  Ideally I'd define the primitives as:
 508 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 509 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 510 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 512 //    Furthermore the register assignments are overconstrained, possibly resulting in
 513 //    sub-optimal code near the synchronization site.
 514 //
 515 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 516 //    Alternately, use a better sp-proximity test.
 517 //
 518 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 519 //    Either one is sufficient to uniquely identify a thread.
 520 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 521 //
 522 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 525 //
 526 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 527 //    But beware of excessive branch density on AMD Opterons.
 528 //
 529 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 530 //    or failure of the fast path.  If the fast path fails then we pass
 531 //    control to the slow path, typically in C.  In fast_lock and
 532 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 533 //    will emit a conditional branch immediately after the node.
 534 //    So we have branches to branches and lots of ICC.ZF games.
 535 //    Instead, it might be better to have C2 pass a "FailureLabel"
 536 //    into fast_lock and fast_unlock.  In the case of success, control
 537 //    will drop through the node.  ICC.ZF is undefined at exit.
 538 //    In the case of failure, the node will branch directly to the
 539 //    FailureLabel
 540 
 541 
 542 // obj: object to lock
 543 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 545 // scr: tmp -- KILLED
 546 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 547                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 548                                  RTMLockingCounters* rtm_counters,
 549                                  RTMLockingCounters* stack_rtm_counters,
 550                                  Metadata* method_data,
 551                                  bool use_rtm, bool profile_rtm) {
 552   // Ensure the register assignments are disjoint
 553   assert(tmpReg == rax, "");
 554 
 555   if (use_rtm) {
 556     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 557   } else {
 558     assert(cx2Reg == noreg, "");
 559     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 560   }
 561 
 562   // Possible cases that we'll encounter in fast_lock
 563   // ------------------------------------------------
 564   // * Inflated
 565   //    -- unlocked
 566   //    -- Locked
 567   //       = by self
 568   //       = by other
 569   // * neutral
 570   // * stack-locked
 571   //    -- by self
 572   //       = sp-proximity test hits
 573   //       = sp-proximity test generates false-negative
 574   //    -- by other
 575   //
 576 
 577   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 578 
 579   if (DiagnoseSyncOnValueBasedClasses != 0) {
 580     load_klass(tmpReg, objReg, cx1Reg);
 581     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 582     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 583     jcc(Assembler::notZero, DONE_LABEL);
 584   }
 585 
 586 #if INCLUDE_RTM_OPT
 587   if (UseRTMForStackLocks && use_rtm) {
 588     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 589     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 590                       stack_rtm_counters, method_data, profile_rtm,
 591                       DONE_LABEL, IsInflated);
 592   }
 593 #endif // INCLUDE_RTM_OPT
 594 
 595   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 596   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 597   jccb(Assembler::notZero, IsInflated);
 598 
 599   if (!UseHeavyMonitors) {
 600     // Attempt stack-locking ...
 601     orptr (tmpReg, markWord::unlocked_value);
 602     if (EnableValhalla) {
 603       // Mask inline_type bit such that we go to the slow path if object is an inline type
 604       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 605     }
 606     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 607     lock();
 608     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 609     jcc(Assembler::equal, COUNT);           // Success
 610 
 611     // Recursive locking.
 612     // The object is stack-locked: markword contains stack pointer to BasicLock.
 613     // Locked by current thread if difference with current SP is less than one page.
 614     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 616     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 617     movptr(Address(boxReg, 0), tmpReg);
 618   } else {
 619     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 620     testptr(objReg, objReg);
 621   }
 622   jmp(DONE_LABEL);
 623 
 624   bind(IsInflated);
 625   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 626 
 627 #if INCLUDE_RTM_OPT
 628   // Use the same RTM locking code in 32- and 64-bit VM.
 629   if (use_rtm) {
 630     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 631                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 632   } else {
 633 #endif // INCLUDE_RTM_OPT
 634 
 635 #ifndef _LP64
 636   // The object is inflated.
 637 
 638   // boxReg refers to the on-stack BasicLock in the current frame.
 639   // We'd like to write:
 640   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 642   // additional latency as we have another ST in the store buffer that must drain.
 643 
 644   // avoid ST-before-CAS
 645   // register juggle because we need tmpReg for cmpxchgptr below
 646   movptr(scrReg, boxReg);
 647   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 648 
 649   // Optimistic form: consider XORL tmpReg,tmpReg
 650   movptr(tmpReg, NULL_WORD);
 651 
 652   // Appears unlocked - try to swing _owner from null to non-null.
 653   // Ideally, I'd manifest "Self" with get_thread and then attempt
 654   // to CAS the register containing Self into m->Owner.
 655   // But we don't have enough registers, so instead we can either try to CAS
 656   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 657   // we later store "Self" into m->Owner.  Transiently storing a stack address
 658   // (rsp or the address of the box) into  m->owner is harmless.
 659   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 660   lock();
 661   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 662   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 663   // If we weren't able to swing _owner from NULL to the BasicLock
 664   // then take the slow path.
 665   jccb  (Assembler::notZero, NO_COUNT);
 666   // update _owner from BasicLock to thread
 667   get_thread (scrReg);                    // beware: clobbers ICCs
 668   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 669   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 670 
 671   // If the CAS fails we can either retry or pass control to the slow path.
 672   // We use the latter tactic.
 673   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 674   // If the CAS was successful ...
 675   //   Self has acquired the lock
 676   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 677   // Intentional fall-through into DONE_LABEL ...
 678 #else // _LP64
 679   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 680   movq(scrReg, tmpReg);
 681   xorq(tmpReg, tmpReg);
 682   lock();
 683   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 684   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 685   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 686   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 687   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 688   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 689 
 690   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 691   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 692   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 693   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 694 #endif // _LP64
 695 #if INCLUDE_RTM_OPT
 696   } // use_rtm()
 697 #endif
 698   // DONE_LABEL is a hot target - we'd really like to place it at the
 699   // start of cache line by padding with NOPs.
 700   // See the AMD and Intel software optimization manuals for the
 701   // most efficient "long" NOP encodings.
 702   // Unfortunately none of our alignment mechanisms suffice.
 703   bind(DONE_LABEL);
 704 
 705   // ZFlag == 1 count in fast path
 706   // ZFlag == 0 count in slow path
 707   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 708 
 709   bind(COUNT);
 710   // Count monitors in fast path
 711 #ifndef _LP64
 712   get_thread(tmpReg);
 713   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 714 #else // _LP64
 715   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 716 #endif
 717 
 718   xorl(tmpReg, tmpReg); // Set ZF == 1
 719 
 720   bind(NO_COUNT);
 721 
 722   // At NO_COUNT the icc ZFlag is set as follows ...
 723   // fast_unlock uses the same protocol.
 724   // ZFlag == 1 -> Success
 725   // ZFlag == 0 -> Failure - force control through the slow path
 726 }
 727 
 728 // obj: object to unlock
 729 // box: box address (displaced header location), killed.  Must be EAX.
 730 // tmp: killed, cannot be obj nor box.
 731 //
 732 // Some commentary on balanced locking:
 733 //
 734 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 735 // Methods that don't have provably balanced locking are forced to run in the
 736 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 737 // The interpreter provides two properties:
 738 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
 740 //      interpreter maintains an on-stack list of locks currently held by
 741 //      a frame.
 742 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 744 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 746 // B() doesn't have provably balanced locking so it runs in the interpreter.
 747 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 748 // is still locked by A().
 749 //
 750 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 751 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 752 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 753 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 754 // Arguably given that the spec legislates the JNI case as undefined our implementation
 755 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 757 // A perfectly viable alternative is to elide the owner check except when
 758 // Xcheck:jni is enabled.
 759 
 760 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 761   assert(boxReg == rax, "");
 762   assert_different_registers(objReg, boxReg, tmpReg);
 763 
 764   Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 765 
 766 #if INCLUDE_RTM_OPT
 767   if (UseRTMForStackLocks && use_rtm) {
 768     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 769     Label L_regular_unlock;
 770     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 771     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 772     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 773     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 774     xend();                                                           // otherwise end...
 775     jmp(DONE_LABEL);                                                  // ... and we're done
 776     bind(L_regular_unlock);
 777   }
 778 #endif
 779 
 780   if (!UseHeavyMonitors) {
 781     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 782     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 783   }
 784   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 785   if (!UseHeavyMonitors) {
 786     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 787     jccb   (Assembler::zero, Stacked);
 788   }
 789 
 790   // It's inflated.
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmpb(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   get_thread (boxReg);
 824 
 825   // Note that we could employ various encoding schemes to reduce
 826   // the number of loads below (currently 4) to just 2 or 3.
 827   // Refer to the comments in synchronizer.cpp.
 828   // In practice the chain of fetches doesn't seem to impact performance, however.
 829   xorptr(boxReg, boxReg);
 830   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 831   jccb  (Assembler::notZero, DONE_LABEL);
 832   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 833   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 834   jccb  (Assembler::notZero, CheckSucc);
 835   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 836   jmpb  (DONE_LABEL);
 837 
 838   bind (Stacked);
 839   // It's not inflated and it's not recursively stack-locked.
 840   // It must be stack-locked.
 841   // Try to reset the header to displaced header.
 842   // The "box" value on the stack is stable, so we can reload
 843   // and be assured we observe the same value as above.
 844   movptr(tmpReg, Address(boxReg, 0));
 845   lock();
 846   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
 848 
 849   // DONE_LABEL is a hot target - we'd really like to place it at the
 850   // start of cache line by padding with NOPs.
 851   // See the AMD and Intel software optimization manuals for the
 852   // most efficient "long" NOP encodings.
 853   // Unfortunately none of our alignment mechanisms suffice.
 854   bind (CheckSucc);
 855 #else // _LP64
 856   // It's inflated
 857   Label LNotRecursive, LSuccess, LGoSlowPath;
 858 
 859   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 860   jccb(Assembler::equal, LNotRecursive);
 861 
 862   // Recursive inflated unlock
 863   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 864   jmpb(LSuccess);
 865 
 866   bind(LNotRecursive);
 867   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 868   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 869   jccb  (Assembler::notZero, CheckSucc);
 870   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 871   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 872   jmpb  (DONE_LABEL);
 873 
 874   // Try to avoid passing control into the slow_path ...
 875   bind  (CheckSucc);
 876 
 877   // The following optional optimization can be elided if necessary
 878   // Effectively: if (succ == null) goto slow path
 879   // The code reduces the window for a race, however,
 880   // and thus benefits performance.
 881   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 882   jccb  (Assembler::zero, LGoSlowPath);
 883 
 884   xorptr(boxReg, boxReg);
 885   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 886   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 887 
 888   // Memory barrier/fence
 889   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 890   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 891   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 892   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 893   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 894   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 895   lock(); addl(Address(rsp, 0), 0);
 896 
 897   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 898   jccb  (Assembler::notZero, LSuccess);
 899 
 900   // Rare inopportune interleaving - race.
 901   // The successor vanished in the small window above.
 902   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 903   // We need to ensure progress and succession.
 904   // Try to reacquire the lock.
 905   // If that fails then the new owner is responsible for succession and this
 906   // thread needs to take no further action and can exit via the fast path (success).
 907   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 911 
 912   // box is really RAX -- the following CMPXCHG depends on that binding
 913   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 914   lock();
 915   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 916   // There's no successor so we tried to regrab the lock.
 917   // If that didn't work, then another thread grabbed the
 918   // lock so we're done (and exit was a success).
 919   jccb  (Assembler::notEqual, LSuccess);
 920   // Intentional fall-through into slow path
 921 
 922   bind  (LGoSlowPath);
 923   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 924   jmpb  (DONE_LABEL);
 925 
 926   bind  (LSuccess);
 927   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 928   jmpb  (DONE_LABEL);
 929 
 930   if (!UseHeavyMonitors) {
 931     bind  (Stacked);
 932     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 933     lock();
 934     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 935   }
 936 #endif
 937   bind(DONE_LABEL);
 938 
 939   // ZFlag == 1 count in fast path
 940   // ZFlag == 0 count in slow path
 941   jccb(Assembler::notZero, NO_COUNT);
 942 
 943   bind(COUNT);
 944   // Count monitors in fast path
 945 #ifndef _LP64
 946   get_thread(tmpReg);
 947   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 948 #else // _LP64
 949   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 950 #endif
 951 
 952   xorl(tmpReg, tmpReg); // Set ZF == 1
 953 
 954   bind(NO_COUNT);
 955 }
 956 
 957 //-------------------------------------------------------------------------------------------
 958 // Generic instructions support for use in .ad files C2 code generation
 959 
 960 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 961   if (dst != src) {
 962     movdqu(dst, src);
 963   }
 964   if (opcode == Op_AbsVD) {
 965     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 966   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 968     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 969   }
 970 }
 971 
 972 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 973   if (opcode == Op_AbsVD) {
 974     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 975   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 977     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 978   }
 979 }
 980 
 981 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 982   if (dst != src) {
 983     movdqu(dst, src);
 984   }
 985   if (opcode == Op_AbsVF) {
 986     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 987   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 989     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 990   }
 991 }
 992 
 993 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 994   if (opcode == Op_AbsVF) {
 995     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 996   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 998     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 999   }
1000 }
1001 
1002 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1003   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1004   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1005 
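  // Note: there is no packed min/max for 64-bit integers before AVX-512, so the
  // T_LONG cases below emulate it with a signed compare (pcmpgtq) plus blendvpd,
  // which implicitly uses xmm0 as the blend mask - hence the tmp == xmm0 requirement.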
1006   if (opcode == Op_MinV) {
1007     if (elem_bt == T_BYTE) {
1008       pminsb(dst, src);
1009     } else if (elem_bt == T_SHORT) {
1010       pminsw(dst, src);
1011     } else if (elem_bt == T_INT) {
1012       pminsd(dst, src);
1013     } else {
1014       assert(elem_bt == T_LONG, "required");
1015       assert(tmp == xmm0, "required");
1016       assert_different_registers(dst, src, tmp);
1017       movdqu(xmm0, dst);
1018       pcmpgtq(xmm0, src);
1019       blendvpd(dst, src);  // xmm0 as mask
1020     }
1021   } else { // opcode == Op_MaxV
1022     if (elem_bt == T_BYTE) {
1023       pmaxsb(dst, src);
1024     } else if (elem_bt == T_SHORT) {
1025       pmaxsw(dst, src);
1026     } else if (elem_bt == T_INT) {
1027       pmaxsd(dst, src);
1028     } else {
1029       assert(elem_bt == T_LONG, "required");
1030       assert(tmp == xmm0, "required");
1031       assert_different_registers(dst, src, tmp);
1032       movdqu(xmm0, src);
1033       pcmpgtq(xmm0, dst);
1034       blendvpd(dst, src);  // xmm0 as mask
1035     }
1036   }
1037 }
1038 
1039 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1040                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1041                                  int vlen_enc) {
1042   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1043 
1044   if (opcode == Op_MinV) {
1045     if (elem_bt == T_BYTE) {
1046       vpminsb(dst, src1, src2, vlen_enc);
1047     } else if (elem_bt == T_SHORT) {
1048       vpminsw(dst, src1, src2, vlen_enc);
1049     } else if (elem_bt == T_INT) {
1050       vpminsd(dst, src1, src2, vlen_enc);
1051     } else {
1052       assert(elem_bt == T_LONG, "required");
1053       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1054         vpminsq(dst, src1, src2, vlen_enc);
1055       } else {
1056         assert_different_registers(dst, src1, src2);
1057         vpcmpgtq(dst, src1, src2, vlen_enc);
1058         vblendvpd(dst, src1, src2, dst, vlen_enc);
1059       }
1060     }
1061   } else { // opcode == Op_MaxV
1062     if (elem_bt == T_BYTE) {
1063       vpmaxsb(dst, src1, src2, vlen_enc);
1064     } else if (elem_bt == T_SHORT) {
1065       vpmaxsw(dst, src1, src2, vlen_enc);
1066     } else if (elem_bt == T_INT) {
1067       vpmaxsd(dst, src1, src2, vlen_enc);
1068     } else {
1069       assert(elem_bt == T_LONG, "required");
1070       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1071         vpmaxsq(dst, src1, src2, vlen_enc);
1072       } else {
1073         assert_different_registers(dst, src1, src2);
1074         vpcmpgtq(dst, src1, src2, vlen_enc);
1075         vblendvpd(dst, src2, src1, dst, vlen_enc);
1076       }
1077     }
1078   }
1079 }
1080 
1081 // Float/Double min max
1082 
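// The leading blends below route the inputs through atmp/btmp based on the sign
// of 'a' (for min) or 'b' (for max), so that vminps/vminpd and vmaxps/vmaxpd
// (which return their second operand for equal-valued or unordered inputs)
// produce the IEEE-754 answer for -0.0 vs +0.0; the final unordered compare
// then propagates a NaN input into dst.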
1083 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1084                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1085                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1086                                    int vlen_enc) {
1087   assert(UseAVX > 0, "required");
1088   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1089          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1090   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1091   assert_different_registers(a, b, tmp, atmp, btmp);
1092 
1093   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1094   bool is_double_word = is_double_word_type(elem_bt);
1095 
1096   if (!is_double_word && is_min) {
1097     vblendvps(atmp, a, b, a, vlen_enc);
1098     vblendvps(btmp, b, a, a, vlen_enc);
1099     vminps(tmp, atmp, btmp, vlen_enc);
1100     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1101     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1102   } else if (!is_double_word && !is_min) {
1103     vblendvps(btmp, b, a, b, vlen_enc);
1104     vblendvps(atmp, a, b, b, vlen_enc);
1105     vmaxps(tmp, atmp, btmp, vlen_enc);
1106     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1107     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1108   } else if (is_double_word && is_min) {
1109     vblendvpd(atmp, a, b, a, vlen_enc);
1110     vblendvpd(btmp, b, a, a, vlen_enc);
1111     vminpd(tmp, atmp, btmp, vlen_enc);
1112     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1113     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1114   } else {
1115     assert(is_double_word && !is_min, "sanity");
1116     vblendvpd(btmp, b, a, b, vlen_enc);
1117     vblendvpd(atmp, a, b, b, vlen_enc);
1118     vmaxpd(tmp, atmp, btmp, vlen_enc);
1119     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1120     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1121   }
1122 }
1123 
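// AVX-512 variant of the above: evpmovd2m/evpmovq2m extract the per-lane sign
// bits into a mask register, which then drives the operand-ordering blends and
// the final NaN merge instead of an xmm blend mask.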
1124 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1125                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1126                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1127                                     int vlen_enc) {
1128   assert(UseAVX > 2, "required");
1129   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1130          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1131   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1132   assert_different_registers(dst, a, b, atmp, btmp);
1133 
1134   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1135   bool is_double_word = is_double_word_type(elem_bt);
1136   bool merge = true;
1137 
1138   if (!is_double_word && is_min) {
1139     evpmovd2m(ktmp, a, vlen_enc);
1140     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1141     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1142     vminps(dst, atmp, btmp, vlen_enc);
1143     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1144     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1145   } else if (!is_double_word && !is_min) {
1146     evpmovd2m(ktmp, b, vlen_enc);
1147     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1148     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1149     vmaxps(dst, atmp, btmp, vlen_enc);
1150     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1151     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1152   } else if (is_double_word && is_min) {
1153     evpmovq2m(ktmp, a, vlen_enc);
1154     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1155     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1156     vminpd(dst, atmp, btmp, vlen_enc);
1157     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1158     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1159   } else {
1160     assert(is_double_word && !is_min, "sanity");
1161     evpmovq2m(ktmp, b, vlen_enc);
1162     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1163     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1164     vmaxpd(dst, atmp, btmp, vlen_enc);
1165     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1166     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1167   }
1168 }
1169 
1170 // Float/Double signum
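// dst holds the argument and receives the result: +/-0.0 and NaN are returned
// unchanged via the early exits, otherwise dst is loaded with 1.0 and its sign
// is flipped for a negative argument (the flags set by the compare against zero
// survive the movflt/movdbl, which do not modify EFLAGS).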
1171 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1172   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1173 
1174   Label DONE_LABEL;
1175 
1176   if (opcode == Op_SignumF) {
1177     assert(UseSSE > 0, "required");
1178     ucomiss(dst, zero);
1179     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1180     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1181     movflt(dst, one);
1182     jcc(Assembler::above, DONE_LABEL);
1183     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1184   } else if (opcode == Op_SignumD) {
1185     assert(UseSSE > 1, "required");
1186     ucomisd(dst, zero);
1187     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1188     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1189     movdbl(dst, one);
1190     jcc(Assembler::above, DONE_LABEL);
1191     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1192   }
1193 
1194   bind(DONE_LABEL);
1195 }
1196 
1197 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1198   if (sign) {
1199     pmovsxbw(dst, src);
1200   } else {
1201     pmovzxbw(dst, src);
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1206   if (sign) {
1207     vpmovsxbw(dst, src, vector_len);
1208   } else {
1209     vpmovzxbw(dst, src, vector_len);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1214   if (sign) {
1215     vpmovsxbd(dst, src, vector_len);
1216   } else {
1217     vpmovzxbd(dst, src, vector_len);
1218   }
1219 }
1220 
1221 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1222   if (sign) {
1223     vpmovsxwd(dst, src, vector_len);
1224   } else {
1225     vpmovzxwd(dst, src, vector_len);
1226   }
1227 }
1228 
1229 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1230                                      int shift, int vector_len) {
1231   if (opcode == Op_RotateLeftV) {
1232     if (etype == T_INT) {
1233       evprold(dst, src, shift, vector_len);
1234     } else {
1235       assert(etype == T_LONG, "expected type T_LONG");
1236       evprolq(dst, src, shift, vector_len);
1237     }
1238   } else {
1239     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1240     if (etype == T_INT) {
1241       evprord(dst, src, shift, vector_len);
1242     } else {
1243       assert(etype == T_LONG, "expected type T_LONG");
1244       evprorq(dst, src, shift, vector_len);
1245     }
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1250                                      XMMRegister shift, int vector_len) {
1251   if (opcode == Op_RotateLeftV) {
1252     if (etype == T_INT) {
1253       evprolvd(dst, src, shift, vector_len);
1254     } else {
1255       assert(etype == T_LONG, "expected type T_LONG");
1256       evprolvq(dst, src, shift, vector_len);
1257     }
1258   } else {
1259     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1260     if (etype == T_INT) {
1261       evprorvd(dst, src, shift, vector_len);
1262     } else {
1263       assert(etype == T_LONG, "expected type T_LONG");
1264       evprorvq(dst, src, shift, vector_len);
1265     }
1266   }
1267 }
1268 
1269 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1270   if (opcode == Op_RShiftVI) {
1271     psrad(dst, shift);
1272   } else if (opcode == Op_LShiftVI) {
1273     pslld(dst, shift);
1274   } else {
1275     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1276     psrld(dst, shift);
1277   }
1278 }
1279 
1280 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1281   switch (opcode) {
1282     case Op_RShiftVI:  psrad(dst, shift); break;
1283     case Op_LShiftVI:  pslld(dst, shift); break;
1284     case Op_URShiftVI: psrld(dst, shift); break;
1285 
1286     default: assert(false, "%s", NodeClassNames[opcode]);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1291   if (opcode == Op_RShiftVI) {
1292     vpsrad(dst, nds, shift, vector_len);
1293   } else if (opcode == Op_LShiftVI) {
1294     vpslld(dst, nds, shift, vector_len);
1295   } else {
1296     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1297     vpsrld(dst, nds, shift, vector_len);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1304     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1305     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1306 
1307     default: assert(false, "%s", NodeClassNames[opcode]);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1312   switch (opcode) {
1313     case Op_RShiftVB:  // fall-through
1314     case Op_RShiftVS:  psraw(dst, shift); break;
1315 
1316     case Op_LShiftVB:  // fall-through
1317     case Op_LShiftVS:  psllw(dst, shift);   break;
1318 
1319     case Op_URShiftVS: // fall-through
1320     case Op_URShiftVB: psrlw(dst, shift);  break;
1321 
1322     default: assert(false, "%s", NodeClassNames[opcode]);
1323   }
1324 }
1325 
1326 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1327   switch (opcode) {
1328     case Op_RShiftVB:  // fall-through
1329     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1330 
1331     case Op_LShiftVB:  // fall-through
1332     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1333 
1334     case Op_URShiftVS: // fall-through
1335     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1336 
1337     default: assert(false, "%s", NodeClassNames[opcode]);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1342   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1344     case Op_LShiftVL:  psllq(dst, shift); break;
1345     case Op_URShiftVL: psrlq(dst, shift); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1352   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1354   } else if (opcode == Op_LShiftVL) {
1355     psllq(dst, shift);
1356   } else {
1357     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1358     psrlq(dst, shift);
1359   }
1360 }
1361 
1362 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1363   switch (opcode) {
1364     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1365     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1366     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1367 
1368     default: assert(false, "%s", NodeClassNames[opcode]);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1373   if (opcode == Op_RShiftVL) {
1374     evpsraq(dst, nds, shift, vector_len);
1375   } else if (opcode == Op_LShiftVL) {
1376     vpsllq(dst, nds, shift, vector_len);
1377   } else {
1378     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1379     vpsrlq(dst, nds, shift, vector_len);
1380   }
1381 }
1382 
1383 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1384   switch (opcode) {
1385     case Op_RShiftVB:  // fall-through
1386     case Op_RShiftVS:  // fall-through
1387     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1388 
1389     case Op_LShiftVB:  // fall-through
1390     case Op_LShiftVS:  // fall-through
1391     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1392 
1393     case Op_URShiftVB: // fall-through
1394     case Op_URShiftVS: // fall-through
1395     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1402   switch (opcode) {
1403     case Op_RShiftVB:  // fall-through
1404     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1405 
1406     case Op_LShiftVB:  // fall-through
1407     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1408 
1409     case Op_URShiftVB: // fall-through
1410     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1411 
1412     default: assert(false, "%s", NodeClassNames[opcode]);
1413   }
1414 }
1415 
1416 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1417   assert(UseAVX >= 2, "required");
1418   switch (opcode) {
1419     case Op_RShiftVL: {
1420       if (UseAVX > 2) {
1421         assert(tmp == xnoreg, "not used");
1422         if (!VM_Version::supports_avx512vl()) {
1423           vlen_enc = Assembler::AVX_512bit;
1424         }
1425         evpsravq(dst, src, shift, vlen_enc);
1426       } else {
1427         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1428         vpsrlvq(dst, src, shift, vlen_enc);
1429         vpsrlvq(tmp, tmp, shift, vlen_enc);
1430         vpxor(dst, dst, tmp, vlen_enc);
1431         vpsubq(dst, dst, tmp, vlen_enc);
1432       }
1433       break;
1434     }
1435     case Op_LShiftVL: {
1436       assert(tmp == xnoreg, "not used");
1437       vpsllvq(dst, src, shift, vlen_enc);
1438       break;
1439     }
1440     case Op_URShiftVL: {
1441       assert(tmp == xnoreg, "not used");
1442       vpsrlvq(dst, src, shift, vlen_enc);
1443       break;
1444     }
1445     default: assert(false, "%s", NodeClassNames[opcode]);
1446   }
1447 }
1448 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1450 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1451   assert(opcode == Op_LShiftVB ||
1452          opcode == Op_RShiftVB ||
1453          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1454   bool sign = (opcode != Op_URShiftVB);
1455   assert(vector_len == 0, "required");
1456   vextendbd(sign, dst, src, 1);
1457   vpmovzxbd(vtmp, shift, 1);
1458   varshiftd(opcode, dst, dst, vtmp, 1);
1459   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1460   vextracti128_high(vtmp, dst);
1461   vpackusdw(dst, dst, vtmp, 0);
1462 }
1463 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1465 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1466   assert(opcode == Op_LShiftVB ||
1467          opcode == Op_RShiftVB ||
1468          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1469   bool sign = (opcode != Op_URShiftVB);
1470   int ext_vector_len = vector_len + 1;
1471   vextendbw(sign, dst, src, ext_vector_len);
1472   vpmovzxbw(vtmp, shift, ext_vector_len);
1473   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1474   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1475   if (vector_len == 0) {
1476     vextracti128_high(vtmp, dst);
1477     vpackuswb(dst, dst, vtmp, vector_len);
1478   } else {
1479     vextracti64x4_high(vtmp, dst);
1480     vpackuswb(dst, dst, vtmp, vector_len);
1481     vpermq(dst, dst, 0xD8, vector_len);
1482   }
1483 }
1484 
1485 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1486   switch(typ) {
1487     case T_BYTE:
1488       pinsrb(dst, val, idx);
1489       break;
1490     case T_SHORT:
1491       pinsrw(dst, val, idx);
1492       break;
1493     case T_INT:
1494       pinsrd(dst, val, idx);
1495       break;
1496     case T_LONG:
1497       pinsrq(dst, val, idx);
1498       break;
1499     default:
1500       assert(false,"Should not reach here.");
1501       break;
1502   }
1503 }
1504 
1505 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1506   switch(typ) {
1507     case T_BYTE:
1508       vpinsrb(dst, src, val, idx);
1509       break;
1510     case T_SHORT:
1511       vpinsrw(dst, src, val, idx);
1512       break;
1513     case T_INT:
1514       vpinsrd(dst, src, val, idx);
1515       break;
1516     case T_LONG:
1517       vpinsrq(dst, src, val, idx);
1518       break;
1519     default:
1520       assert(false,"Should not reach here.");
1521       break;
1522   }
1523 }
1524 
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526   switch(typ) {
1527     case T_INT:
1528       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529       break;
1530     case T_FLOAT:
1531       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532       break;
1533     case T_LONG:
1534       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535       break;
1536     case T_DOUBLE:
1537       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538       break;
1539     default:
1540       assert(false,"Should not reach here.");
1541       break;
1542   }
1543 }
1544 
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546   switch(typ) {
1547     case T_INT:
1548       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549       break;
1550     case T_FLOAT:
1551       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552       break;
1553     case T_LONG:
1554       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555       break;
1556     case T_DOUBLE:
1557       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558       break;
1559     default:
1560       assert(false,"Should not reach here.");
1561       break;
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566   switch(typ) {
1567     case T_INT:
1568       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569       break;
1570     case T_FLOAT:
1571       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572       break;
1573     case T_LONG:
1574       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575       break;
1576     case T_DOUBLE:
1577       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578       break;
1579     default:
1580       assert(false,"Should not reach here.");
1581       break;
1582   }
1583 }
1584 
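// Turn a vector of byte-sized booleans (0/1) in src into a vector mask in dst:
// each element becomes all-ones (0 - 1 == -1) or all-zeros, sign-extended to
// the width of elem_bt.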
1585 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1586   if (vlen_in_bytes <= 16) {
1587     pxor (dst, dst);
1588     psubb(dst, src);
1589     switch (elem_bt) {
1590       case T_BYTE:   /* nothing to do */ break;
1591       case T_SHORT:  pmovsxbw(dst, dst); break;
1592       case T_INT:    pmovsxbd(dst, dst); break;
1593       case T_FLOAT:  pmovsxbd(dst, dst); break;
1594       case T_LONG:   pmovsxbq(dst, dst); break;
1595       case T_DOUBLE: pmovsxbq(dst, dst); break;
1596 
1597       default: assert(false, "%s", type2name(elem_bt));
1598     }
1599   } else {
1600     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1601     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1602 
1603     vpxor (dst, dst, dst, vlen_enc);
1604     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1605 
1606     switch (elem_bt) {
1607       case T_BYTE:   /* nothing to do */            break;
1608       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1609       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1610       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1611       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1612       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1613 
1614       default: assert(false, "%s", type2name(elem_bt));
1615     }
1616   }
1617 }
1618 
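// Same as above, but the result is an opmask register. When the required
// AVX-512 VL/BW/DQ support is missing (novlbwdq), the booleans are widened to
// dwords and compared against the stub's mask-bit pattern instead of going
// through evpmovb2m.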
1619 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1620   if (novlbwdq) {
1621     vpmovsxbd(xtmp, src, vlen_enc);
1622     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1623             Assembler::eq, true, vlen_enc, noreg);
1624   } else {
1625     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1626     vpsubb(xtmp, xtmp, src, vlen_enc);
1627     evpmovb2m(dst, xtmp, vlen_enc);
1628   }
1629 }
1630 
1631 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1632   switch (vlen_in_bytes) {
1633     case 4:  movdl(dst, src);   break;
1634     case 8:  movq(dst, src);    break;
1635     case 16: movdqu(dst, src);  break;
1636     case 32: vmovdqu(dst, src); break;
1637     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1638     default: ShouldNotReachHere();
1639   }
1640 }
1641 
1642 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1643   assert(rscratch != noreg || always_reachable(src), "missing");
1644 
1645   if (reachable(src)) {
1646     load_vector(dst, as_Address(src), vlen_in_bytes);
1647   } else {
1648     lea(rscratch, src);
1649     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1650   }
1651 }
1652 
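// Broadcast a scalar constant at src to every element of dst, choosing the
// broadcast instruction based on the element type and the available CPU
// features.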
1653 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1654   int vlen_enc = vector_length_encoding(vlen);
1655   if (VM_Version::supports_avx()) {
1656     if (bt == T_LONG) {
1657       if (VM_Version::supports_avx2()) {
1658         vpbroadcastq(dst, src, vlen_enc);
1659       } else {
1660         vmovddup(dst, src, vlen_enc);
1661       }
1662     } else if (bt == T_DOUBLE) {
1663       if (vlen_enc != Assembler::AVX_128bit) {
1664         vbroadcastsd(dst, src, vlen_enc, noreg);
1665       } else {
1666         vmovddup(dst, src, vlen_enc);
1667       }
1668     } else {
1669       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1670         vpbroadcastd(dst, src, vlen_enc);
1671       } else {
1672         vbroadcastss(dst, src, vlen_enc);
1673       }
1674     }
1675   } else if (VM_Version::supports_sse3()) {
1676     movddup(dst, src);
1677   } else {
1678     movq(dst, src);
1679     if (vlen == 16) {
1680       punpcklqdq(dst, dst);
1681     }
1682   }
1683 }
1684 
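// Load the first vlen_in_bytes bytes of the iota index table (0, 1, 2, ...)
// into dst.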
1685 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes) {
1686   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1687   if (vlen_in_bytes <= 4) {
1688     movdl(dst, addr);
1689   } else if (vlen_in_bytes == 8) {
1690     movq(dst, addr);
1691   } else if (vlen_in_bytes == 16) {
1692     movdqu(dst, addr, noreg);
1693   } else if (vlen_in_bytes == 32) {
1694     vmovdqu(dst, addr, noreg);
1695   } else {
1696     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1697     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, noreg);
1698   }
1699 }
1700 
1701 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1702 
1703 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1704   int vector_len = Assembler::AVX_128bit;
1705 
1706   switch (opcode) {
1707     case Op_AndReductionV:  pand(dst, src); break;
1708     case Op_OrReductionV:   por (dst, src); break;
1709     case Op_XorReductionV:  pxor(dst, src); break;
1710     case Op_MinReductionV:
1711       switch (typ) {
1712         case T_BYTE:        pminsb(dst, src); break;
1713         case T_SHORT:       pminsw(dst, src); break;
1714         case T_INT:         pminsd(dst, src); break;
1715         case T_LONG:        assert(UseAVX > 2, "required");
1716                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1717         default:            assert(false, "wrong type");
1718       }
1719       break;
1720     case Op_MaxReductionV:
1721       switch (typ) {
1722         case T_BYTE:        pmaxsb(dst, src); break;
1723         case T_SHORT:       pmaxsw(dst, src); break;
1724         case T_INT:         pmaxsd(dst, src); break;
1725         case T_LONG:        assert(UseAVX > 2, "required");
1726                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1727         default:            assert(false, "wrong type");
1728       }
1729       break;
1730     case Op_AddReductionVF: addss(dst, src); break;
1731     case Op_AddReductionVD: addsd(dst, src); break;
1732     case Op_AddReductionVI:
1733       switch (typ) {
1734         case T_BYTE:        paddb(dst, src); break;
1735         case T_SHORT:       paddw(dst, src); break;
1736         case T_INT:         paddd(dst, src); break;
1737         default:            assert(false, "wrong type");
1738       }
1739       break;
1740     case Op_AddReductionVL: paddq(dst, src); break;
1741     case Op_MulReductionVF: mulss(dst, src); break;
1742     case Op_MulReductionVD: mulsd(dst, src); break;
1743     case Op_MulReductionVI:
1744       switch (typ) {
1745         case T_SHORT:       pmullw(dst, src); break;
1746         case T_INT:         pmulld(dst, src); break;
1747         default:            assert(false, "wrong type");
1748       }
1749       break;
1750     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1751                             vpmullq(dst, dst, src, vector_len); break;
1752     default:                assert(false, "wrong opcode");
1753   }
1754 }
1755 
1756 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1757   int vector_len = Assembler::AVX_256bit;
1758 
1759   switch (opcode) {
1760     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1761     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1762     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1763     case Op_MinReductionV:
1764       switch (typ) {
1765         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1766         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1767         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1768         case T_LONG:        assert(UseAVX > 2, "required");
1769                             vpminsq(dst, src1, src2, vector_len); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_MaxReductionV:
1774       switch (typ) {
1775         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1776         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1777         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1778         case T_LONG:        assert(UseAVX > 2, "required");
1779                             vpmaxsq(dst, src1, src2, vector_len); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_AddReductionVI:
1784       switch (typ) {
1785         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1786         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1787         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1792     case Op_MulReductionVI:
1793       switch (typ) {
1794         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1795         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1796         default:            assert(false, "wrong type");
1797       }
1798       break;
1799     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1800     default:                assert(false, "wrong opcode");
1801   }
1802 }
1803 
1804 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1805                                   XMMRegister dst, XMMRegister src,
1806                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1807   switch (opcode) {
1808     case Op_AddReductionVF:
1809     case Op_MulReductionVF:
1810       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1811       break;
1812 
1813     case Op_AddReductionVD:
1814     case Op_MulReductionVD:
1815       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1816       break;
1817 
1818     default: assert(false, "wrong opcode");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1823                              Register dst, Register src1, XMMRegister src2,
1824                              XMMRegister vtmp1, XMMRegister vtmp2) {
1825   switch (vlen) {
1826     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1827     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1828     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1829     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1830 
1831     default: assert(false, "wrong vector length");
1832   }
1833 }
1834 
1835 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1836                              Register dst, Register src1, XMMRegister src2,
1837                              XMMRegister vtmp1, XMMRegister vtmp2) {
1838   switch (vlen) {
1839     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1840     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1841     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1842     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1843 
1844     default: assert(false, "wrong vector length");
1845   }
1846 }
1847 
1848 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1849                              Register dst, Register src1, XMMRegister src2,
1850                              XMMRegister vtmp1, XMMRegister vtmp2) {
1851   switch (vlen) {
1852     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1854     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1855     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1856 
1857     default: assert(false, "wrong vector length");
1858   }
1859 }
1860 
1861 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1862                              Register dst, Register src1, XMMRegister src2,
1863                              XMMRegister vtmp1, XMMRegister vtmp2) {
1864   switch (vlen) {
1865     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1866     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1867     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1868     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869 
1870     default: assert(false, "wrong vector length");
1871   }
1872 }
1873 
1874 #ifdef _LP64
1875 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1876                              Register dst, Register src1, XMMRegister src2,
1877                              XMMRegister vtmp1, XMMRegister vtmp2) {
1878   switch (vlen) {
1879     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1880     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1881     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882 
1883     default: assert(false, "wrong vector length");
1884   }
1885 }
1886 #endif // _LP64
1887 
1888 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1889   switch (vlen) {
1890     case 2:
1891       assert(vtmp2 == xnoreg, "");
1892       reduce2F(opcode, dst, src, vtmp1);
1893       break;
1894     case 4:
1895       assert(vtmp2 == xnoreg, "");
1896       reduce4F(opcode, dst, src, vtmp1);
1897       break;
1898     case 8:
1899       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1900       break;
1901     case 16:
1902       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1903       break;
1904     default: assert(false, "wrong vector length");
1905   }
1906 }
1907 
1908 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (vlen) {
1910     case 2:
1911       assert(vtmp2 == xnoreg, "");
1912       reduce2D(opcode, dst, src, vtmp1);
1913       break;
1914     case 4:
1915       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1916       break;
1917     case 8:
1918       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1919       break;
1920     default: assert(false, "wrong vector length");
1921   }
1922 }
1923 
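// Combine the two int lanes of src2 (horizontal add for Op_AddReductionVI,
// shuffle + op otherwise), fold in the scalar src1, and leave the result in dst.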
1924 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1925   if (opcode == Op_AddReductionVI) {
1926     if (vtmp1 != src2) {
1927       movdqu(vtmp1, src2);
1928     }
1929     phaddd(vtmp1, vtmp1);
1930   } else {
1931     pshufd(vtmp1, src2, 0x1);
1932     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1933   }
1934   movdl(vtmp2, src1);
1935   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1936   movdl(dst, vtmp1);
1937 }
1938 
1939 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1940   if (opcode == Op_AddReductionVI) {
1941     if (vtmp1 != src2) {
1942       movdqu(vtmp1, src2);
1943     }
1944     phaddd(vtmp1, src2);
1945     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1946   } else {
1947     pshufd(vtmp2, src2, 0xE);
1948     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1949     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1950   }
1951 }
1952 
1953 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1954   if (opcode == Op_AddReductionVI) {
1955     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1956     vextracti128_high(vtmp2, vtmp1);
1957     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1958     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1959   } else {
1960     vextracti128_high(vtmp1, src2);
1961     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1962     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1963   }
1964 }
1965 
1966 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1967   vextracti64x4_high(vtmp2, src2);
1968   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1969   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1970 }
1971 
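// Byte reductions: fold src2 in halves down to a single byte, then sign-extend
// to int so the scalar src1 can be folded in; the result is sign-extended into dst.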
1972 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1973   pshufd(vtmp2, src2, 0x1);
1974   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1975   movdqu(vtmp1, vtmp2);
1976   psrldq(vtmp1, 2);
1977   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1978   movdqu(vtmp2, vtmp1);
1979   psrldq(vtmp2, 1);
1980   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1981   movdl(vtmp2, src1);
1982   pmovsxbd(vtmp1, vtmp1);
1983   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1984   pextrb(dst, vtmp1, 0x0);
1985   movsbl(dst, dst);
1986 }
1987 
1988 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1989   pshufd(vtmp1, src2, 0xE);
1990   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1991   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1992 }
1993 
1994 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   vextracti128_high(vtmp2, src2);
1996   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1997   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1998 }
1999 
2000 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2001   vextracti64x4_high(vtmp1, src2);
2002   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2003   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2004 }
2005 
2006 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2007   pmovsxbw(vtmp2, src2);
2008   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2009 }
2010 
2011 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2012   if (UseAVX > 1) {
2013     int vector_len = Assembler::AVX_256bit;
2014     vpmovsxbw(vtmp1, src2, vector_len);
2015     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2016   } else {
2017     pmovsxbw(vtmp2, src2);
2018     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);  // bring the upper 8 bytes down
    pmovsxbw(vtmp2, vtmp2);    // and widen them for the second pass
2021     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2022   }
2023 }
2024 
2025 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2026   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2027     int vector_len = Assembler::AVX_512bit;
2028     vpmovsxbw(vtmp1, src2, vector_len);
2029     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2030   } else {
2031     assert(UseAVX >= 2,"Should not reach here.");
2032     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2033     vextracti128_high(vtmp2, src2);
2034     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2035   }
2036 }
2037 
2038 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2040   vextracti64x4_high(vtmp2, src2);
2041   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2042 }
2043 
2044 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   if (opcode == Op_AddReductionVI) {
2046     if (vtmp1 != src2) {
2047       movdqu(vtmp1, src2);
2048     }
2049     phaddw(vtmp1, vtmp1);
2050     phaddw(vtmp1, vtmp1);
2051   } else {
2052     pshufd(vtmp2, src2, 0x1);
2053     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2054     movdqu(vtmp1, vtmp2);
2055     psrldq(vtmp1, 2);
2056     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2057   }
2058   movdl(vtmp2, src1);
2059   pmovsxwd(vtmp1, vtmp1);
2060   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2061   pextrw(dst, vtmp1, 0x0);
2062   movswl(dst, dst);
2063 }
2064 
2065 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   if (opcode == Op_AddReductionVI) {
2067     if (vtmp1 != src2) {
2068       movdqu(vtmp1, src2);
2069     }
2070     phaddw(vtmp1, src2);
2071   } else {
2072     pshufd(vtmp1, src2, 0xE);
2073     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2074   }
2075   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2076 }
2077 
2078 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2079   if (opcode == Op_AddReductionVI) {
2080     int vector_len = Assembler::AVX_256bit;
2081     vphaddw(vtmp2, src2, src2, vector_len);
2082     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2083   } else {
2084     vextracti128_high(vtmp2, src2);
2085     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2086   }
2087   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2088 }
2089 
2090 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   int vector_len = Assembler::AVX_256bit;
2092   vextracti64x4_high(vtmp1, src2);
2093   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2094   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2095 }
2096 
2097 #ifdef _LP64
2098 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   pshufd(vtmp2, src2, 0xE);
2100   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2101   movdq(vtmp1, src1);
2102   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2103   movdq(dst, vtmp1);
2104 }
2105 
2106 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   vextracti128_high(vtmp1, src2);
2108   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2109   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2110 }
2111 
2112 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2113   vextracti64x4_high(vtmp2, src2);
2114   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2115   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2116 }
2117 
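// Build a mask in dst with the low 'len' bits set: temp = -1, then BZHI clears
// every bit at position >= len.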
2118 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2119   mov64(temp, -1L);
2120   bzhiq(temp, temp, len);
2121   kmovql(dst, temp);
2122 }
2123 #endif // _LP64
2124 
2125 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2126   reduce_operation_128(T_FLOAT, opcode, dst, src);
2127   pshufd(vtmp, src, 0x1);
2128   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2129 }
2130 
2131 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2132   reduce2F(opcode, dst, src, vtmp);
2133   pshufd(vtmp, src, 0x2);
2134   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2135   pshufd(vtmp, src, 0x3);
2136   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2137 }
2138 
2139 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   reduce4F(opcode, dst, src, vtmp2);
2141   vextractf128_high(vtmp2, src);
2142   reduce4F(opcode, dst, vtmp2, vtmp1);
2143 }
2144 
2145 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2146   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2147   vextracti64x4_high(vtmp1, src);
2148   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2149 }
2150 
2151 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2152   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2153   pshufd(vtmp, src, 0xE);
2154   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2155 }
2156 
2157 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2158   reduce2D(opcode, dst, src, vtmp2);
2159   vextractf128_high(vtmp2, src);
2160   reduce2D(opcode, dst, vtmp2, vtmp1);
2161 }
2162 
2163 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2165   vextracti64x4_high(vtmp1, src);
2166   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2167 }
2168 
2169 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2170   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2171 }
2172 
2173 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2174   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2175 }
2176 
2177 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2178                                  int vec_enc) {
2179   switch(elem_bt) {
2180     case T_INT:
2181     case T_FLOAT:
2182       vmaskmovps(dst, src, mask, vec_enc);
2183       break;
2184     case T_LONG:
2185     case T_DOUBLE:
2186       vmaskmovpd(dst, src, mask, vec_enc);
2187       break;
2188     default:
2189       fatal("Unsupported type %s", type2name(elem_bt));
2190       break;
2191   }
2192 }
2193 
2194 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2195                                  int vec_enc) {
2196   switch(elem_bt) {
2197     case T_INT:
2198     case T_FLOAT:
2199       vmaskmovps(dst, src, mask, vec_enc);
2200       break;
2201     case T_LONG:
2202     case T_DOUBLE:
2203       vmaskmovpd(dst, src, mask, vec_enc);
2204       break;
2205     default:
2206       fatal("Unsupported type %s", type2name(elem_bt));
2207       break;
2208   }
2209 }
2210 
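// Min/max reduction over float lanes: log2(vlen) folding steps, each bringing
// the upper half (or a permuted neighbor) of the working vector alongside and
// combining it via vminmax_fp. If is_dst_valid, the incoming value in dst is
// folded into the result as a final step.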
2211 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2212                                           XMMRegister dst, XMMRegister src,
2213                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2214                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2215   int permconst[] = {1, 14};
2216   XMMRegister wsrc = src;
2217   XMMRegister wdst = xmm_0;
2218   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2219 
2220   int vlen_enc = Assembler::AVX_128bit;
2221   if (vlen == 16) {
2222     vlen_enc = Assembler::AVX_256bit;
2223   }
2224 
2225   for (int i = log2(vlen) - 1; i >=0; i--) {
2226     if (i == 0 && !is_dst_valid) {
2227       wdst = dst;
2228     }
2229     if (i == 3) {
2230       vextracti64x4_high(wtmp, wsrc);
2231     } else if (i == 2) {
2232       vextracti128_high(wtmp, wsrc);
2233     } else { // i = [0,1]
2234       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2235     }
2236     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2237     wsrc = wdst;
2238     vlen_enc = Assembler::AVX_128bit;
2239   }
2240   if (is_dst_valid) {
2241     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2242   }
2243 }
2244 
2245 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2246                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2247                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2248   XMMRegister wsrc = src;
2249   XMMRegister wdst = xmm_0;
2250   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2251   int vlen_enc = Assembler::AVX_128bit;
2252   if (vlen == 8) {
2253     vlen_enc = Assembler::AVX_256bit;
2254   }
2255   for (int i = log2(vlen) - 1; i >=0; i--) {
2256     if (i == 0 && !is_dst_valid) {
2257       wdst = dst;
2258     }
2259     if (i == 1) {
2260       vextracti128_high(wtmp, wsrc);
2261     } else if (i == 2) {
2262       vextracti64x4_high(wtmp, wsrc);
2263     } else {
2264       assert(i == 0, "%d", i);
2265       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2266     }
2267     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2268     wsrc = wdst;
2269     vlen_enc = Assembler::AVX_128bit;
2270   }
2271   if (is_dst_valid) {
2272     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2273   }
2274 }
2275 
2276 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2277   switch (bt) {
2278     case T_BYTE:  pextrb(dst, src, idx); break;
2279     case T_SHORT: pextrw(dst, src, idx); break;
2280     case T_INT:   pextrd(dst, src, idx); break;
2281     case T_LONG:  pextrq(dst, src, idx); break;
2282 
2283     default:
2284       assert(false,"Should not reach here.");
2285       break;
2286   }
2287 }
2288 
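// Return the register holding the 128-bit lane that contains elemindex:
// lane 0 is src itself, higher lanes are extracted into dst first.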
2289 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2290   int esize =  type2aelembytes(typ);
2291   int elem_per_lane = 16/esize;
2292   int lane = elemindex / elem_per_lane;
2293   int eindex = elemindex % elem_per_lane;
2294 
2295   if (lane >= 2) {
2296     assert(UseAVX > 2, "required");
2297     vextractf32x4(dst, src, lane & 3);
2298     return dst;
2299   } else if (lane > 0) {
2300     assert(UseAVX > 0, "required");
2301     vextractf128(dst, src, lane);
2302     return dst;
2303   } else {
2304     return src;
2305   }
2306 }
2307 
2308 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2309   int esize =  type2aelembytes(typ);
2310   int elem_per_lane = 16/esize;
2311   int eindex = elemindex % elem_per_lane;
2312   assert(is_integral_type(typ),"required");
2313 
2314   if (eindex == 0) {
2315     if (typ == T_LONG) {
2316       movq(dst, src);
2317     } else {
2318       movdl(dst, src);
2319       if (typ == T_BYTE)
2320         movsbl(dst, dst);
2321       else if (typ == T_SHORT)
2322         movswl(dst, dst);
2323     }
2324   } else {
2325     extract(typ, dst, src, eindex);
2326   }
2327 }
2328 
2329 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2330   int esize =  type2aelembytes(typ);
2331   int elem_per_lane = 16/esize;
2332   int eindex = elemindex % elem_per_lane;
2333   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2334 
2335   if (eindex == 0) {
2336     movq(dst, src);
2337   } else {
2338     if (typ == T_FLOAT) {
2339       if (UseAVX == 0) {
2340         movdqu(dst, src);
2341         shufps(dst, dst, eindex);
2342       } else {
2343         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2344       }
2345     } else {
2346       if (UseAVX == 0) {
2347         movdqu(dst, src);
2348         psrldq(dst, eindex*esize);
2349       } else {
2350         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2351       }
2352       movq(dst, dst);
2353     }
2354   }
2355   // Zero upper bits
2356   if (typ == T_FLOAT) {
2357     if (UseAVX == 0) {
2358       assert(vtmp != xnoreg, "required.");
2359       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2360       pand(dst, vtmp);
2361     } else {
2362       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2363     }
2364   }
2365 }
2366 
2367 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2368   switch(typ) {
2369     case T_BYTE:
2370     case T_BOOLEAN:
2371       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2372       break;
2373     case T_SHORT:
2374     case T_CHAR:
2375       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2376       break;
2377     case T_INT:
2378     case T_FLOAT:
2379       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2380       break;
2381     case T_LONG:
2382     case T_DOUBLE:
2383       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2384       break;
2385     default:
2386       assert(false,"Should not reach here.");
2387       break;
2388   }
2389 }
2390 
2391 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2392   assert(rscratch != noreg || always_reachable(src2), "missing");
2393 
2394   switch(typ) {
2395     case T_BOOLEAN:
2396     case T_BYTE:
2397       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2398       break;
2399     case T_CHAR:
2400     case T_SHORT:
2401       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2402       break;
2403     case T_INT:
2404     case T_FLOAT:
2405       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2406       break;
2407     case T_LONG:
2408     case T_DOUBLE:
2409       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2410       break;
2411     default:
2412       assert(false,"Should not reach here.");
2413       break;
2414   }
2415 }
2416 
2417 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2418   switch(typ) {
2419     case T_BYTE:
2420       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2421       break;
2422     case T_SHORT:
2423       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2424       break;
2425     case T_INT:
2426     case T_FLOAT:
2427       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2428       break;
2429     case T_LONG:
2430     case T_DOUBLE:
2431       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2432       break;
2433     default:
2434       assert(false,"Should not reach here.");
2435       break;
2436   }
2437 }
2438 
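// Vector test for BoolTest::ne/overflow. Vectors shorter than 16 bytes have
// their low elements replicated across the 128-bit lane so stale upper bits
// cannot affect ptest; 512-bit vectors are compared with evpcmpeqb and tested
// through an opmask register.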
2439 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2440                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2441   switch(vlen) {
2442     case 4:
2443       assert(vtmp1 != xnoreg, "required.");
2444       // Broadcast lower 32 bits to 128 bits before ptest
2445       pshufd(vtmp1, src1, 0x0);
2446       if (bt == BoolTest::overflow) {
2447         assert(vtmp2 != xnoreg, "required.");
2448         pshufd(vtmp2, src2, 0x0);
2449       } else {
2450         assert(vtmp2 == xnoreg, "required.");
2451         vtmp2 = src2;
2452       }
2453       ptest(vtmp1, vtmp2);
2454      break;
2455     case 8:
2456       assert(vtmp1 != xnoreg, "required.");
2457       // Broadcast lower 64 bits to 128 bits before ptest
2458       pshufd(vtmp1, src1, 0x4);
2459       if (bt == BoolTest::overflow) {
2460         assert(vtmp2 != xnoreg, "required.");
2461         pshufd(vtmp2, src2, 0x4);
2462       } else {
2463         assert(vtmp2 == xnoreg, "required.");
2464         vtmp2 = src2;
2465       }
2466       ptest(vtmp1, vtmp2);
2467      break;
2468     case 16:
2469       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2470       ptest(src1, src2);
2471       break;
2472     case 32:
2473       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2474       vptest(src1, src2, Assembler::AVX_256bit);
2475       break;
2476     case 64:
2477       {
2478         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2479         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2480         if (bt == BoolTest::ne) {
2481           ktestql(mask, mask);
2482         } else {
2483           assert(bt == BoolTest::overflow, "required");
2484           kortestql(mask, mask);
2485         }
2486       }
2487       break;
2488     default:
2489       assert(false,"Should not reach here.");
2490       break;
2491   }
2492 }
2493 
2494 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2495   assert(UseAVX >= 2, "required");
2496 #ifdef ASSERT
2497   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2498   bool is_bw_supported = VM_Version::supports_avx512bw();
2499   if (is_bw && !is_bw_supported) {
2500     assert(vlen_enc != Assembler::AVX_512bit, "required");
2501     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2502            "XMM register should be 0-15");
2503   }
2504 #endif // ASSERT
2505   switch (elem_bt) {
2506     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2507     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2508     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2509     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2510     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2511     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2512     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2513   }
2514 }
2515 
2516 #ifdef _LP64
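// Broadcast the scalar in GPR src to every element of dst. With AVX-512 (and
// the required BW/VL support) the GPR-source EVEX broadcasts are used directly;
// otherwise the value is first moved into an XMM register and broadcast from there.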
2517 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2518   assert(UseAVX >= 2, "required");
2519   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2520   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2521   if ((UseAVX > 2) &&
2522       (!is_bw || VM_Version::supports_avx512bw()) &&
2523       (!is_vl || VM_Version::supports_avx512vl())) {
2524     switch (elem_bt) {
2525       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2526       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2527       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2528       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2529       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2530     }
2531   } else {
2532     assert(vlen_enc != Assembler::AVX_512bit, "required");
2533     assert((dst->encoding() < 16),"XMM register should be 0-15");
2534     switch (elem_bt) {
2535       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2536       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2537       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2538       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2539       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2540       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2541       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2542     }
2543   }
2544 }
2545 #endif
2546 
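// Sign-extend byte vector elements to the requested element type, converting
// through int for T_FLOAT/T_DOUBLE.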
2547 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2548   switch (to_elem_bt) {
2549     case T_SHORT:
2550       vpmovsxbw(dst, src, vlen_enc);
2551       break;
2552     case T_INT:
2553       vpmovsxbd(dst, src, vlen_enc);
2554       break;
2555     case T_FLOAT:
2556       vpmovsxbd(dst, src, vlen_enc);
2557       vcvtdq2ps(dst, dst, vlen_enc);
2558       break;
2559     case T_LONG:
2560       vpmovsxbq(dst, src, vlen_enc);
2561       break;
2562     case T_DOUBLE: {
2563       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2564       vpmovsxbd(dst, src, mid_vlen_enc);
2565       vcvtdq2pd(dst, dst, vlen_enc);
2566       break;
2567     }
2568     default:
2569       fatal("Unsupported type %s", type2name(to_elem_bt));
2570       break;
2571   }
2572 }
2573 
2574 //-------------------------------------------------------------------------------------------
2575 
2576 // IndexOf for constant substrings with size >= 8 chars
2577 // which don't need to be loaded through stack.
2578 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2579                                          Register cnt1, Register cnt2,
2580                                          int int_cnt2,  Register result,
2581                                          XMMRegister vec, Register tmp,
2582                                          int ae) {
2583   ShortBranchVerifier sbv(this);
2584   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2585   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2586 
2587   // This method uses the pcmpestri instruction with bound registers
2588   //   inputs:
2589   //     xmm - substring
2590   //     rax - substring length (elements count)
2591   //     mem - scanned string
2592   //     rdx - string length (elements count)
2593   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2594   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2595   //   outputs:
2596   //     rcx - matched index in string
2597   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2598   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2599   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2600   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2601   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2602 
2603   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2604         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2605         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2606 
2607   // Note, inline_string_indexOf() generates checks:
2608   // if (substr.count > string.count) return -1;
2609   // if (substr.count == 0) return 0;
2610   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2611 
2612   // Load substring.
2613   if (ae == StrIntrinsicNode::UL) {
2614     pmovzxbw(vec, Address(str2, 0));
2615   } else {
2616     movdqu(vec, Address(str2, 0));
2617   }
2618   movl(cnt2, int_cnt2);
2619   movptr(result, str1); // string addr
2620 
2621   if (int_cnt2 > stride) {
2622     jmpb(SCAN_TO_SUBSTR);
2623 
2624     // Reload substr for rescan, this code
2625     // is executed only for large substrings (> 8 chars)
2626     bind(RELOAD_SUBSTR);
2627     if (ae == StrIntrinsicNode::UL) {
2628       pmovzxbw(vec, Address(str2, 0));
2629     } else {
2630       movdqu(vec, Address(str2, 0));
2631     }
2632     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2633 
2634     bind(RELOAD_STR);
2635     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
2637     // again. Start from the next element after the previous match.
2638 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2641     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2642     subl(cnt1, cnt2);
2643     addl(cnt1, int_cnt2);
2644     movl(cnt2, int_cnt2); // Now restore cnt2
2645 
2646     decrementl(cnt1);     // Shift to next element
2647     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2649 
2650     addptr(result, (1<<scale1));
2651 
2652   } // (int_cnt2 > 8)
2653 
2654   // Scan string for start of substr in 16-byte vectors
2655   bind(SCAN_TO_SUBSTR);
2656   pcmpestri(vec, Address(result, 0), mode);
2657   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2658   subl(cnt1, stride);
2659   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2660   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2662   addptr(result, 16);
2663   jmpb(SCAN_TO_SUBSTR);
2664 
2665   // Found a potential substr
2666   bind(FOUND_CANDIDATE);
2667   // Matched whole vector if first element matched (tmp(rcx) == 0).
2668   if (int_cnt2 == stride) {
2669     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2670   } else { // int_cnt2 > 8
2671     jccb(Assembler::overflow, FOUND_SUBSTR);
2672   }
2673   // After pcmpestri tmp(rcx) contains matched element index
2674   // Compute start addr of substr
2675   lea(result, Address(result, tmp, scale1));
2676 
2677   // Make sure string is still long enough
2678   subl(cnt1, tmp);
2679   cmpl(cnt1, cnt2);
2680   if (int_cnt2 == stride) {
2681     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2682   } else { // int_cnt2 > 8
2683     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2684   }
2685   // Left less than substring.
2686 
2687   bind(RET_NOT_FOUND);
2688   movl(result, -1);
2689   jmp(EXIT);
2690 
2691   if (int_cnt2 > stride) {
2692     // This code is optimized for the case when whole substring
2693     // is matched if its head is matched.
2694     bind(MATCH_SUBSTR_HEAD);
2695     pcmpestri(vec, Address(result, 0), mode);
2696     // Reload only the string if it does not match
2697     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2698 
2699     Label CONT_SCAN_SUBSTR;
2700     // Compare the rest of substring (> 8 chars).
2701     bind(FOUND_SUBSTR);
2702     // First 8 chars are already matched.
2703     negptr(cnt2);
2704     addptr(cnt2, stride);
2705 
2706     bind(SCAN_SUBSTR);
2707     subl(cnt1, stride);
2708     cmpl(cnt2, -stride); // Do not read beyond substring
2709     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2710     // Back-up strings to avoid reading beyond substring:
2711     // cnt1 = cnt1 - cnt2 + 8
2712     addl(cnt1, cnt2); // cnt2 is negative
2713     addl(cnt1, stride);
2714     movl(cnt2, stride); negptr(cnt2);
2715     bind(CONT_SCAN_SUBSTR);
2716     if (int_cnt2 < (int)G) {
2717       int tail_off1 = int_cnt2<<scale1;
2718       int tail_off2 = int_cnt2<<scale2;
2719       if (ae == StrIntrinsicNode::UL) {
2720         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2721       } else {
2722         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2723       }
2724       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2725     } else {
2726       // calculate index in register to avoid integer overflow (int_cnt2*2)
2727       movl(tmp, int_cnt2);
2728       addptr(tmp, cnt2);
2729       if (ae == StrIntrinsicNode::UL) {
2730         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2731       } else {
2732         movdqu(vec, Address(str2, tmp, scale2, 0));
2733       }
2734       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2735     }
2736     // Need to reload the string pointers if the whole vector did not match
2737     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2738     addptr(cnt2, stride);
2739     jcc(Assembler::negative, SCAN_SUBSTR);
2740     // Fall through if found full substring
2741 
2742   } // (int_cnt2 > 8)
2743 
2744   bind(RET_FOUND);
2745   // Found result if we matched full small substring.
2746   // Compute substr offset
2747   subptr(result, str1);
2748   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2749     shrl(result, 1); // index
2750   }
2751   bind(EXIT);
2752 
2753 } // string_indexofC8
2754 
2755 // Small strings are loaded through the stack if they cross a page boundary.
2756 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2757                                        Register cnt1, Register cnt2,
2758                                        int int_cnt2,  Register result,
2759                                        XMMRegister vec, Register tmp,
2760                                        int ae) {
2761   ShortBranchVerifier sbv(this);
2762   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2763   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2764 
2765   //
2766   // int_cnt2 is the length of a small (< 8 chars) constant substring
2767   // or (-1) for a non-constant substring, in which case its length
2768   // is in the cnt2 register.
2769   //
2770   // Note, inline_string_indexOf() generates checks:
2771   // if (substr.count > string.count) return -1;
2772   // if (substr.count == 0) return 0;
2773   //
2774   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2775   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2776   // This method uses the pcmpestri instruction with bound registers
2777   //   inputs:
2778   //     xmm - substring
2779   //     rax - substring length (elements count)
2780   //     mem - scanned string
2781   //     rdx - string length (elements count)
2782   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2783   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2784   //   outputs:
2785   //     rcx - matched index in string
2786   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2787   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2788   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2789   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2790 
2791   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2792         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2793         FOUND_CANDIDATE;
2794 
2795   { //========================================================
2796     // We don't know where these strings are located
2797     // and we can't read beyond them. Load them through the stack.
2798     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2799 
2800     movptr(tmp, rsp); // save old SP
2801 
2802     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2803       if (int_cnt2 == (1>>scale2)) { // One byte
2804         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2805         load_unsigned_byte(result, Address(str2, 0));
2806         movdl(vec, result); // move 32 bits
2807       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2808         // Not enough header space in 32-bit VM: 12+3 = 15.
2809         movl(result, Address(str2, -1));
2810         shrl(result, 8);
2811         movdl(vec, result); // move 32 bits
2812       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2813         load_unsigned_short(result, Address(str2, 0));
2814         movdl(vec, result); // move 32 bits
2815       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2816         movdl(vec, Address(str2, 0)); // move 32 bits
2817       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2818         movq(vec, Address(str2, 0));  // move 64 bits
2819       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2820         // Array header size is 12 bytes in 32-bit VM
2821         // + 6 bytes for 3 chars == 18 bytes,
2822         // enough space to load vec and shift.
2823         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2824         if (ae == StrIntrinsicNode::UL) {
2825           int tail_off = int_cnt2-8;
2826           pmovzxbw(vec, Address(str2, tail_off));
2827           psrldq(vec, -2*tail_off);
2828         }
2829         else {
2830           int tail_off = int_cnt2*(1<<scale2);
2831           movdqu(vec, Address(str2, tail_off-16));
2832           psrldq(vec, 16-tail_off);
2833         }
2834       }
2835     } else { // not constant substring
2836       cmpl(cnt2, stride);
2837       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2838 
2839       // We can read beyond the string if str+16 does not cross a page boundary
2840       // since heaps are aligned and mapped by pages.
2841       assert(os::vm_page_size() < (int)G, "default page should be small");
2842       movl(result, str2); // We need only low 32 bits
2843       andl(result, (os::vm_page_size()-1));
2844       cmpl(result, (os::vm_page_size()-16));
2845       jccb(Assembler::belowEqual, CHECK_STR);
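      // A worked example of the check above (assuming a 4096-byte page):
      // (str2 & 0xFFF) <= 0xFF0 means str2's page offset is at most
      // page_size - 16, so the full 16-byte vector load from str2 stays
      // within str2's page and cannot fault, even if it reads past the
      // end of the substring. Otherwise the substring is copied to the
      // stack below.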
2846 
2847       // Move small strings to the stack to allow loading 16 bytes into vec.
2848       subptr(rsp, 16);
2849       int stk_offset = wordSize-(1<<scale2);
2850       push(cnt2);
2851 
2852       bind(COPY_SUBSTR);
2853       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2854         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2855         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2856       } else if (ae == StrIntrinsicNode::UU) {
2857         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2858         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2859       }
2860       decrement(cnt2);
2861       jccb(Assembler::notZero, COPY_SUBSTR);
2862 
2863       pop(cnt2);
2864       movptr(str2, rsp);  // New substring address
2865     } // non constant
2866 
2867     bind(CHECK_STR);
2868     cmpl(cnt1, stride);
2869     jccb(Assembler::aboveEqual, BIG_STRINGS);
2870 
2871     // Check cross page boundary.
2872     movl(result, str1); // We need only low 32 bits
2873     andl(result, (os::vm_page_size()-1));
2874     cmpl(result, (os::vm_page_size()-16));
2875     jccb(Assembler::belowEqual, BIG_STRINGS);
2876 
2877     subptr(rsp, 16);
2878     int stk_offset = -(1<<scale1);
2879     if (int_cnt2 < 0) { // not constant
2880       push(cnt2);
2881       stk_offset += wordSize;
2882     }
2883     movl(cnt2, cnt1);
2884 
2885     bind(COPY_STR);
2886     if (ae == StrIntrinsicNode::LL) {
2887       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2888       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2889     } else {
2890       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2891       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2892     }
2893     decrement(cnt2);
2894     jccb(Assembler::notZero, COPY_STR);
2895 
2896     if (int_cnt2 < 0) { // not constant
2897       pop(cnt2);
2898     }
2899     movptr(str1, rsp);  // New string address
2900 
2901     bind(BIG_STRINGS);
2902     // Load substring.
2903     if (int_cnt2 < 0) { // -1
2904       if (ae == StrIntrinsicNode::UL) {
2905         pmovzxbw(vec, Address(str2, 0));
2906       } else {
2907         movdqu(vec, Address(str2, 0));
2908       }
2909       push(cnt2);       // substr count
2910       push(str2);       // substr addr
2911       push(str1);       // string addr
2912     } else {
2913       // Small (< 8 chars) constant substrings are loaded already.
2914       movl(cnt2, int_cnt2);
2915     }
2916     push(tmp);  // original SP
2917 
2918   } // Finished loading
2919 
2920   //========================================================
2921   // Start search
2922   //
2923 
2924   movptr(result, str1); // string addr
2925 
2926   if (int_cnt2 < 0) {  // Only for a non-constant substring
2927     jmpb(SCAN_TO_SUBSTR);
2928 
2929     // SP saved at sp+0
2930     // String saved at sp+1*wordSize
2931     // Substr saved at sp+2*wordSize
2932     // Substr count saved at sp+3*wordSize
2933 
2934     // Reload substr for rescan; this code
2935     // is executed only for large substrings (> 8 chars).
2936     bind(RELOAD_SUBSTR);
2937     movptr(str2, Address(rsp, 2*wordSize));
2938     movl(cnt2, Address(rsp, 3*wordSize));
2939     if (ae == StrIntrinsicNode::UL) {
2940       pmovzxbw(vec, Address(str2, 0));
2941     } else {
2942       movdqu(vec, Address(str2, 0));
2943     }
2944     // We came here after the beginning of the substring was
2945     // matched but the rest of it was not, so we need to search
2946     // again. Start from the next element after the previous match.
2947     subptr(str1, result); // Restore counter
2948     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2949       shrl(str1, 1);
2950     }
2951     addl(cnt1, str1);
2952     decrementl(cnt1);   // Shift to next element
2953     cmpl(cnt1, cnt2);
2954     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2955 
2956     addptr(result, (1<<scale1));
2957   } // non constant
2958 
2959   // Scan string for start of substr in 16-byte vectors
2960   bind(SCAN_TO_SUBSTR);
2961   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2962   pcmpestri(vec, Address(result, 0), mode);
2963   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2964   subl(cnt1, stride);
2965   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2966   cmpl(cnt1, cnt2);
2967   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2968   addptr(result, 16);
2969 
2970   bind(ADJUST_STR);
2971   cmpl(cnt1, stride); // Do not read beyond string
2972   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2973   // Back-up string to avoid reading beyond string.
2974   lea(result, Address(result, cnt1, scale1, -16));
2975   movl(cnt1, stride);
2976   jmpb(SCAN_TO_SUBSTR);
2977 
2978   // Found a potential substr
2979   bind(FOUND_CANDIDATE);
2980   // After pcmpestri tmp(rcx) contains matched element index
2981 
2982   // Make sure string is still long enough
2983   subl(cnt1, tmp);
2984   cmpl(cnt1, cnt2);
2985   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2986   // Left less than substring.
2987 
2988   bind(RET_NOT_FOUND);
2989   movl(result, -1);
2990   jmp(CLEANUP);
2991 
2992   bind(FOUND_SUBSTR);
2993   // Compute start addr of substr
2994   lea(result, Address(result, tmp, scale1));
2995   if (int_cnt2 > 0) { // Constant substring
2996     // Repeat search for small substring (< 8 chars)
2997     // from new point without reloading substring.
2998     // Have to check that we don't read beyond string.
2999     cmpl(tmp, stride-int_cnt2);
3000     jccb(Assembler::greater, ADJUST_STR);
3001     // Fall through if matched whole substring.
3002   } else { // non constant
3003     assert(int_cnt2 == -1, "should be != 0");
3004 
3005     addl(tmp, cnt2);
3006     // Found result if we matched whole substring.
3007     cmpl(tmp, stride);
3008     jcc(Assembler::lessEqual, RET_FOUND);
3009 
3010     // Repeat search for small substring (<= 8 chars)
3011     // from new point 'str1' without reloading substring.
3012     cmpl(cnt2, stride);
3013     // Have to check that we don't read beyond string.
3014     jccb(Assembler::lessEqual, ADJUST_STR);
3015 
3016     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3017     // Compare the rest of substring (> 8 chars).
3018     movptr(str1, result);
3019 
3020     cmpl(tmp, cnt2);
3021     // First 8 chars are already matched.
3022     jccb(Assembler::equal, CHECK_NEXT);
3023 
3024     bind(SCAN_SUBSTR);
3025     pcmpestri(vec, Address(str1, 0), mode);
3026     // Need to reload the string pointers if the whole vector did not match
3027     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3028 
3029     bind(CHECK_NEXT);
3030     subl(cnt2, stride);
3031     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3032     addptr(str1, 16);
3033     if (ae == StrIntrinsicNode::UL) {
3034       addptr(str2, 8);
3035     } else {
3036       addptr(str2, 16);
3037     }
3038     subl(cnt1, stride);
3039     cmpl(cnt2, stride); // Do not read beyond substring
3040     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3041     // Back-up strings to avoid reading beyond substring.
3042 
3043     if (ae == StrIntrinsicNode::UL) {
3044       lea(str2, Address(str2, cnt2, scale2, -8));
3045       lea(str1, Address(str1, cnt2, scale1, -16));
3046     } else {
3047       lea(str2, Address(str2, cnt2, scale2, -16));
3048       lea(str1, Address(str1, cnt2, scale1, -16));
3049     }
3050     subl(cnt1, cnt2);
3051     movl(cnt2, stride);
3052     addl(cnt1, stride);
3053     bind(CONT_SCAN_SUBSTR);
3054     if (ae == StrIntrinsicNode::UL) {
3055       pmovzxbw(vec, Address(str2, 0));
3056     } else {
3057       movdqu(vec, Address(str2, 0));
3058     }
3059     jmp(SCAN_SUBSTR);
3060 
3061     bind(RET_FOUND_LONG);
3062     movptr(str1, Address(rsp, wordSize));
3063   } // non constant
3064 
3065   bind(RET_FOUND);
3066   // Compute substr offset
3067   subptr(result, str1);
3068   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3069     shrl(result, 1); // index
3070   }
3071   bind(CLEANUP);
3072   pop(rsp); // restore SP
3073 
3074 } // string_indexof
3075 
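// Search for a single UTF-16 char in a char-encoded string and return its
// index, or -1 if it is not present. A reference sketch (not the actual
// library source) of the operation this stub implements:
//   static int indexOfChar(char[] value, int ch, int len) {
//     for (int i = 0; i < len; i++) {
//       if (value[i] == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }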
3076 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3077                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3078   ShortBranchVerifier sbv(this);
3079   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3080 
3081   int stride = 8;
3082 
3083   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3084         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3085         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3086         FOUND_SEQ_CHAR, DONE_LABEL;
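  // Search strategy: broadcast 'ch' into a vector and compare 16 chars
  // (AVX2) or 8 chars (SSE) per iteration, then handle the tail one char at
  // a time. vec2 stays all-zero so that, per the PTEST/VPTEST semantics
  // (CF := ((src AND NOT dst) == 0)), ptest(vec2, vec3) leaves CF clear
  // exactly when the comparison result vec3 has at least one set bit,
  // i.e. when the char was found in the current vector.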
3087 
3088   movptr(result, str1);
3089   if (UseAVX >= 2) {
3090     cmpl(cnt1, stride);
3091     jcc(Assembler::less, SCAN_TO_CHAR);
3092     cmpl(cnt1, 2*stride);
3093     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3094     movdl(vec1, ch);
3095     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3096     vpxor(vec2, vec2);
3097     movl(tmp, cnt1);
3098     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3099     andl(cnt1,0x0000000F);  //tail count (in chars)
3100 
3101     bind(SCAN_TO_16_CHAR_LOOP);
3102     vmovdqu(vec3, Address(result, 0));
3103     vpcmpeqw(vec3, vec3, vec1, 1);
3104     vptest(vec2, vec3);
3105     jcc(Assembler::carryClear, FOUND_CHAR);
3106     addptr(result, 32);
3107     subl(tmp, 2*stride);
3108     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3109     jmp(SCAN_TO_8_CHAR);
3110     bind(SCAN_TO_8_CHAR_INIT);
3111     movdl(vec1, ch);
3112     pshuflw(vec1, vec1, 0x00);
3113     pshufd(vec1, vec1, 0);
3114     pxor(vec2, vec2);
3115   }
3116   bind(SCAN_TO_8_CHAR);
3117   cmpl(cnt1, stride);
3118   jcc(Assembler::less, SCAN_TO_CHAR);
3119   if (UseAVX < 2) {
3120     movdl(vec1, ch);
3121     pshuflw(vec1, vec1, 0x00);
3122     pshufd(vec1, vec1, 0);
3123     pxor(vec2, vec2);
3124   }
3125   movl(tmp, cnt1);
3126   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3127   andl(cnt1,0x00000007);  //tail count (in chars)
3128 
3129   bind(SCAN_TO_8_CHAR_LOOP);
3130   movdqu(vec3, Address(result, 0));
3131   pcmpeqw(vec3, vec1);
3132   ptest(vec2, vec3);
3133   jcc(Assembler::carryClear, FOUND_CHAR);
3134   addptr(result, 16);
3135   subl(tmp, stride);
3136   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3137   bind(SCAN_TO_CHAR);
3138   testl(cnt1, cnt1);
3139   jcc(Assembler::zero, RET_NOT_FOUND);
3140   bind(SCAN_TO_CHAR_LOOP);
3141   load_unsigned_short(tmp, Address(result, 0));
3142   cmpl(ch, tmp);
3143   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3144   addptr(result, 2);
3145   subl(cnt1, 1);
3146   jccb(Assembler::zero, RET_NOT_FOUND);
3147   jmp(SCAN_TO_CHAR_LOOP);
3148 
3149   bind(RET_NOT_FOUND);
3150   movl(result, -1);
3151   jmpb(DONE_LABEL);
3152 
3153   bind(FOUND_CHAR);
3154   if (UseAVX >= 2) {
3155     vpmovmskb(tmp, vec3);
3156   } else {
3157     pmovmskb(tmp, vec3);
3158   }
3159   bsfl(ch, tmp);
3160   addptr(result, ch);
3161 
3162   bind(FOUND_SEQ_CHAR);
3163   subptr(result, str1);
3164   shrl(result, 1);
3165 
3166   bind(DONE_LABEL);
3167 } // string_indexof_char
3168 
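// Latin-1 variant of the char search above: find a single byte value in a
// byte-encoded string. A reference sketch (not the actual library source):
//   static int indexOfLatin1Char(byte[] value, int ch, int len) {
//     for (int i = 0; i < len; i++) {
//       if ((value[i] & 0xff) == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }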
3169 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3170                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3171   ShortBranchVerifier sbv(this);
3172   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3173 
3174   int stride = 16;
3175 
3176   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3177         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3178         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3179         FOUND_SEQ_CHAR, DONE_LABEL;
3180 
3181   movptr(result, str1);
3182   if (UseAVX >= 2) {
3183     cmpl(cnt1, stride);
3184     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3185     cmpl(cnt1, stride*2);
3186     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3187     movdl(vec1, ch);
3188     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3189     vpxor(vec2, vec2);
3190     movl(tmp, cnt1);
3191     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3192     andl(cnt1,0x0000001F);  //tail count (in chars)
3193 
3194     bind(SCAN_TO_32_CHAR_LOOP);
3195     vmovdqu(vec3, Address(result, 0));
3196     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3197     vptest(vec2, vec3);
3198     jcc(Assembler::carryClear, FOUND_CHAR);
3199     addptr(result, 32);
3200     subl(tmp, stride*2);
3201     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3202     jmp(SCAN_TO_16_CHAR);
3203 
3204     bind(SCAN_TO_16_CHAR_INIT);
3205     movdl(vec1, ch);
3206     pxor(vec2, vec2);
3207     pshufb(vec1, vec2);
3208   }
3209 
3210   bind(SCAN_TO_16_CHAR);
3211   cmpl(cnt1, stride);
3212   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3213   if (UseAVX < 2) {
3214     movdl(vec1, ch);
3215     pxor(vec2, vec2);
3216     pshufb(vec1, vec2);
3217   }
3218   movl(tmp, cnt1);
3219   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3220   andl(cnt1,0x0000000F);  //tail count (in bytes)
3221 
3222   bind(SCAN_TO_16_CHAR_LOOP);
3223   movdqu(vec3, Address(result, 0));
3224   pcmpeqb(vec3, vec1);
3225   ptest(vec2, vec3);
3226   jcc(Assembler::carryClear, FOUND_CHAR);
3227   addptr(result, 16);
3228   subl(tmp, stride);
3229   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3230 
3231   bind(SCAN_TO_CHAR_INIT);
3232   testl(cnt1, cnt1);
3233   jcc(Assembler::zero, RET_NOT_FOUND);
3234   bind(SCAN_TO_CHAR_LOOP);
3235   load_unsigned_byte(tmp, Address(result, 0));
3236   cmpl(ch, tmp);
3237   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3238   addptr(result, 1);
3239   subl(cnt1, 1);
3240   jccb(Assembler::zero, RET_NOT_FOUND);
3241   jmp(SCAN_TO_CHAR_LOOP);
3242 
3243   bind(RET_NOT_FOUND);
3244   movl(result, -1);
3245   jmpb(DONE_LABEL);
3246 
3247   bind(FOUND_CHAR);
3248   if (UseAVX >= 2) {
3249     vpmovmskb(tmp, vec3);
3250   } else {
3251     pmovmskb(tmp, vec3);
3252   }
3253   bsfl(ch, tmp);
3254   addptr(result, ch);
3255 
3256   bind(FOUND_SEQ_CHAR);
3257   subptr(result, str1);
3258 
3259   bind(DONE_LABEL);
3260 } // stringL_indexof_char
3261 
3262 // helper function for string_compare
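// Loads one element from each string at 'index', zero-extended to 32 bits.
// For the mixed LU/UL encodings, str1 is assumed to be the byte-encoded
// (Latin-1) operand addressed with scale1 and str2 the char-encoded (UTF-16)
// operand addressed with scale2; callers are expected to arrange the
// operands accordingly.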
3263 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3264                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3265                                            Address::ScaleFactor scale2, Register index, int ae) {
3266   if (ae == StrIntrinsicNode::LL) {
3267     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3268     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3269   } else if (ae == StrIntrinsicNode::UU) {
3270     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3271     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3272   } else {
3273     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3274     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3275   }
3276 }
3277 
3278 // Compare strings, used for char[] and byte[].
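// A reference sketch (not the actual library source) of the result this stub
// produces, in the style of String.compareTo: compare up to min(len1, len2)
// elements and return the difference of the first mismatching pair, or the
// length difference if one string is a prefix of the other:
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) {
//         return s1[i] - s2[i];
//       }
//     }
//     return len1 - len2;
//   }
// For mixed encodings (LU/UL) the byte-encoded operand is zero-extended to
// chars before comparing; the UL case negates the result at the end
// (see DONE_LABEL).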
3279 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3280                                        Register cnt1, Register cnt2, Register result,
3281                                        XMMRegister vec1, int ae, KRegister mask) {
3282   ShortBranchVerifier sbv(this);
3283   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3284   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3285   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3286   int stride2x2 = 0x40;
3287   Address::ScaleFactor scale = Address::no_scale;
3288   Address::ScaleFactor scale1 = Address::no_scale;
3289   Address::ScaleFactor scale2 = Address::no_scale;
3290 
3291   if (ae != StrIntrinsicNode::LL) {
3292     stride2x2 = 0x20;
3293   }
3294 
3295   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3296     shrl(cnt2, 1);
3297   }
3298   // Compute the minimum of the string lengths, and push the
3299   // difference of the string lengths onto the stack.
3300   // A conditional move selects the minimum below.
3301   movl(result, cnt1);
3302   subl(cnt1, cnt2);
3303   push(cnt1);
3304   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3305 
3306   // Is the minimum length zero?
3307   testl(cnt2, cnt2);
3308   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3309   if (ae == StrIntrinsicNode::LL) {
3310     // Load first bytes
3311     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3312     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3313   } else if (ae == StrIntrinsicNode::UU) {
3314     // Load first characters
3315     load_unsigned_short(result, Address(str1, 0));
3316     load_unsigned_short(cnt1, Address(str2, 0));
3317   } else {
3318     load_unsigned_byte(result, Address(str1, 0));
3319     load_unsigned_short(cnt1, Address(str2, 0));
3320   }
3321   subl(result, cnt1);
3322   jcc(Assembler::notZero,  POP_LABEL);
3323 
3324   if (ae == StrIntrinsicNode::UU) {
3325     // Divide length by 2 to get number of chars
3326     shrl(cnt2, 1);
3327   }
3328   cmpl(cnt2, 1);
3329   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3330 
3331   // Check if the strings start at the same location and setup scale and stride
3332   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3333     cmpptr(str1, str2);
3334     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3335     if (ae == StrIntrinsicNode::LL) {
3336       scale = Address::times_1;
3337       stride = 16;
3338     } else {
3339       scale = Address::times_2;
3340       stride = 8;
3341     }
3342   } else {
3343     scale1 = Address::times_1;
3344     scale2 = Address::times_2;
3345     // scale not used
3346     stride = 8;
3347   }
3348 
3349   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3350     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3351     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3352     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3353     Label COMPARE_TAIL_LONG;
3354     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3355 
3356     int pcmpmask = 0x19;
3357     if (ae == StrIntrinsicNode::LL) {
3358       pcmpmask &= ~0x01;
3359     }
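    // pcmpmask decodes as follows (per the SSE4.2 PCMPESTRI imm8 encoding):
    //   bits 1:0 = 01 unsigned words (cleared to 00, unsigned bytes, for LL),
    //   bits 3:2 = 10 "equal each" (element-wise string compare),
    //   bits 5:4 = 01 negative polarity, so rcx reports the first mismatch.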
3360 
3361     // Setup to compare 16-chars (32-bytes) vectors,
3362     // start from first character again because it has aligned address.
3363     if (ae == StrIntrinsicNode::LL) {
3364       stride2 = 32;
3365     } else {
3366       stride2 = 16;
3367     }
3368     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3369       adr_stride = stride << scale;
3370     } else {
3371       adr_stride1 = 8;  //stride << scale1;
3372       adr_stride2 = 16; //stride << scale2;
3373     }
3374 
3375     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3376     // rax and rdx are used by pcmpestri as element counters
3377     movl(result, cnt2);
3378     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3379     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3380 
3381     // Fast path: compare the first two 8-char vectors.
3382     bind(COMPARE_16_CHARS);
3383     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3384       movdqu(vec1, Address(str1, 0));
3385     } else {
3386       pmovzxbw(vec1, Address(str1, 0));
3387     }
3388     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3389     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3390 
3391     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3392       movdqu(vec1, Address(str1, adr_stride));
3393       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3394     } else {
3395       pmovzxbw(vec1, Address(str1, adr_stride1));
3396       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3397     }
3398     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3399     addl(cnt1, stride);
3400 
3401     // Compare the characters at index in cnt1
3402     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3403     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3404     subl(result, cnt2);
3405     jmp(POP_LABEL);
3406 
3407     // Setup the registers to start vector comparison loop
3408     bind(COMPARE_WIDE_VECTORS);
3409     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3410       lea(str1, Address(str1, result, scale));
3411       lea(str2, Address(str2, result, scale));
3412     } else {
3413       lea(str1, Address(str1, result, scale1));
3414       lea(str2, Address(str2, result, scale2));
3415     }
3416     subl(result, stride2);
3417     subl(cnt2, stride2);
3418     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3419     negptr(result);
3420 
3421     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3422     bind(COMPARE_WIDE_VECTORS_LOOP);
3423 
3424 #ifdef _LP64
3425     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3426       cmpl(cnt2, stride2x2);
3427       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3428       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3429       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3430 
3431       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3432       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3433         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3434         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3435       } else {
3436         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3437         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3438       }
3439       kortestql(mask, mask);
3440       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3441       addptr(result, stride2x2);  // update since we already compared at this addr
3442       subl(cnt2, stride2x2);      // and sub the size too
3443       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3444 
3445       vpxor(vec1, vec1);
3446       jmpb(COMPARE_WIDE_TAIL);
3447     }//if (VM_Version::supports_avx512vlbw())
3448 #endif // _LP64
3449 
3450 
3451     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3452     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3453       vmovdqu(vec1, Address(str1, result, scale));
3454       vpxor(vec1, Address(str2, result, scale));
3455     } else {
3456       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3457       vpxor(vec1, Address(str2, result, scale2));
3458     }
3459     vptest(vec1, vec1);
3460     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3461     addptr(result, stride2);
3462     subl(cnt2, stride2);
3463     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3464     // clean upper bits of YMM registers
3465     vpxor(vec1, vec1);
3466 
3467     // compare wide vectors tail
3468     bind(COMPARE_WIDE_TAIL);
3469     testptr(result, result);
3470     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3471 
3472     movl(result, stride2);
3473     movl(cnt2, result);
3474     negptr(result);
3475     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3476 
3477     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3478     bind(VECTOR_NOT_EQUAL);
3479     // clean upper bits of YMM registers
3480     vpxor(vec1, vec1);
3481     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3482       lea(str1, Address(str1, result, scale));
3483       lea(str2, Address(str2, result, scale));
3484     } else {
3485       lea(str1, Address(str1, result, scale1));
3486       lea(str2, Address(str2, result, scale2));
3487     }
3488     jmp(COMPARE_16_CHARS);
3489 
3490     // Compare tail chars, length between 1 and 15 chars
3491     bind(COMPARE_TAIL_LONG);
3492     movl(cnt2, result);
3493     cmpl(cnt2, stride);
3494     jcc(Assembler::less, COMPARE_SMALL_STR);
3495 
3496     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3497       movdqu(vec1, Address(str1, 0));
3498     } else {
3499       pmovzxbw(vec1, Address(str1, 0));
3500     }
3501     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3502     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3503     subptr(cnt2, stride);
3504     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3505     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3506       lea(str1, Address(str1, result, scale));
3507       lea(str2, Address(str2, result, scale));
3508     } else {
3509       lea(str1, Address(str1, result, scale1));
3510       lea(str2, Address(str2, result, scale2));
3511     }
3512     negptr(cnt2);
3513     jmpb(WHILE_HEAD_LABEL);
3514 
3515     bind(COMPARE_SMALL_STR);
3516   } else if (UseSSE42Intrinsics) {
3517     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3518     int pcmpmask = 0x19;
3519     // Setup to compare 8-char (16-byte) vectors,
3520     // start from first character again because it has aligned address.
3521     movl(result, cnt2);
3522     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3523     if (ae == StrIntrinsicNode::LL) {
3524       pcmpmask &= ~0x01;
3525     }
3526     jcc(Assembler::zero, COMPARE_TAIL);
3527     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3528       lea(str1, Address(str1, result, scale));
3529       lea(str2, Address(str2, result, scale));
3530     } else {
3531       lea(str1, Address(str1, result, scale1));
3532       lea(str2, Address(str2, result, scale2));
3533     }
3534     negptr(result);
3535 
3536     // pcmpestri
3537     //   inputs:
3538     //     vec1- substring
3539     //     rax - negative string length (elements count)
3540     //     mem - scanned string
3541     //     rdx - string length (elements count)
3542     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3543     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3544     //   outputs:
3545     //     rcx - first mismatched element index
3546     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3547 
3548     bind(COMPARE_WIDE_VECTORS);
3549     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3550       movdqu(vec1, Address(str1, result, scale));
3551       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3552     } else {
3553       pmovzxbw(vec1, Address(str1, result, scale1));
3554       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3555     }
3556     // After pcmpestri cnt1(rcx) contains mismatched element index
3557 
3558     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3559     addptr(result, stride);
3560     subptr(cnt2, stride);
3561     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3562 
3563     // compare wide vectors tail
3564     testptr(result, result);
3565     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3566 
3567     movl(cnt2, stride);
3568     movl(result, stride);
3569     negptr(result);
3570     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3571       movdqu(vec1, Address(str1, result, scale));
3572       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3573     } else {
3574       pmovzxbw(vec1, Address(str1, result, scale1));
3575       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3576     }
3577     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3578 
3579     // Mismatched characters in the vectors
3580     bind(VECTOR_NOT_EQUAL);
3581     addptr(cnt1, result);
3582     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3583     subl(result, cnt2);
3584     jmpb(POP_LABEL);
3585 
3586     bind(COMPARE_TAIL); // limit is zero
3587     movl(cnt2, result);
3588     // Fallthru to tail compare
3589   }
3590   // Shift str2 and str1 to the end of the arrays, negate min
3591   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3592     lea(str1, Address(str1, cnt2, scale));
3593     lea(str2, Address(str2, cnt2, scale));
3594   } else {
3595     lea(str1, Address(str1, cnt2, scale1));
3596     lea(str2, Address(str2, cnt2, scale2));
3597   }
3598   decrementl(cnt2);  // first character was compared already
3599   negptr(cnt2);
3600 
3601   // Compare the rest of the elements
3602   bind(WHILE_HEAD_LABEL);
3603   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3604   subl(result, cnt1);
3605   jccb(Assembler::notZero, POP_LABEL);
3606   increment(cnt2);
3607   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3608 
3609   // Strings are equal up to min length.  Return the length difference.
3610   bind(LENGTH_DIFF_LABEL);
3611   pop(result);
3612   if (ae == StrIntrinsicNode::UU) {
3613     // Divide diff by 2 to get number of chars
3614     sarl(result, 1);
3615   }
3616   jmpb(DONE_LABEL);
3617 
3618 #ifdef _LP64
3619   if (VM_Version::supports_avx512vlbw()) {
3620 
3621     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3622 
3623     kmovql(cnt1, mask);
3624     notq(cnt1);
3625     bsfq(cnt2, cnt1);
3626     if (ae != StrIntrinsicNode::LL) {
3627       // Divide diff by 2 to get number of chars
3628       sarl(cnt2, 1);
3629     }
3630     addq(result, cnt2);
3631     if (ae == StrIntrinsicNode::LL) {
3632       load_unsigned_byte(cnt1, Address(str2, result));
3633       load_unsigned_byte(result, Address(str1, result));
3634     } else if (ae == StrIntrinsicNode::UU) {
3635       load_unsigned_short(cnt1, Address(str2, result, scale));
3636       load_unsigned_short(result, Address(str1, result, scale));
3637     } else {
3638       load_unsigned_short(cnt1, Address(str2, result, scale2));
3639       load_unsigned_byte(result, Address(str1, result, scale1));
3640     }
3641     subl(result, cnt1);
3642     jmpb(POP_LABEL);
3643   }//if (VM_Version::supports_avx512vlbw())
3644 #endif // _LP64
3645 
3646   // Discard the stored length difference
3647   bind(POP_LABEL);
3648   pop(cnt1);
3649 
3650   // That's it
3651   bind(DONE_LABEL);
3652   if(ae == StrIntrinsicNode::UL) {
3653     negl(result);
3654   }
3655 
3656 }
3657 
3658 // Search for a non-ASCII character (negative byte value) in a byte array and
3659 // return the index of the first such character, otherwise the length
3660 // of the array segment searched.
3661 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3662 //   @IntrinsicCandidate
3663 //   public static int countPositives(byte[] ba, int off, int len) {
3664 //     for (int i = off; i < off + len; i++) {
3665 //       if (ba[i] < 0) {
3666 //         return i - off;
3667 //       }
3668 //     }
3669 //     return len;
3670 //   }
3671 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3672   Register result, Register tmp1,
3673   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3674   // rsi: byte array
3675   // rcx: len
3676   // rax: result
3677   ShortBranchVerifier sbv(this);
3678   assert_different_registers(ary1, len, result, tmp1);
3679   assert_different_registers(vec1, vec2);
3680   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3681 
3682   movl(result, len); // copy
3683   // len == 0
3684   testl(len, len);
3685   jcc(Assembler::zero, DONE);
3686 
3687   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3688     VM_Version::supports_avx512vlbw() &&
3689     VM_Version::supports_bmi2()) {
3690 
3691     Label test_64_loop, test_tail, BREAK_LOOP;
3692     Register tmp3_aliased = len;
3693 
3694     movl(tmp1, len);
3695     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3696 
3697     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3698     andl(len, ~(64 - 1));    // vector count (in chars)
3699     jccb(Assembler::zero, test_tail);
3700 
3701     lea(ary1, Address(ary1, len, Address::times_1));
3702     negptr(len);
3703 
3704     bind(test_64_loop);
3705     // Check whether our 64 elements of size byte contain negatives
3706     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3707     kortestql(mask1, mask1);
3708     jcc(Assembler::notZero, BREAK_LOOP);
3709 
3710     addptr(len, 64);
3711     jccb(Assembler::notZero, test_64_loop);
3712 
3713     bind(test_tail);
3714     // bail out when there is nothing to be done
3715     testl(tmp1, -1);
3716     jcc(Assembler::zero, DONE);
3717 
3718     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3719 #ifdef _LP64
3720     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3721     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3722     notq(tmp3_aliased);
3723     kmovql(mask2, tmp3_aliased);
3724 #else
3725     Label k_init;
3726     jmp(k_init);
3727 
3728     // We cannot read 64 bits at once from a general-purpose register, so we move
3729     // the data required to compose 64 ones into the instruction stream.
3730     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
3731     // compare target together with the tail count contained in the tmp1 register.
3732     // The result is a k register holding tmp1 consecutive 1 bits,
3733     // counting from the least significant bit.
3734     address tmp = pc();
3735     emit_int64(0x0706050403020100);
3736     emit_int64(0x0F0E0D0C0B0A0908);
3737     emit_int64(0x1716151413121110);
3738     emit_int64(0x1F1E1D1C1B1A1918);
3739     emit_int64(0x2726252423222120);
3740     emit_int64(0x2F2E2D2C2B2A2928);
3741     emit_int64(0x3736353433323130);
3742     emit_int64(0x3F3E3D3C3B3A3938);
3743 
3744     bind(k_init);
3745     lea(len, InternalAddress(tmp));
3746     // create mask to test for negative byte inside a vector
3747     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3748     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3749 
3750 #endif
3751     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3752     ktestq(mask1, mask2);
3753     jcc(Assembler::zero, DONE);
3754 
3755     bind(BREAK_LOOP);
3756     // At least one byte in the last 64 bytes is negative.
3757     // Set up to look at the last 64 bytes as if they were a tail
3758     lea(ary1, Address(ary1, len, Address::times_1));
3759     addptr(result, len);
3760     // Ignore the very last byte: if all others are positive,
3761     // it must be negative, so we can skip right to the 2+1 byte
3762     // end comparison at this point
3763     orl(result, 63);
3764     movl(len, 63);
3765     // Fallthru to tail compare
3766   } else {
3767 
3768     if (UseAVX >= 2 && UseSSE >= 2) {
3769       // With AVX2, use 32-byte vector compare
3770       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3771 
3772       // Compare 32-byte vectors
3773       testl(len, 0xffffffe0);   // vector count (in bytes)
3774       jccb(Assembler::zero, TAIL_START);
3775 
3776       andl(len, 0xffffffe0);
3777       lea(ary1, Address(ary1, len, Address::times_1));
3778       negptr(len);
3779 
3780       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3781       movdl(vec2, tmp1);
3782       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3783 
3784       bind(COMPARE_WIDE_VECTORS);
3785       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3786       vptest(vec1, vec2);
3787       jccb(Assembler::notZero, BREAK_LOOP);
3788       addptr(len, 32);
3789       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3790 
3791       testl(result, 0x0000001f);   // any bytes remaining?
3792       jcc(Assembler::zero, DONE);
3793 
3794       // Quick test using the already prepared vector mask
3795       movl(len, result);
3796       andl(len, 0x0000001f);
3797       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3798       vptest(vec1, vec2);
3799       jcc(Assembler::zero, DONE);
3800       // There are zeros, jump to the tail to determine exactly where
3801       jmpb(TAIL_START);
3802 
3803       bind(BREAK_LOOP);
3804       // At least one byte in the last 32-byte vector is negative.
3805       // Set up to look at the last 32 bytes as if they were a tail
3806       lea(ary1, Address(ary1, len, Address::times_1));
3807       addptr(result, len);
3808       // Ignore the very last byte: if all others are positive,
3809       // it must be negative, so we can skip right to the 2+1 byte
3810       // end comparison at this point
3811       orl(result, 31);
3812       movl(len, 31);
3813       // Fallthru to tail compare
3814     } else if (UseSSE42Intrinsics) {
3815       // With SSE4.2, use double quad vector compare
3816       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3817 
3818       // Compare 16-byte vectors
3819       testl(len, 0xfffffff0);   // vector count (in bytes)
3820       jcc(Assembler::zero, TAIL_START);
3821 
3822       andl(len, 0xfffffff0);
3823       lea(ary1, Address(ary1, len, Address::times_1));
3824       negptr(len);
3825 
3826       movl(tmp1, 0x80808080);
3827       movdl(vec2, tmp1);
3828       pshufd(vec2, vec2, 0);
3829 
3830       bind(COMPARE_WIDE_VECTORS);
3831       movdqu(vec1, Address(ary1, len, Address::times_1));
3832       ptest(vec1, vec2);
3833       jccb(Assembler::notZero, BREAK_LOOP);
3834       addptr(len, 16);
3835       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3836 
3837       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3838       jcc(Assembler::zero, DONE);
3839 
3840       // Quick test using the already prepared vector mask
3841       movl(len, result);
3842       andl(len, 0x0000000f);   // tail count (in bytes)
3843       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3844       ptest(vec1, vec2);
3845       jcc(Assembler::zero, DONE);
3846       jmpb(TAIL_START);
3847 
3848       bind(BREAK_LOOP);
3849       // At least one byte in the last 16-byte vector is negative.
3850       // Set up and look at the last 16 bytes as if they were a tail
3851       lea(ary1, Address(ary1, len, Address::times_1));
3852       addptr(result, len);
3853       // Ignore the very last byte: if all others are positive,
3854       // it must be negative, so we can skip right to the 2+1 byte
3855       // end comparison at this point
3856       orl(result, 15);
3857       movl(len, 15);
3858       // Fallthru to tail compare
3859     }
3860   }
3861 
3862   bind(TAIL_START);
3863   // Compare 4-byte vectors
3864   andl(len, 0xfffffffc); // vector count (in bytes)
3865   jccb(Assembler::zero, COMPARE_CHAR);
3866 
3867   lea(ary1, Address(ary1, len, Address::times_1));
3868   negptr(len);
3869 
3870   bind(COMPARE_VECTORS);
3871   movl(tmp1, Address(ary1, len, Address::times_1));
3872   andl(tmp1, 0x80808080);
3873   jccb(Assembler::notZero, TAIL_ADJUST);
3874   addptr(len, 4);
3875   jccb(Assembler::notZero, COMPARE_VECTORS);
3876 
3877   // Compare trailing char (final 2-3 bytes), if any
3878   bind(COMPARE_CHAR);
3879 
3880   testl(result, 0x2);   // tail  char
3881   jccb(Assembler::zero, COMPARE_BYTE);
3882   load_unsigned_short(tmp1, Address(ary1, 0));
3883   andl(tmp1, 0x00008080);
3884   jccb(Assembler::notZero, CHAR_ADJUST);
3885   lea(ary1, Address(ary1, 2));
3886 
3887   bind(COMPARE_BYTE);
3888   testl(result, 0x1);   // tail  byte
3889   jccb(Assembler::zero, DONE);
3890   load_unsigned_byte(tmp1, Address(ary1, 0));
3891   testl(tmp1, 0x00000080);
3892   jccb(Assembler::zero, DONE);
3893   subptr(result, 1);
3894   jmpb(DONE);
3895 
3896   bind(TAIL_ADJUST);
3897   // there are negative bits in the last 4 byte block.
3898   // Adjust result and check the next three bytes
3899   addptr(result, len);
3900   orl(result, 3);
3901   lea(ary1, Address(ary1, len, Address::times_1));
3902   jmpb(COMPARE_CHAR);
3903 
3904   bind(CHAR_ADJUST);
3905   // We are looking at a char + optional byte tail, and found that one
3906   // of the bytes in the char is negative. Adjust the result, check the
3907   // first byte and readjust if needed.
3908   andl(result, 0xfffffffc);
3909   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3910   jccb(Assembler::notZero, DONE);
3911   addptr(result, 1);
3912 
3913   // That's it
3914   bind(DONE);
3915   if (UseAVX >= 2 && UseSSE >= 2) {
3916     // clean upper bits of YMM registers
3917     vpxor(vec1, vec1);
3918     vpxor(vec2, vec2);
3919   }
3920 }
3921 
3922 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
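// A reference sketch (not the actual library source) of the array case
// (is_array_equ == true); the substring case skips the null/length checks
// and compares 'limit' elements starting at the given addresses:
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null) return false;
//     if (a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }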
3923 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3924                                       Register limit, Register result, Register chr,
3925                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3926   ShortBranchVerifier sbv(this);
3927   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3928 
3929   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3930   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3931 
3932   if (is_array_equ) {
3933     // Check the input args
3934     cmpoop(ary1, ary2);
3935     jcc(Assembler::equal, TRUE_LABEL);
3936 
3937     // Need additional checks for arrays_equals.
3938     testptr(ary1, ary1);
3939     jcc(Assembler::zero, FALSE_LABEL);
3940     testptr(ary2, ary2);
3941     jcc(Assembler::zero, FALSE_LABEL);
3942 
3943     // Check the lengths
3944     movl(limit, Address(ary1, length_offset));
3945     cmpl(limit, Address(ary2, length_offset));
3946     jcc(Assembler::notEqual, FALSE_LABEL);
3947   }
3948 
3949   // count == 0
3950   testl(limit, limit);
3951   jcc(Assembler::zero, TRUE_LABEL);
3952 
3953   if (is_array_equ) {
3954     // Load array address
3955     lea(ary1, Address(ary1, base_offset));
3956     lea(ary2, Address(ary2, base_offset));
3957   }
3958 
3959   if (is_array_equ && is_char) {
3960     // arrays_equals when used for char[].
3961     shll(limit, 1);      // convert char count to byte count (still != 0)
3962   }
3963   movl(result, limit); // copy
3964 
3965   if (UseAVX >= 2) {
3966     // With AVX2, use 32-byte vector compare
3967     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3968 
3969     // Compare 32-byte vectors
3970     andl(result, 0x0000001f);  //   tail count (in bytes)
3971     andl(limit, 0xffffffe0);   // vector count (in bytes)
3972     jcc(Assembler::zero, COMPARE_TAIL);
3973 
3974     lea(ary1, Address(ary1, limit, Address::times_1));
3975     lea(ary2, Address(ary2, limit, Address::times_1));
3976     negptr(limit);
3977 
3978 #ifdef _LP64
3979     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3980       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3981 
3982       cmpl(limit, -64);
3983       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3984 
3985       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3986 
3987       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3988       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3989       kortestql(mask, mask);
3990       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3991       addptr(limit, 64);  // update since we already compared at this addr
3992       cmpl(limit, -64);
3993       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3994 
3995       // At this point we may still need to compare -limit+result bytes.
3996       // We could execute the next two instructions and just continue via the non-wide path:
3997       //  cmpl(limit, 0);
3998       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3999       // But since we stopped at the points ary{1,2}+limit which are
4000       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4001       // (|limit| <= 32 and result < 32),
4002       // we may just compare the last 64 bytes.
4003       //
4004       addptr(result, -64);   // it is safe because we just came from this area
4005       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4006       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4007       kortestql(mask, mask);
4008       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4009 
4010       jmp(TRUE_LABEL);
4011 
4012       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4013 
4014     }//if (VM_Version::supports_avx512vlbw())
4015 #endif //_LP64
4016     bind(COMPARE_WIDE_VECTORS);
4017     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4018     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4019     vpxor(vec1, vec2);
4020 
4021     vptest(vec1, vec1);
4022     jcc(Assembler::notZero, FALSE_LABEL);
4023     addptr(limit, 32);
4024     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4025 
4026     testl(result, result);
4027     jcc(Assembler::zero, TRUE_LABEL);
4028 
4029     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4030     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4031     vpxor(vec1, vec2);
4032 
4033     vptest(vec1, vec1);
4034     jccb(Assembler::notZero, FALSE_LABEL);
4035     jmpb(TRUE_LABEL);
4036 
4037     bind(COMPARE_TAIL); // limit is zero
4038     movl(limit, result);
4039     // Fallthru to tail compare
4040   } else if (UseSSE42Intrinsics) {
4041     // With SSE4.2, use double quad vector compare
4042     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4043 
4044     // Compare 16-byte vectors
4045     andl(result, 0x0000000f);  //   tail count (in bytes)
4046     andl(limit, 0xfffffff0);   // vector count (in bytes)
4047     jcc(Assembler::zero, COMPARE_TAIL);
4048 
4049     lea(ary1, Address(ary1, limit, Address::times_1));
4050     lea(ary2, Address(ary2, limit, Address::times_1));
4051     negptr(limit);
4052 
4053     bind(COMPARE_WIDE_VECTORS);
4054     movdqu(vec1, Address(ary1, limit, Address::times_1));
4055     movdqu(vec2, Address(ary2, limit, Address::times_1));
4056     pxor(vec1, vec2);
4057 
4058     ptest(vec1, vec1);
4059     jcc(Assembler::notZero, FALSE_LABEL);
4060     addptr(limit, 16);
4061     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4062 
4063     testl(result, result);
4064     jcc(Assembler::zero, TRUE_LABEL);
4065 
4066     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4067     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4068     pxor(vec1, vec2);
4069 
4070     ptest(vec1, vec1);
4071     jccb(Assembler::notZero, FALSE_LABEL);
4072     jmpb(TRUE_LABEL);
4073 
4074     bind(COMPARE_TAIL); // limit is zero
4075     movl(limit, result);
4076     // Fallthru to tail compare
4077   }
4078 
4079   // Compare 4-byte vectors
4080   andl(limit, 0xfffffffc); // vector count (in bytes)
4081   jccb(Assembler::zero, COMPARE_CHAR);
4082 
4083   lea(ary1, Address(ary1, limit, Address::times_1));
4084   lea(ary2, Address(ary2, limit, Address::times_1));
4085   negptr(limit);
4086 
4087   bind(COMPARE_VECTORS);
4088   movl(chr, Address(ary1, limit, Address::times_1));
4089   cmpl(chr, Address(ary2, limit, Address::times_1));
4090   jccb(Assembler::notEqual, FALSE_LABEL);
4091   addptr(limit, 4);
4092   jcc(Assembler::notZero, COMPARE_VECTORS);
4093 
4094   // Compare trailing char (final 2 bytes), if any
4095   bind(COMPARE_CHAR);
4096   testl(result, 0x2);   // tail  char
4097   jccb(Assembler::zero, COMPARE_BYTE);
4098   load_unsigned_short(chr, Address(ary1, 0));
4099   load_unsigned_short(limit, Address(ary2, 0));
4100   cmpl(chr, limit);
4101   jccb(Assembler::notEqual, FALSE_LABEL);
4102 
4103   if (is_array_equ && is_char) {
4104     bind(COMPARE_BYTE);
4105   } else {
4106     lea(ary1, Address(ary1, 2));
4107     lea(ary2, Address(ary2, 2));
4108 
4109     bind(COMPARE_BYTE);
4110     testl(result, 0x1);   // tail  byte
4111     jccb(Assembler::zero, TRUE_LABEL);
4112     load_unsigned_byte(chr, Address(ary1, 0));
4113     load_unsigned_byte(limit, Address(ary2, 0));
4114     cmpl(chr, limit);
4115     jccb(Assembler::notEqual, FALSE_LABEL);
4116   }
4117   bind(TRUE_LABEL);
4118   movl(result, 1);   // return true
4119   jmpb(DONE);
4120 
4121   bind(FALSE_LABEL);
4122   xorl(result, result); // return false
4123 
4124   // That's it
4125   bind(DONE);
4126   if (UseAVX >= 2) {
4127     // clean upper bits of YMM registers
4128     vpxor(vec1, vec1);
4129     vpxor(vec2, vec2);
4130   }
4131 }
4132 
4133 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4134                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4135   switch(ideal_opc) {
4136     case Op_LShiftVS:
4137       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4138     case Op_LShiftVI:
4139       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4140     case Op_LShiftVL:
4141       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4142     case Op_RShiftVS:
4143       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4144     case Op_RShiftVI:
4145       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4146     case Op_RShiftVL:
4147       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4148     case Op_URShiftVS:
4149       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4150     case Op_URShiftVI:
4151       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4152     case Op_URShiftVL:
4153       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4154     case Op_RotateRightV:
4155       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4156     case Op_RotateLeftV:
4157       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4158     default:
4159       fatal("Unsupported masked operation"); break;
4160   }
4161 }
4162 
4163 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4164                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4165                                     bool is_varshift) {
4166   switch (ideal_opc) {
4167     case Op_AddVB:
4168       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4169     case Op_AddVS:
4170       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4171     case Op_AddVI:
4172       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4173     case Op_AddVL:
4174       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4175     case Op_AddVF:
4176       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4177     case Op_AddVD:
4178       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4179     case Op_SubVB:
4180       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4181     case Op_SubVS:
4182       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4183     case Op_SubVI:
4184       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4185     case Op_SubVL:
4186       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4187     case Op_SubVF:
4188       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4189     case Op_SubVD:
4190       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4191     case Op_MulVS:
4192       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4193     case Op_MulVI:
4194       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4195     case Op_MulVL:
4196       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4197     case Op_MulVF:
4198       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4199     case Op_MulVD:
4200       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4201     case Op_DivVF:
4202       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4203     case Op_DivVD:
4204       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4205     case Op_SqrtVF:
4206       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4207     case Op_SqrtVD:
4208       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4209     case Op_AbsVB:
4210       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4211     case Op_AbsVS:
4212       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4213     case Op_AbsVI:
4214       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4215     case Op_AbsVL:
4216       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4217     case Op_FmaVF:
4218       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4219     case Op_FmaVD:
4220       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4221     case Op_VectorRearrange:
4222       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4223     case Op_LShiftVS:
4224       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4225     case Op_LShiftVI:
4226       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4227     case Op_LShiftVL:
4228       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4229     case Op_RShiftVS:
4230       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4231     case Op_RShiftVI:
4232       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4233     case Op_RShiftVL:
4234       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4235     case Op_URShiftVS:
4236       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4237     case Op_URShiftVI:
4238       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4239     case Op_URShiftVL:
4240       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4241     case Op_RotateLeftV:
4242       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4243     case Op_RotateRightV:
4244       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4245     case Op_MaxV:
4246       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4247     case Op_MinV:
4248       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4249     case Op_XorV:
4250       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4251     case Op_OrV:
4252       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4253     case Op_AndV:
4254       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4255     default:
4256       fatal("Unsupported masked operation"); break;
4257   }
4258 }
4259 
4260 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4261                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4262   switch (ideal_opc) {
4263     case Op_AddVB:
4264       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4265     case Op_AddVS:
4266       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4267     case Op_AddVI:
4268       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4269     case Op_AddVL:
4270       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4271     case Op_AddVF:
4272       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4273     case Op_AddVD:
4274       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4275     case Op_SubVB:
4276       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4277     case Op_SubVS:
4278       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4279     case Op_SubVI:
4280       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4281     case Op_SubVL:
4282       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4283     case Op_SubVF:
4284       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4285     case Op_SubVD:
4286       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4287     case Op_MulVS:
4288       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4289     case Op_MulVI:
4290       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4291     case Op_MulVL:
4292       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4293     case Op_MulVF:
4294       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4295     case Op_MulVD:
4296       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4297     case Op_DivVF:
4298       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4299     case Op_DivVD:
4300       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4301     case Op_FmaVF:
4302       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4303     case Op_FmaVD:
4304       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4305     case Op_MaxV:
4306       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4307     case Op_MinV:
4308       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4309     case Op_XorV:
4310       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4311     case Op_OrV:
4312       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4313     case Op_AndV:
4314       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4315     default:
4316       fatal("Unsupported masked operation"); break;
4317   }
4318 }
4319 
4320 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4321                                   KRegister src1, KRegister src2) {
4322   BasicType etype = T_ILLEGAL;
4323   switch(mask_len) {
4324     case 2:
4325     case 4:
4326     case 8:  etype = T_BYTE; break;
4327     case 16: etype = T_SHORT; break;
4328     case 32: etype = T_INT; break;
4329     case 64: etype = T_LONG; break;
4330     default: fatal("Unsupported type"); break;
4331   }
4332   assert(etype != T_ILLEGAL, "");
4333   switch(ideal_opc) {
4334     case Op_AndVMask:
4335       kand(etype, dst, src1, src2); break;
4336     case Op_OrVMask:
4337       kor(etype, dst, src1, src2); break;
4338     case Op_XorVMask:
4339       kxor(etype, dst, src1, src2); break;
4340     default:
4341       fatal("Unsupported masked operation"); break;
4342   }
4343 }
4344 
4345 /*
4346  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4347  * If src is NaN, the result is 0.
4348  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4349  * the result is equal to the value of Integer.MIN_VALUE.
4350  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4351  * the result is equal to the value of Integer.MAX_VALUE.
4352  */
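// A minimal scalar sketch of the F2I semantics described above (illustrative only;
// the helper name f2i_reference is hypothetical and not part of this file):
//
//   static jint f2i_reference(jfloat src) {
//     if (src != src)              return 0;         // NaN -> 0
//     if (src <= (jfloat)min_jint) return min_jint;  // -Inf or too small
//     if (src >= (jfloat)max_jint) return max_jint;  // +Inf or too large
//     return (jint)src;                              // regular truncating cast
//   }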
4353 void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4354                                                             XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4355                                                             Register rscratch) {
4356   Label done;
4357   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4358   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4359   vptest(xtmp2, xtmp2, vec_enc);
4360   jccb(Assembler::equal, done);
4361 
4362   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4363   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4364 
4365   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4366   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4367   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4368 
4369   // Recompute the mask for the remaining special values.
4370   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4371   // Extract SRC values corresponding to TRUE mask lanes.
4372   vpand(xtmp4, xtmp2, src, vec_enc);
4373   // Flip the mask bits so that the MSB of the MASK lanes corresponding to positive
4374   // special values is set.
4375   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4376 
4377   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4378   bind(done);
4379 }
4380 
4381 void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4382                                                              XMMRegister xtmp1, XMMRegister xtmp2,
4383                                                              KRegister ktmp1, KRegister ktmp2,
4384                                                              Register rscratch) {
4385   Label done;
4386   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4387   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4388   kortestwl(ktmp1, ktmp1);
4389   jccb(Assembler::equal, done);
4390 
4391   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4392   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4393   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4394 
4395   kxorwl(ktmp1, ktmp1, ktmp2);
4396   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4397   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4398   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4399   bind(done);
4400 }
4401 
4402 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src,
4403                                                                      AddressLiteral double_sign_flip, int vec_enc,
4404                                                                      XMMRegister xtmp1, XMMRegister xtmp2,
4405                                                                      KRegister ktmp1, KRegister ktmp2,
4406                                                                      Register rscratch) {
4407   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4408 
4409   Label done;
4410   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4411   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4412   kortestwl(ktmp1, ktmp1);
4413   jccb(Assembler::equal, done);
4414 
4415   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4416   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4417   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4418 
4419   kxorwl(ktmp1, ktmp1, ktmp2);
4420   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4421   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4422   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4423   bind(done);
4424 }
4425 
4426 /*
4427  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4428  * If src is NaN, the result is 0.
4429  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4430  * the result is equal to the value of Long.MIN_VALUE.
4431  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4432  * the result is equal to the value of Long.MAX_VALUE.
4433  */
4434 void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src,
4435                                                               AddressLiteral double_sign_flip, int vec_enc,
4436                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4437                                                               Register rscratch) {
4438   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4439 
4440   Label done;
4441   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4442   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4443   kortestwl(ktmp1, ktmp1);
4444   jccb(Assembler::equal, done);
4445 
4446   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4447   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4448   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4449 
4450   kxorwl(ktmp1, ktmp1, ktmp2);
4451   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4452   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4453   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4454   bind(done);
4455 }
4456 
4457 /*
4458  * Algorithm for vector D2L and F2I conversions:
4459  * a) Perform the vector D2L/F2I cast.
4460  * b) Take the fast path if no lane of the result vector contains the value 0x80000000;
4461  *    such a lane signifies that the source value was one of the special floating point
4462  *    values (NaN, -Inf, Inf, Max, -Min).
4463  * c) Set the destination lane to zero if the corresponding source lane is NaN.
4464  * d) Replace 0x80000000 with MaxInt if the corresponding source lane contains a positive value.
4465  */
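// Scalar outline of steps a)-d) for F2I (illustrative sketch only; cvt_f2i_trunc models
// the per-lane behaviour of the truncating hardware cast, which produces min_jint,
// i.e. the 0x80000000 bit pattern, for NaN and out-of-range inputs; both helper
// names are hypothetical):
//
//   jint cast_lane_f2i(jfloat src) {
//     jint dst = cvt_f2i_trunc(src);          // step a) raw vector cast
//     if (dst != min_jint) return dst;        // step b) fast path, no special value
//     if (src != src)      return 0;          // step c) NaN -> 0
//     if (src > 0.0f)      return max_jint;   // step d) positive special -> MaxInt
//     return dst;                             // -Inf / too small keeps MIN_VALUE
//   }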
4466 
4467 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4468                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4469   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4470 
4471   evcvttpd2qq(dst, src, vec_enc);
4472   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4473                                         xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4474 }
4475 
4476 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4477                                            XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch) {
4478   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4479 
4480   vcvttps2dq(dst, src, vec_enc);
4481   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4482                                       xtmp1, xtmp2, xtmp3, xtmp4, rscratch);
4483 }
4484 
4485 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4486                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4487   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4488 
4489   vcvttps2dq(dst, src, vec_enc);
4490   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4491                                        xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4492 }
4493 
4494 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4495                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4496   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4497 
4498   evcvttps2qq(dst, src, vec_enc);
4499   vector_cast_float_to_long_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4500                                                xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4501 }
4502 
4503 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4504                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4505   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4506 
4507   vector_castD2L_evex(dst, src, double_sign_flip, vec_enc,
4508                       xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4509   if (to_elem_bt != T_LONG) {
4510     switch(to_elem_bt) {
4511       case T_INT:
4512         evpmovsqd(dst, dst, vec_enc);
4513         break;
4514       case T_SHORT:
4515         evpmovsqd(dst, dst, vec_enc);
4516         evpmovdw(dst, dst, vec_enc);
4517         break;
4518       case T_BYTE:
4519         evpmovsqd(dst, dst, vec_enc);
4520         evpmovdb(dst, dst, vec_enc);
4521         break;
4522       default: assert(false, "%s", type2name(to_elem_bt));
4523     }
4524   }
4525 }
4526 
4527 #ifdef _LP64
4528 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4529                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4530                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4531   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4532   // and restore the original MXCSR.RC mode afterwards.
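  // For example (illustrative): val = 2.5 gives floor(3.0) = 3 and val = -2.5 gives
  // floor(-2.0) = -2, matching Math.round's round-half-up semantics.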
4533   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4534 
4535   mov64(tmp, julong_cast(0.5L));
4536   evpbroadcastq(xtmp1, tmp, vec_enc);
4537   vaddpd(xtmp1, src , xtmp1, vec_enc);
4538   evcvtpd2qq(dst, xtmp1, vec_enc);
4539   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4540                                         xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4541 
4542   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4543 }
4544 
4545 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4546                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4547                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4548   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4549   // and restore the original MXCSR.RC mode afterwards.
4550   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4551 
4552   movl(tmp, jint_cast(0.5));
4553   movq(xtmp1, tmp);
4554   vbroadcastss(xtmp1, xtmp1, vec_enc);
4555   vaddps(xtmp1, src , xtmp1, vec_enc);
4556   vcvtps2dq(dst, xtmp1, vec_enc);
4557   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4558                                        xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4559 
4560   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4561 }
4562 
4563 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4564                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4565                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4566   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4567   // and restore the original MXCSR.RC mode afterwards.
4568   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4569 
4570   movl(tmp, jint_cast(0.5));
4571   movq(xtmp1, tmp);
4572   vbroadcastss(xtmp1, xtmp1, vec_enc);
4573   vaddps(xtmp1, src , xtmp1, vec_enc);
4574   vcvtps2dq(dst, xtmp1, vec_enc);
4575   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4576                                       xtmp1, xtmp2, xtmp3, xtmp4, tmp);
4577 
4578   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4579 }
4580 #endif // _LP64
4581 
4582 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4583                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4584   switch (from_elem_bt) {
4585     case T_BYTE:
4586       switch (to_elem_bt) {
4587         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4588         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4589         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4590         default: ShouldNotReachHere();
4591       }
4592       break;
4593     case T_SHORT:
4594       switch (to_elem_bt) {
4595         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4596         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4597         default: ShouldNotReachHere();
4598       }
4599       break;
4600     case T_INT:
4601       assert(to_elem_bt == T_LONG, "");
4602       vpmovzxdq(dst, src, vlen_enc);
4603       break;
4604     default:
4605       ShouldNotReachHere();
4606   }
4607 }
4608 
4609 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4610                                    bool merge, BasicType bt, int vlen_enc) {
4611   if (bt == T_INT) {
4612     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4613   } else {
4614     assert(bt == T_LONG, "");
4615     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4616   }
4617 }
4618 
4619 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4620                                    bool merge, BasicType bt, int vlen_enc) {
4621   if (bt == T_INT) {
4622     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4623   } else {
4624     assert(bt == T_LONG, "");
4625     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4626   }
4627 }
4628 
4629 #ifdef _LP64
4630 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4631                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4632                                                int vec_enc) {
4633   int index = 0;
4634   int vindex = 0;
4635   mov64(rtmp1, 0x0101010101010101L);
4636   pdepq(rtmp1, src, rtmp1);
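  // pdep deposits successive low bits of src into bit 0 of each byte of rtmp1,
  // e.g. (illustrative) src = 0b10110001 gives rtmp1 = 0x0100010100000001.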
4637   if (mask_len > 8) {
4638     movq(rtmp2, src);
4639     vpxor(xtmp, xtmp, xtmp, vec_enc);
4640     movq(xtmp, rtmp1);
4641   }
4642   movq(dst, rtmp1);
4643 
4644   mask_len -= 8;
4645   while (mask_len > 0) {
4646     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
4647     index++;
4648     if ((index % 2) == 0) {
4649       pxor(xtmp, xtmp);
4650     }
4651     mov64(rtmp1, 0x0101010101010101L);
4652     shrq(rtmp2, 8);
4653     pdepq(rtmp1, rtmp2, rtmp1);
4654     pinsrq(xtmp, rtmp1, index % 2);
4655     vindex = index / 2;
4656     if (vindex) {
4657       // Write the entire 16 byte vector only when both 64 bit
4658       // lanes have been updated, to save redundant instructions.
4659       if (index % 2) {
4660         vinsertf128(dst, dst, xtmp, vindex);
4661       }
4662     } else {
4663       vmovdqu(dst, xtmp);
4664     }
4665     mask_len -= 8;
4666   }
4667 }
4668 
4669 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4670   switch(opc) {
4671     case Op_VectorMaskTrueCount:
4672       popcntq(dst, tmp);
4673       break;
4674     case Op_VectorMaskLastTrue:
4675       if (VM_Version::supports_lzcnt()) {
4676         lzcntq(tmp, tmp);
4677         movl(dst, 63);
4678         subl(dst, tmp);
4679       } else {
4680         movl(dst, -1);
4681         bsrq(tmp, tmp);
4682         cmov32(Assembler::notZero, dst, tmp);
4683       }
4684       break;
4685     case Op_VectorMaskFirstTrue:
4686       if (VM_Version::supports_bmi1()) {
4687         if (masklen < 32) {
4688           orl(tmp, 1 << masklen);
4689           tzcntl(dst, tmp);
4690         } else if (masklen == 32) {
4691           tzcntl(dst, tmp);
4692         } else {
4693           assert(masklen == 64, "");
4694           tzcntq(dst, tmp);
4695         }
4696       } else {
4697         if (masklen < 32) {
4698           orl(tmp, 1 << masklen);
4699           bsfl(dst, tmp);
4700         } else {
4701           assert(masklen == 32 || masklen == 64, "");
4702           movl(dst, masklen);
4703           if (masklen == 32)  {
4704             bsfl(tmp, tmp);
4705           } else {
4706             bsfq(tmp, tmp);
4707           }
4708           cmov32(Assembler::notZero, dst, tmp);
4709         }
4710       }
4711       break;
4712     case Op_VectorMaskToLong:
4713       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4714       break;
4715     default: assert(false, "Unhandled mask operation");
4716   }
4717 }
4718 
4719 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4720                                               int masklen, int masksize, int vec_enc) {
4721   assert(VM_Version::supports_popcnt(), "");
4722 
4723   if(VM_Version::supports_avx512bw()) {
4724     kmovql(tmp, mask);
4725   } else {
4726     assert(masklen <= 16, "");
4727     kmovwl(tmp, mask);
4728   }
4729 
4730   // Masks generated by partial vector comparison/replicate/mask manipulation
4731   // operations need to be clipped.
4732   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
4733     andq(tmp, (1 << masklen) - 1);
4734   }
4735 
4736   vector_mask_operation_helper(opc, dst, tmp, masklen);
4737 }
4738 
4739 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4740                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4741   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
4742          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
4743   assert(VM_Version::supports_popcnt(), "");
4744 
4745   bool need_clip = false;
4746   switch(bt) {
4747     case T_BOOLEAN:
4748       // Masks of other types contain lane values of 0 or -1; boolean masks contain lane values of 0 or 1.
4749       vpxor(xtmp, xtmp, xtmp, vec_enc);
4750       vpsubb(xtmp, xtmp, mask, vec_enc);
4751       vpmovmskb(tmp, xtmp, vec_enc);
4752       need_clip = masklen < 16;
4753       break;
4754     case T_BYTE:
4755       vpmovmskb(tmp, mask, vec_enc);
4756       need_clip = masklen < 16;
4757       break;
4758     case T_SHORT:
4759       vpacksswb(xtmp, mask, mask, vec_enc);
4760       if (masklen >= 16) {
4761         vpermpd(xtmp, xtmp, 8, vec_enc);
4762       }
4763       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4764       need_clip = masklen < 16;
4765       break;
4766     case T_INT:
4767     case T_FLOAT:
4768       vmovmskps(tmp, mask, vec_enc);
4769       need_clip = masklen < 4;
4770       break;
4771     case T_LONG:
4772     case T_DOUBLE:
4773       vmovmskpd(tmp, mask, vec_enc);
4774       need_clip = masklen < 2;
4775       break;
4776     default: assert(false, "Unhandled type, %s", type2name(bt));
4777   }
4778 
4779   // Masks generated by partial vector comparison/replicate/mask manipulation
4780   // operations need to be clipped.
4781   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4782     // need_clip implies masklen < 32
4783     andq(tmp, (1 << masklen) - 1);
4784   }
4785 
4786   vector_mask_operation_helper(opc, dst, tmp, masklen);
4787 }
4788 
4789 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4790                                              Register rtmp2, int mask_len) {
4791   kmov(rtmp1, src);
4792   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
4793   mov64(rtmp2, -1L);
4794   pextq(rtmp2, rtmp2, rtmp1);
4795   kmov(dst, rtmp2);
4796 }
4797 
4798 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4799                                                bool merge, BasicType bt, int vec_enc) {
4800   if (opcode == Op_CompressV) {
4801     switch(bt) {
4802     case T_BYTE:
4803       evpcompressb(dst, mask, src, merge, vec_enc);
4804       break;
4805     case T_CHAR:
4806     case T_SHORT:
4807       evpcompressw(dst, mask, src, merge, vec_enc);
4808       break;
4809     case T_INT:
4810       evpcompressd(dst, mask, src, merge, vec_enc);
4811       break;
4812     case T_FLOAT:
4813       evcompressps(dst, mask, src, merge, vec_enc);
4814       break;
4815     case T_LONG:
4816       evpcompressq(dst, mask, src, merge, vec_enc);
4817       break;
4818     case T_DOUBLE:
4819       evcompresspd(dst, mask, src, merge, vec_enc);
4820       break;
4821     default:
4822       fatal("Unsupported type %s", type2name(bt));
4823       break;
4824     }
4825   } else {
4826     assert(opcode == Op_ExpandV, "");
4827     switch(bt) {
4828     case T_BYTE:
4829       evpexpandb(dst, mask, src, merge, vec_enc);
4830       break;
4831     case T_CHAR:
4832     case T_SHORT:
4833       evpexpandw(dst, mask, src, merge, vec_enc);
4834       break;
4835     case T_INT:
4836       evpexpandd(dst, mask, src, merge, vec_enc);
4837       break;
4838     case T_FLOAT:
4839       evexpandps(dst, mask, src, merge, vec_enc);
4840       break;
4841     case T_LONG:
4842       evpexpandq(dst, mask, src, merge, vec_enc);
4843       break;
4844     case T_DOUBLE:
4845       evexpandpd(dst, mask, src, merge, vec_enc);
4846       break;
4847     default:
4848       fatal("Unsupported type %s", type2name(bt));
4849       break;
4850     }
4851   }
4852 }
4853 #endif
4854 
4855 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4856                                            KRegister ktmp1, int vec_enc) {
4857   if (opcode == Op_SignumVD) {
4858     vsubpd(dst, zero, one, vec_enc);
4859     // if src < 0 ? -1 : 1
4860     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4861     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
4862     // if src == NaN, -0.0 or 0.0 return src.
4863     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4864     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
4865   } else {
4866     assert(opcode == Op_SignumVF, "");
4867     vsubps(dst, zero, one, vec_enc);
4868     // if src < 0 ? -1 : 1
4869     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4870     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
4871     // if src == NaN, -0.0 or 0.0 return src.
4872     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4873     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
4874   }
4875 }
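
// A scalar reference of the Signum semantics implemented by the two routines here
// (illustrative sketch only; the helper name signum_ref is hypothetical):
//
//   static inline double signum_ref(double src) {
//     if (src != src || src == 0.0) return src;   // NaN, -0.0 and 0.0 are returned unchanged
//     return (src < 0.0) ? -1.0 : 1.0;
//   }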
4876 
4877 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4878                                           XMMRegister xtmp1, int vec_enc) {
4879   if (opcode == Op_SignumVD) {
4880     vsubpd(dst, zero, one, vec_enc);
4881     // if src < 0 ? -1 : 1
4882     vblendvpd(dst, one, dst, src, vec_enc);
4883     // if src == NaN, -0.0 or 0.0 return src.
4884     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4885     vblendvpd(dst, dst, src, xtmp1, vec_enc);
4886   } else {
4887     assert(opcode == Op_SignumVF, "");
4888     vsubps(dst, zero, one, vec_enc);
4889     // if src < 0 ? -1 : 1
4890     vblendvps(dst, one, dst, src, vec_enc);
4891     // if src == NaN, -0.0 or 0.0 return src.
4892     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4893     vblendvps(dst, dst, src, xtmp1, vec_enc);
4894   }
4895 }
4896 
4897 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4898   if (VM_Version::supports_avx512bw()) {
4899     if (mask_len > 32) {
4900       kmovql(dst, src);
4901     } else {
4902       kmovdl(dst, src);
4903       if (mask_len != 32) {
4904         kshiftrdl(dst, dst, 32 - mask_len);
4905       }
4906     }
4907   } else {
4908     assert(mask_len <= 16, "");
4909     kmovwl(dst, src);
4910     if (mask_len != 16) {
4911       kshiftrwl(dst, dst, 16 - mask_len);
4912     }
4913   }
4914 }
4915 
4916 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
4917   int lane_size = type2aelembytes(bt);
4918   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
4919   if ((is_LP64 || lane_size < 8) &&
4920       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
4921        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
4922     movptr(rtmp, imm32);
4923     switch(lane_size) {
4924       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
4925       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
4926       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
4927       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
4928       default: fatal("Unsupported lane size %d", lane_size);
4929                break;
4930     }
4931   } else {
4932     movptr(rtmp, imm32);
4933     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
4934     switch(lane_size) {
4935       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
4936       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
4937       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
4938       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
4939       default: fatal("Unsupported lane size %d", lane_size);
4940                break;
4941     }
4942   }
4943 }
4944 
4945 //
4946 // The following is the lookup table based popcount computation algorithm:
4947 //       Index   Bit set count
4948 //     [ 0000 ->   0,
4949 //       0001 ->   1,
4950 //       0010 ->   1,
4951 //       0011 ->   2,
4952 //       0100 ->   1,
4953 //       0101 ->   2,
4954 //       0110 ->   2,
4955 //       0111 ->   3,
4956 //       1000 ->   1,
4957 //       1001 ->   2,
4958 //       1010 ->   2,
4959 //       1011 ->   3,
4960 //       1100 ->   2,
4961 //       1101 ->   3,
4962 //       1110 ->   3,   1111 ->   4 ]
4963 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4964 //     shuffle indices for lookup table access.
4965 //  b. Right shift each byte of vector lane by 4 positions.
4966 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4967 //     shuffle indices for lookup table access.
4968 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4969 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
4970 //     count of all the bytes of a quadword.
4971 //  f. Perform step e. for upper 128bit vector lane.
4972 //  g. Pack the bitset count of quadwords back to double word.
4973 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
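//
//  A scalar sketch of steps a)-d) for a single byte, using a 16-entry nibble popcount
//  table like the one behind StubRoutines::x86::vector_popcount_lut() (the helper name
//  below is a hypothetical illustration, not part of this file):
//
//    static inline uint8_t popcount_byte_lut(uint8_t b) {
//      static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//      return lut[b & 0x0F] + lut[b >> 4];   // steps a)-c) look up both nibbles; step d) adds the counts
//    }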
4974 
4975 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4976                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4977   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4978   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4979   vpsrlw(dst, src, 4, vec_enc);
4980   vpand(dst, dst, xtmp1, vec_enc);
4981   vpand(xtmp1, src, xtmp1, vec_enc);
4982   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
4983   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
4984   vpshufb(dst, xtmp2, dst, vec_enc);
4985   vpaddb(dst, dst, xtmp1, vec_enc);
4986 }
4987 
4988 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4989                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
4990   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4991   // Following code is as per steps e,f,g and h of above algorithm.
4992   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4993   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
4994   vpsadbw(dst, dst, xtmp2, vec_enc);
4995   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
4996   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
4997   vpackuswb(dst, xtmp1, dst, vec_enc);
4998 }
4999 
5000 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5001                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5002   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5003   // Add the popcount of upper and lower bytes of word.
5004   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5005   vpsrlw(dst, xtmp1, 8, vec_enc);
5006   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5007   vpaddw(dst, dst, xtmp1, vec_enc);
5008 }
5009 
5010 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5011                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5012   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5013   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5014   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5015 }
5016 
5017 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5018                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5019   switch(bt) {
5020     case T_LONG:
5021       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5022       break;
5023     case T_INT:
5024       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5025       break;
5026     case T_CHAR:
5027     case T_SHORT:
5028       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5029       break;
5030     case T_BYTE:
5031     case T_BOOLEAN:
5032       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5033       break;
5034     default:
5035       fatal("Unsupported type %s", type2name(bt));
5036       break;
5037   }
5038 }
5039 
5040 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5041                                                       KRegister mask, bool merge, int vec_enc) {
5042   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5043   switch(bt) {
5044     case T_LONG:
5045       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5046       evpopcntq(dst, mask, src, merge, vec_enc);
5047       break;
5048     case T_INT:
5049       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5050       evpopcntd(dst, mask, src, merge, vec_enc);
5051       break;
5052     case T_CHAR:
5053     case T_SHORT:
5054       assert(VM_Version::supports_avx512_bitalg(), "");
5055       evpopcntw(dst, mask, src, merge, vec_enc);
5056       break;
5057     case T_BYTE:
5058     case T_BOOLEAN:
5059       assert(VM_Version::supports_avx512_bitalg(), "");
5060       evpopcntb(dst, mask, src, merge, vec_enc);
5061       break;
5062     default:
5063       fatal("Unsupported type %s", type2name(bt));
5064       break;
5065   }
5066 }
5067 
5068 #ifndef _LP64
5069 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5070   assert(VM_Version::supports_avx512bw(), "");
5071   kmovdl(tmp, src);
5072   kunpckdql(dst, tmp, tmp);
5073 }
5074 #endif
5075 
5076 // The bit reversal algorithm first reverses the bits of each byte, followed by
5077 // a byte level reversal for multi-byte primitive types (short/int/long).
5078 // The algorithm performs a lookup table access to get the reversed bit sequence
5079 // corresponding to a 4 bit value; the reversed bit sequence of a byte is then
5080 // obtained by swapping the reversed bit sequences of its upper and lower
5081 // nibbles.
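//
// A scalar model of the nibble-lookup byte reversal described above (illustrative only;
// the helper name reverse_byte_lut is hypothetical):
//
//   static inline uint8_t reverse_byte_lut(uint8_t b) {
//     static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                      0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);   // swap the reversed nibbles
//   }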
5082 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5083                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5084   if (VM_Version::supports_avx512vlbw()) {
5085 
5086     // Get the reverse bit sequence of lower nibble of each byte.
5087     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5088     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5089     vpandq(dst, xtmp2, src, vec_enc);
5090     vpshufb(dst, xtmp1, dst, vec_enc);
5091     vpsllq(dst, dst, 4, vec_enc);
5092 
5093     // Get the reverse bit sequence of upper nibble of each byte.
5094     vpandn(xtmp2, xtmp2, src, vec_enc);
5095     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5096     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5097 
5098     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5099     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5100     vporq(xtmp2, dst, xtmp2, vec_enc);
5101     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5102 
5103   } else if(vec_enc == Assembler::AVX_512bit) {
5104     // Shift based bit reversal.
5105     assert(bt == T_LONG || bt == T_INT, "");
5106 
5107     // Swap lower and upper nibble of each byte.
5108     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5109 
5110     // Swap two least and most significant bits of each nibble.
5111     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5112 
5113     // Swap adjacent pair of bits.
5114     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5115     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5116 
5117     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5118     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5119   } else {
5120     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5121     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5122 
5123     // Get the reverse bit sequence of lower nibble of each byte.
5124     vpand(dst, xtmp2, src, vec_enc);
5125     vpshufb(dst, xtmp1, dst, vec_enc);
5126     vpsllq(dst, dst, 4, vec_enc);
5127 
5128     // Get the reverse bit sequence of upper nibble of each byte.
5129     vpandn(xtmp2, xtmp2, src, vec_enc);
5130     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5131     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5132 
5133     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5134     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5135     vpor(xtmp2, dst, xtmp2, vec_enc);
5136     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5137   }
5138 }
5139 
5140 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5141                                                 XMMRegister xtmp, Register rscratch) {
5142   assert(VM_Version::supports_gfni(), "");
5143   assert(rscratch != noreg || always_reachable(mask), "missing");
5144 
5145   // Galois field instruction based bit reversal based on following algorithm.
5146   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5147   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5148   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5149   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5150 }
5151 
5152 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5153                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5154   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5155   vpandq(dst, xtmp1, src, vec_enc);
5156   vpsllq(dst, dst, nbits, vec_enc);
5157   vpandn(xtmp1, xtmp1, src, vec_enc);
5158   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5159   vporq(dst, dst, xtmp1, vec_enc);
5160 }
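
// Scalar model of the swap performed by vector_swap_nbits (illustrative only; the
// helper name is hypothetical):
//
//   static inline uint32_t swap_nbits32(int nbits, uint32_t bitmask, uint32_t x) {
//     return ((x & bitmask) << nbits) | ((x & ~bitmask) >> nbits);
//   }
//
// e.g. swap_nbits32(4, 0x0F0F0F0F, x) swaps the two nibbles of every byte of x.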
5161 
5162 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5163                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5164   // Shift based bit reversal.
5165   assert(VM_Version::supports_evex(), "");
5166   switch(bt) {
5167     case T_LONG:
5168       // Swap upper and lower double word of each quad word.
5169       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5170       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5171       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5172       break;
5173     case T_INT:
5174       // Swap upper and lower word of each double word.
5175       evprord(xtmp1, k0, src, 16, true, vec_enc);
5176       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5177       break;
5178     case T_CHAR:
5179     case T_SHORT:
5180       // Swap upper and lower byte of each word.
5181       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5182       break;
5183     case T_BYTE:
5184       evmovdquq(dst, k0, src, true, vec_enc);
5185       break;
5186     default:
5187       fatal("Unsupported type %s", type2name(bt));
5188       break;
5189   }
5190 }
5191 
5192 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5193   if (bt == T_BYTE) {
5194     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5195       evmovdquq(dst, k0, src, true, vec_enc);
5196     } else {
5197       vmovdqu(dst, src);
5198     }
5199     return;
5200   }
5201   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5202   // pre-computed shuffle indices.
5203   switch(bt) {
5204     case T_LONG:
5205       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5206       break;
5207     case T_INT:
5208       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5209       break;
5210     case T_CHAR:
5211     case T_SHORT:
5212       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5213       break;
5214     default:
5215       fatal("Unsupported type %s", type2name(bt));
5216       break;
5217   }
5218   vpshufb(dst, src, dst, vec_enc);
5219 }
5220 
5221 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5222                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5223                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5224   assert(is_integral_type(bt), "");
5225   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5226   assert(VM_Version::supports_avx512cd(), "");
5227   switch(bt) {
5228     case T_LONG:
5229       evplzcntq(dst, ktmp, src, merge, vec_enc);
5230       break;
5231     case T_INT:
5232       evplzcntd(dst, ktmp, src, merge, vec_enc);
5233       break;
5234     case T_SHORT:
5235       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5236       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5237       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5238       vpunpckhwd(dst, xtmp1, src, vec_enc);
5239       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5240       vpackusdw(dst, xtmp2, dst, vec_enc);
5241       break;
5242     case T_BYTE:
5243       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5244       // accessing the lookup table.
5245       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5246       // accessing the lookup table.
5247       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
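      // A scalar sketch of this per-byte computation (illustrative only; the helper
      // name clz_byte_lut is hypothetical):
      //   static inline uint8_t clz_byte_lut(uint8_t b) {
      //     static const uint8_t lut[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0}; // clz of a nibble
      //     uint8_t t2 = lut[b >> 4];                                   // T2: clz of the 4 MSB bits
      //     return (b >> 4) == 0 ? (uint8_t)(t2 + lut[b & 0x0F]) : t2;  // add T1 only when the MSB nibble is zero
      //   }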
5248       assert(VM_Version::supports_avx512bw(), "");
5249       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5250       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5251       vpand(xtmp2, dst, src, vec_enc);
5252       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5253       vpsrlw(xtmp3, src, 4, vec_enc);
5254       vpand(xtmp3, dst, xtmp3, vec_enc);
5255       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5256       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5257       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5258       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5259       break;
5260     default:
5261       fatal("Unsupported type %s", type2name(bt));
5262       break;
5263   }
5264 }
5265 
5266 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5267                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5268   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5269   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5270   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5271   // accessing the lookup table.
5272   vpand(dst, xtmp2, src, vec_enc);
5273   vpshufb(dst, xtmp1, dst, vec_enc);
5274   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5275   // accessing the lookup table.
5276   vpsrlw(xtmp3, src, 4, vec_enc);
5277   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5278   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5279   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5280   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5281   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5282   vpaddb(dst, dst, xtmp2, vec_enc);
5283   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5284 }
5285 
5286 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5287                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5288   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5289   // Add zero counts of lower byte and upper byte of a word if
5290   // upper byte holds a zero value.
5291   vpsrlw(xtmp3, src, 8, vec_enc);
5292   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5293   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5294   vpsllw(xtmp2, dst, 8, vec_enc);
5295   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5296   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5297   vpsrlw(dst, dst, 8, vec_enc);
5298 }
5299 
5300 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5301                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5302   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
5303   // the biased exponent can be used to compute the leading zero count as per the
5304   // following formula:
5305   // LZCNT = 31 - (biased_exp - 127)
5306   // Special handling is needed for zero, Max_Int and negative source values.
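  // Worked example (illustrative): src = 8 converts to 8.0f with biased_exp = 130,
  // so LZCNT = 31 - (130 - 127) = 28, which matches lzcnt(8) for a 32 bit lane.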
5307 
5308   // Broadcast 0xFF
5309   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5310   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5311 
5312   // Extract biased exponent.
5313   vcvtdq2ps(dst, src, vec_enc);
5314   vpsrld(dst, dst, 23, vec_enc);
5315   vpand(dst, dst, xtmp1, vec_enc);
5316 
5317   // Broadcast 127.
5318   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5319   // Exponent = biased_exp - 127
5320   vpsubd(dst, dst, xtmp1, vec_enc);
5321 
5322   // Exponent = Exponent  + 1
5323   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5324   vpaddd(dst, dst, xtmp3, vec_enc);
5325 
5326   // Replace -ve exponent with zero, exponent is -ve when src
5327   // lane contains a zero value.
5328   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5329   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5330 
5331   // Rematerialize broadcast 32.
5332   vpslld(xtmp1, xtmp3, 5, vec_enc);
5333   // Exponent is 32 if corresponding source lane contains max_int value.
5334   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5335   // LZCNT = 32 - exponent
5336   vpsubd(dst, xtmp1, dst, vec_enc);
5337 
5338   // Replace LZCNT with a value 1 if corresponding source lane
5339   // contains max_int value.
5340   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5341 
5342   // Replace biased_exp with 0 if source lane value is less than zero.
5343   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5344   vblendvps(dst, dst, xtmp2, src, vec_enc);
5345 }
5346 
5347 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5348                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5349   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5350   // Add zero counts of lower word and upper word of a double word if
5351   // upper word holds a zero value.
5352   vpsrld(xtmp3, src, 16, vec_enc);
5353   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5354   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5355   vpslld(xtmp2, dst, 16, vec_enc);
5356   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5357   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5358   vpsrld(dst, dst, 16, vec_enc);
5359   // Add zero counts of lower doubleword and upper doubleword of a
5360   // quadword if upper doubleword holds a zero value.
5361   vpsrlq(xtmp3, src, 32, vec_enc);
5362   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5363   vpsllq(xtmp2, dst, 32, vec_enc);
5364   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5365   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5366   vpsrlq(dst, dst, 32, vec_enc);
5367 }
5368 
5369 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5370                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5371                                                        Register rtmp, int vec_enc) {
5372   assert(is_integral_type(bt), "unexpected type");
5373   assert(vec_enc < Assembler::AVX_512bit, "");
5374   switch(bt) {
5375     case T_LONG:
5376       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5377       break;
5378     case T_INT:
5379       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5380       break;
5381     case T_SHORT:
5382       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5383       break;
5384     case T_BYTE:
5385       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5386       break;
5387     default:
5388       fatal("Unsupported type %s", type2name(bt));
5389       break;
5390   }
5391 }
5392 
5393 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5394   switch(bt) {
5395     case T_BYTE:
5396       vpsubb(dst, src1, src2, vec_enc);
5397       break;
5398     case T_SHORT:
5399       vpsubw(dst, src1, src2, vec_enc);
5400       break;
5401     case T_INT:
5402       vpsubd(dst, src1, src2, vec_enc);
5403       break;
5404     case T_LONG:
5405       vpsubq(dst, src1, src2, vec_enc);
5406       break;
5407     default:
5408       fatal("Unsupported type %s", type2name(bt));
5409       break;
5410   }
5411 }
5412 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
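// A minimal scalar sketch of the same identity (illustrative only, not part of the
// generated code), assuming lzcnt semantics where clz32(0) == 32:
//
//   static uint32_t clz32(uint32_t v) {          // reference lzcnt: clz32(0) == 32
//     uint32_t n = 0;
//     for (uint32_t bit = 1u << 31; bit != 0 && (v & bit) == 0; bit >>= 1) n++;
//     return n;
//   }
//   static uint32_t ctz32_via_lzcnt(uint32_t x) {
//     // (x - 1) & ~x sets exactly the bits below the lowest set bit of x
//     // (all 32 bits when x == 0), so 32 minus its leading zero count is the
//     // trailing zero count of x.
//     return 32 - clz32((x - 1) & ~x);
//   }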
5417 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5418                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5419                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5420   assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src = (src - 1) & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5427   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5428   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5429   vpsub(bt, dst, xtmp4, dst, vec_enc);
5430 }
5431 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
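// A minimal scalar sketch of this identity (illustrative only, not part of the
// generated code):
//
//   static uint32_t popcount32(uint32_t v) {
//     uint32_t n = 0;
//     for (; v != 0; v &= v - 1) n++;            // clear the lowest set bit each iteration
//     return n;
//   }
//   static uint32_t ctz32_via_popcnt(uint32_t x) {
//     // x | -x keeps the lowest set bit of x and every bit above it (zero when
//     // x == 0), so its popcount equals 32 minus the trailing zero count of x.
//     return 32 - popcount32(x | (0u - x));
//   }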
5434 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5435                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5436   assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src = -src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src = -src | src
  vpor(xtmp3, xtmp3, src, vec_enc);
5443   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5444   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5445   vpsub(bt, dst, xtmp1, dst, vec_enc);
5446 }
5447 
5448 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5449   Label done;
5450   Label neg_divisor_fastpath;
5451   cmpl(divisor, 0);
5452   jccb(Assembler::less, neg_divisor_fastpath);
5453   xorl(rdx, rdx);
5454   divl(divisor);
5455   jmpb(done);
5456   bind(neg_divisor_fastpath);
5457   // Fastpath for divisor < 0:
5458   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5459   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
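  // Illustrative scalar sketch of this fastpath (not part of the generated code):
  // when the divisor has its sign bit set, i.e. is >= 2^31 as an unsigned value,
  // the unsigned quotient can only be 0 or 1, and it is 1 exactly when
  // dividend >= divisor (unsigned):
  //
  //   static uint32_t udiv_by_large_divisor(uint32_t dividend, uint32_t divisor) {
  //     return (dividend & ~(dividend - divisor)) >> 31;   // 1 iff dividend >= divisor
  //   }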
5460   movl(rdx, rax);
5461   subl(rdx, divisor);
5462   if (VM_Version::supports_bmi1()) {
5463     andnl(rax, rdx, rax);
5464   } else {
5465     notl(rdx);
5466     andl(rax, rdx);
5467   }
5468   shrl(rax, 31);
5469   bind(done);
5470 }
5471 
5472 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5473   Label done;
5474   Label neg_divisor_fastpath;
5475   cmpl(divisor, 0);
5476   jccb(Assembler::less, neg_divisor_fastpath);
5477   xorl(rdx, rdx);
5478   divl(divisor);
5479   jmpb(done);
5480   bind(neg_divisor_fastpath);
5481   // Fastpath when divisor < 0:
5482   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5483   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
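  // Illustrative scalar sketch of this fastpath (not part of the generated code),
  // assuming arithmetic right shift on int32_t as performed by sarl below: with
  // divisor >= 2^31 (unsigned) the quotient is 0 or 1, so the remainder is either
  // dividend or dividend - divisor:
  //
  //   static uint32_t umod_by_large_divisor(uint32_t dividend, uint32_t divisor) {
  //     // mask is all ones exactly when the quotient would be 1
  //     uint32_t mask = (uint32_t)((int32_t)(dividend & ~(dividend - divisor)) >> 31);
  //     return dividend - (mask & divisor);
  //   }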
5484   movl(rdx, rax);
5485   subl(rax, divisor);
5486   if (VM_Version::supports_bmi1()) {
5487     andnl(rax, rax, rdx);
5488   } else {
5489     notl(rax);
5490     andl(rax, rdx);
5491   }
5492   sarl(rax, 31);
5493   andl(rax, divisor);
5494   subl(rdx, rax);
5495   bind(done);
5496 }
5497 
5498 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5499   Label done;
5500   Label neg_divisor_fastpath;
5501 
5502   cmpl(divisor, 0);
5503   jccb(Assembler::less, neg_divisor_fastpath);
5504   xorl(rdx, rdx);
5505   divl(divisor);
5506   jmpb(done);
5507   bind(neg_divisor_fastpath);
5508   // Fastpath for divisor < 0:
5509   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5510   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5511   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5512   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5513   movl(rdx, rax);
5514   subl(rax, divisor);
5515   if (VM_Version::supports_bmi1()) {
5516     andnl(rax, rax, rdx);
5517   } else {
5518     notl(rax);
5519     andl(rax, rdx);
5520   }
5521   movl(tmp, rax);
5522   shrl(rax, 31); // quotient
5523   sarl(tmp, 31);
5524   andl(tmp, divisor);
5525   subl(rdx, tmp); // remainder
5526   bind(done);
5527 }
5528 
5529 #ifdef _LP64
5530 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5531                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
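    // The 64-bit constant below is the 8x8 bit matrix for gf2p8affineqb that
    // reverses the bit order within each byte; the bswapl at the end of this
    // method then reverses the byte order, completing the 32-bit bit reversal.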
5535     mov64(rtmp, 0x8040201008040201L);
5536     movq(xtmp1, src);
5537     movq(xtmp2, rtmp);
5538     gf2p8affineqb(xtmp1, xtmp2, 0);
5539     movq(dst, xtmp1);
5540   } else {
5541     // Swap even and odd numbered bits.
5542     movl(rtmp, src);
5543     andl(rtmp, 0x55555555);
5544     shll(rtmp, 1);
5545     movl(dst, src);
5546     andl(dst, 0xAAAAAAAA);
5547     shrl(dst, 1);
5548     orl(dst, rtmp);
5549 
    // Swap the lower and upper 2 bits of each nibble.
5551     movl(rtmp, dst);
5552     andl(rtmp, 0x33333333);
5553     shll(rtmp, 2);
5554     andl(dst, 0xCCCCCCCC);
5555     shrl(dst, 2);
5556     orl(dst, rtmp);
5557 
    // Swap the lower and upper 4 bits of each byte.
5559     movl(rtmp, dst);
5560     andl(rtmp, 0x0F0F0F0F);
5561     shll(rtmp, 4);
5562     andl(dst, 0xF0F0F0F0);
5563     shrl(dst, 4);
5564     orl(dst, rtmp);
5565   }
5566   bswapl(dst);
5567 }
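
// A minimal scalar sketch of the non-GFNI fallback above (illustrative only, not
// part of the generated code): swap progressively larger bit groups, then swap bytes.
//
//   static uint32_t reverse_bits32(uint32_t v) {
//     v = ((v & 0x55555555) << 1) | ((v & 0xAAAAAAAA) >> 1);   // swap adjacent bits
//     v = ((v & 0x33333333) << 2) | ((v & 0xCCCCCCCC) >> 2);   // swap 2-bit pairs
//     v = ((v & 0x0F0F0F0F) << 4) | ((v & 0xF0F0F0F0) >> 4);   // swap nibbles
//     return (v << 24) | ((v & 0xFF00) << 8) |
//            ((v >> 8) & 0xFF00) | (v >> 24);                  // byte swap (bswapl)
//   }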
5568 
5569 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5570                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5574     mov64(rtmp1, 0x8040201008040201L);
5575     movq(xtmp1, src);
5576     movq(xtmp2, rtmp1);
5577     gf2p8affineqb(xtmp1, xtmp2, 0);
5578     movq(dst, xtmp1);
5579   } else {
5580     // Swap even and odd numbered bits.
5581     movq(rtmp1, src);
5582     mov64(rtmp2, 0x5555555555555555L);
5583     andq(rtmp1, rtmp2);
5584     shlq(rtmp1, 1);
5585     movq(dst, src);
5586     notq(rtmp2);
5587     andq(dst, rtmp2);
5588     shrq(dst, 1);
5589     orq(dst, rtmp1);
5590 
    // Swap the lower and upper 2 bits of each nibble.
5592     movq(rtmp1, dst);
5593     mov64(rtmp2, 0x3333333333333333L);
5594     andq(rtmp1, rtmp2);
5595     shlq(rtmp1, 2);
5596     notq(rtmp2);
5597     andq(dst, rtmp2);
5598     shrq(dst, 2);
5599     orq(dst, rtmp1);
5600 
    // Swap the lower and upper 4 bits of each byte.
5602     movq(rtmp1, dst);
5603     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5604     andq(rtmp1, rtmp2);
5605     shlq(rtmp1, 4);
5606     notq(rtmp2);
5607     andq(dst, rtmp2);
5608     shrq(dst, 4);
5609     orq(dst, rtmp1);
5610   }
5611   bswapq(dst);
5612 }
5613 
5614 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5615   Label done;
5616   Label neg_divisor_fastpath;
5617   cmpq(divisor, 0);
5618   jccb(Assembler::less, neg_divisor_fastpath);
5619   xorl(rdx, rdx);
5620   divq(divisor);
5621   jmpb(done);
5622   bind(neg_divisor_fastpath);
5623   // Fastpath for divisor < 0:
5624   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5625   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5626   movq(rdx, rax);
5627   subq(rdx, divisor);
5628   if (VM_Version::supports_bmi1()) {
5629     andnq(rax, rdx, rax);
5630   } else {
5631     notq(rdx);
5632     andq(rax, rdx);
5633   }
5634   shrq(rax, 63);
5635   bind(done);
5636 }
5637 
5638 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5639   Label done;
5640   Label neg_divisor_fastpath;
5641   cmpq(divisor, 0);
5642   jccb(Assembler::less, neg_divisor_fastpath);
5643   xorq(rdx, rdx);
5644   divq(divisor);
5645   jmp(done);
5646   bind(neg_divisor_fastpath);
5647   // Fastpath when divisor < 0:
5648   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5649   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5650   movq(rdx, rax);
5651   subq(rax, divisor);
5652   if (VM_Version::supports_bmi1()) {
5653     andnq(rax, rax, rdx);
5654   } else {
5655     notq(rax);
5656     andq(rax, rdx);
5657   }
5658   sarq(rax, 63);
5659   andq(rax, divisor);
5660   subq(rdx, rax);
5661   bind(done);
5662 }
5663 
5664 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
5665   Label done;
5666   Label neg_divisor_fastpath;
5667   cmpq(divisor, 0);
5668   jccb(Assembler::less, neg_divisor_fastpath);
5669   xorq(rdx, rdx);
5670   divq(divisor);
5671   jmp(done);
5672   bind(neg_divisor_fastpath);
5673   // Fastpath for divisor < 0:
5674   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5675   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5676   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5677   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5678   movq(rdx, rax);
5679   subq(rax, divisor);
5680   if (VM_Version::supports_bmi1()) {
5681     andnq(rax, rax, rdx);
5682   } else {
5683     notq(rax);
5684     andq(rax, rdx);
5685   }
5686   movq(tmp, rax);
5687   shrq(rax, 63); // quotient
5688   sarq(tmp, 63);
5689   andq(tmp, divisor);
5690   subq(rdx, tmp); // remainder
5691   bind(done);
5692 }
5693 #endif
5694 
5695 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
5696                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
5697                                         int vlen_enc) {
5698   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations; vpshufb determines the index from the
  // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
  // normalized to the range 0-15. As a result, indices that differ by a multiple
  // of 16 select the same relative position within a 128-bit lane, e.g. shuffle
  // indices 16, 32 and 48 all map to position 0 of their respective 128-bit
  // source lanes.
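  // Scalar model of the rearrangement performed below (illustrative only, not part
  // of the generated code; shown for a 512-bit vector, with src, dst and shuffle
  // viewed as arrays of 64 bytes):
  //
  //   for (int i = 0; i < 64; i++) {
  //     int idx  = shuffle[i] & 0x3F;    // 6-bit shuffle index
  //     int lane = idx >> 4;             // which 128-bit source lane (0-3) to read
  //     int pos  = idx & 0x0F;           // position inside that lane (what vpshufb uses)
  //     dst[i]   = src[lane * 16 + pos];
  //   }
  //
  // The four masked evpshufb steps below handle lane == 0, 1, 2 and 3 in turn, each
  // broadcasting the corresponding 128-bit source lane and merging under a k-mask.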
5705   movl(rtmp, 16);
5706   evpbroadcastb(xtmp1, rtmp, vlen_enc);
5707 
  // Compute a mask for the shuffle vector by comparing the indices against the
  // expression INDEX < 16, broadcast the first 128-bit lane of src across the
  // entire vector, shuffle it using the original shuffle indices, and move the
  // shuffled lanes corresponding to a true mask into the destination vector.
5712   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5713   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
5714   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
5715 
  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
5718   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
5719   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
5720   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5721   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
5722   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5723 
  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
5726   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
5727   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
5728   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
5729   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
5730   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5731 
  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
5734   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
5735   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
5736   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
5737   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
5738   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5739 }
5740